From 21b7aacef7b8d5fc4805637c8b95572eb6c70c73 Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Thu, 2 May 2024 20:43:57 +0900 Subject: [PATCH 01/14] [ Weight ] Add Var32 Tensor in Weight. We will add a Var32 Tensor if the Variable Weight is not full precision (FP32). This enables weight updates in full precision, and only the Apply Gradient process uses this tensor. Therefore, the lifespan of this tensor should be "ApplyGradient". . Modify TensorPool to generate Weight considering Mixed Precision. **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/graph/network_graph.cpp | 13 ++++- nntrainer/layers/layer_node.h | 1 + nntrainer/tensor/manager.cpp | 31 ++++++++++-- nntrainer/tensor/manager.h | 2 +- nntrainer/tensor/weight.cpp | 83 +++++++++++++++++++++++++++++++ nntrainer/tensor/weight.h | 50 ++++++++++++------- 6 files changed, 154 insertions(+), 26 deletions(-) diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index 2d4cfdc769..370c18964f 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -1557,7 +1557,18 @@ void NetworkGraph::requestOptimizerVariable( std::vector dims = cb(dim); w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables( dims, w->getName(), TensorLifespan::MAX_LIFESPAN, - w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS)); + w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), + Tensor::Initializer::ZEROS)); + + if (dim.getDataType() != ml::train::TensorDim::DataType::FP32) { + for (auto &dim : dims) + dim.setDataType(ml::train::TensorDim::DataType::FP32); + w->setOptimizerVariables32( + tensor_manager->requestWeightOptimizerVariables( + dims, w->getName(), TensorLifespan::MAX_LIFESPAN, + w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), + Tensor::Initializer::ZEROS)); + } } } } diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h index 93e7ac7069..f8e5b7c4e9 100644 --- a/nntrainer/layers/layer_node.h +++ b/nntrainer/layers/layer_node.h @@ -487,6 +487,7 @@ class LayerNode final : public ml::train::Layer, public GraphNode { const std::vector getOutputDimensions() const; /** * @brief Get the Weight object + * currently, only unittests use this function.
* * @param idx Identifier of the weight * @return Weight& Reference to the weight diff --git a/nntrainer/tensor/manager.cpp b/nntrainer/tensor/manager.cpp index 9a0d235ba9..572bd217cf 100644 --- a/nntrainer/tensor/manager.cpp +++ b/nntrainer/tensor/manager.cpp @@ -414,7 +414,7 @@ std::vector Manager::requestWeights( // var_exec_order.push_back(TensorPool::PERSIST_END_ORDER); } - Tensor *var = nullptr, *grad = nullptr; + Tensor *var = nullptr, *grad = nullptr, *var32 = nullptr; bool is_dependent = !shared_names.empty(); if (is_dependent) { /// shared_name is used and the orignal name is discarded @@ -431,6 +431,17 @@ std::vector Manager::requestWeights( grad = tensor_pool.requestOrExtend(shared_name + Var_Grad::grad_suffix, dim_g, grad_exec_order, grad_ls, Tensor::Initializer::ZEROS); + + if (var->getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim_v); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + std::vector var32_exec_order; + var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER); + + var32 = weight_pool.requestOrExtend(shared_name + ":var32", var32_dim, + var32_exec_order, var_ls, + Tensor::Initializer::ZEROS); + } } } else { /** case requesting fresh weights */ @@ -448,11 +459,21 @@ std::vector Manager::requestWeights( grad = tensor_pool.request(name + Var_Grad::grad_suffix, dim_g, grad_exec_order, grad_ls, Tensor::Initializer::ZEROS, is_wgrad); + if (var->getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim_v); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + std::vector var32_exec_order; + var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER); + var32 = + weight_pool.request(name + ":var32", var32_dim, var32_exec_order, + var_ls, Tensor::Initializer::ZEROS); + } } } - weights_v2.emplace_back(std::make_unique( - var, grad, w_reg, w_reg_const, decay, is_dependent, clip_by_global_norm)); + weights_v2.emplace_back( + std::make_unique(var, grad, var32, w_reg, w_reg_const, decay, + is_dependent, clip_by_global_norm)); } std::transform(weights_v2.begin() + current_size, weights_v2.end(), @@ -668,7 +689,7 @@ bool Manager::isSecondLastAccess(const std::string &name, */ std::vector Manager::requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, - const TensorLifespan &lifespan, bool is_grad_clip, + const TensorLifespan &lifespan, bool is_grad_clip, bool is_mixed_precision, Tensor::Initializer initializer) { std::vector ret; @@ -676,7 +697,7 @@ std::vector Manager::requestWeightOptimizerVariables( std::vector exec; exec.reserve(1); - if (is_grad_clip) { + if (is_grad_clip || is_mixed_precision) { exec.emplace_back(TensorPool::PERSIST_END_ORDER); } else { exec.emplace_back(getMinMaxTensorExecutionOrder(name, true).second); diff --git a/nntrainer/tensor/manager.h b/nntrainer/tensor/manager.h index ab1c018153..1fa810a35c 100644 --- a/nntrainer/tensor/manager.h +++ b/nntrainer/tensor/manager.h @@ -224,7 +224,7 @@ class Manager { */ std::vector requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, - const TensorLifespan &lifespan, bool is_grad_clip, + const TensorLifespan &lifespan, bool is_grad_clip, bool is_mixed_type, Tensor::Initializer initializer = Tensor::Initializer::NONE); /** diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp index f98c8c8356..f86c327842 100644 --- a/nntrainer/tensor/weight.cpp +++ b/nntrainer/tensor/weight.cpp @@ -34,6 +34,28 @@ Weight::Weight(const TensorDim &dim, const Tensor::Initializer 
init, throw std::invalid_argument("Weight initializer cannot be none"); if (regularizer == WeightRegularizer::UNKNOWN) throw std::invalid_argument("Weight regularizer unknown"); + + std::string var32_suffix = ":fp32"; + std::string var32_name = name + var32_suffix; + + /** + * @note We assume if the Weight Data Type is not FP32, then FP32 Weight is + * necessary to maintain the accuracy. + * We could think it can be other data type and if there is the case to + * support other data type, then the code below needs to be udpated. + * + * Also, the loss_scale is not used in Weight but leave as it is for later + * usage. + */ + + if (train && dim.getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + + var32 = std::make_shared(var32_dim, alloc_now_, init, var32_name); + } else { + var32 = std::make_shared(var32_name); + } } Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g, @@ -52,6 +74,67 @@ Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g, throw std::invalid_argument("Weight initializer cannot be none"); if (regularizer == WeightRegularizer::UNKNOWN) throw std::invalid_argument("Weight regularizer unknown"); + + std::string var32_suffix = ":fp32"; + std::string var32_name = name + var32_suffix; + + if (train && dim_v.getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim_v); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + std::string var32_suffix = ":fp32"; + std::string var32_name = name + var32_suffix; + + var32 = std::make_shared(var32_dim, alloc_now_, init, var32_name); + } else { + var32 = std::make_shared(var32_name); + } +} + +Weight::Weight(const Tensor &v, const Tensor &g, const std::string &n, + bool is_dependent, unsigned int output_axis_) : + Var_Grad(v, g, n, is_dependent), + regularizer(WeightRegularizer::NONE), + regularizer_constant(1.0f), + decay(0.0f), + clip_by_global_norm(0.0f), + output_axis(output_axis_), + loss_scale(0.0) { + + std::string var32_suffix = ":fp32"; + std::string var32_name = n + var32_suffix; + + /** + * @note We assume here that Weight is created with variable and gradient + * tensor. It is not copy or clone and, therefore, we do need create var32 if + * it is trainable. For now, We haven't seen the case create wieght with var, + * grad and var32. But we will add weight constructor if there is the cases. 
+ */ + + if (!g.empty() && v.getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(v.getDim()); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + + var32 = std::make_shared(var32_dim, true, Tensor::Initializer::NONE, + var32_name); + } else { + var32 = std::make_shared(var32_name); + } +} + +Weight::Weight(Tensor *v, Tensor *g, Tensor *v32, const WeightRegularizer reg, + const float reg_const, const float decay, bool is_dependent, + const float max_norm, unsigned int output_axis_, + float loss_scale_) : + Var_Grad(v, g, is_dependent), + regularizer(reg), + regularizer_constant(reg_const), + decay(decay), + clip_by_global_norm(max_norm), + output_axis(output_axis_), + loss_scale(loss_scale_), + var32(std::shared_ptr(v32, [](void *) {})) { + if (!v32) + var32 = std::make_shared(); } } // namespace nntrainer diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h index 552f6d5739..3a81d1f58e 100644 --- a/nntrainer/tensor/weight.h +++ b/nntrainer/tensor/weight.h @@ -124,34 +124,22 @@ class Weight : public Var_Grad { * if the owner of these tensors free the tensors. */ explicit Weight(const Tensor &v, const Tensor &g, const std::string &n = "", - bool is_dependent = false, unsigned int output_axis_ = 3) : - Var_Grad(v, g, n, is_dependent), - regularizer(WeightRegularizer::NONE), - regularizer_constant(1.0f), - decay(0.0f), - clip_by_global_norm(0.0f), - output_axis(output_axis_), - loss_scale(0.0) {} + bool is_dependent = false, unsigned int output_axis_ = 3); /** * @brief Construct a new Weight object * * @param v ptr to already created variable tensor * @param g ptr to already created gradient tensor + * @param v32 ptr to already created variable32 tensor * @param reg Regularizer for the weight * @param reg_const Constant multiplier for regularizer */ - explicit Weight(Tensor *v, Tensor *g, const WeightRegularizer reg, - const float reg_const, const float decay, - bool is_dependent = false, const float max_norm = 0.0f, - unsigned int output_axis_ = 3, float loss_scale_ = 0.0f) : - Var_Grad(v, g, is_dependent), - regularizer(reg), - regularizer_constant(reg_const), - decay(decay), - clip_by_global_norm(max_norm), - output_axis(output_axis_), - loss_scale(loss_scale_) {} + explicit Weight(Tensor *v, Tensor *g, Tensor *v32, + const WeightRegularizer reg, const float reg_const, + const float decay, bool is_dependent = false, + const float max_norm = 0.0f, unsigned int output_axis_ = 3, + float loss_scale_ = 0.0f); /** * @brief Swap for weight @@ -170,6 +158,7 @@ class Weight : public Var_Grad { swap(lhs.output_axis, rhs.output_axis); swap(lhs.opt_vars, rhs.opt_vars); swap(lhs.loss_scale, rhs.loss_scale); + swap(lhs.var32, rhs.var32); } /** @@ -213,6 +202,8 @@ class Weight : public Var_Grad { w.var = std::make_shared(this->var->clone()); if (!this->grad->empty()) w.grad = std::make_shared(this->grad->clone()); + if (!this->var32->empty()) + w.var32 = std::make_shared(this->var32->clone()); return w; } @@ -230,6 +221,16 @@ class Weight : public Var_Grad { opt_vars = tensors; } + /** + * @brief Add optimizer variables32 + * We assume if the datatype of weight is not FP32, then it needs to set + * OptmizerVarialbe32 to maintain acccuracy. 
+ * @param tensors OptimizerVariable32 Tensor list + */ + void setOptimizerVariables32(std::vector tensors) { + opt_vars32 = tensors; + } + /** * @brief Get optimizer variable reference * @param idx Index of the optimizer variable to get @@ -316,6 +317,16 @@ class Weight : public Var_Grad { return clip_by_global_norm > epsilon; } + /** + * @brief Check if the variable type is not full precision + * + * @return true if it is not full precsion + * @return false otherwise + */ + bool isMixedPrecision() const { + return var->getDataType() == ml::train::TensorDim::DataType::FP32; + } + /** * @brief clip the gradient value based on the given global norm * @@ -338,6 +349,7 @@ class Weight : public Var_Grad { unsigned int output_axis; float loss_scale; std::vector opt_vars; /**< optimizer variables */ + std::vector opt_vars32; std::shared_ptr var32; /** From 0d899d57498003b4e92f452bd168c85426c21c3e Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Tue, 7 May 2024 13:24:00 +0900 Subject: [PATCH 02/14] [ Mixed ] Create weight with var32 tensor This pr create the variable fp32 tensor when we create the Weight and Optimizer Weight. . update the manager to create Weight with var32 tensor which requested to weight pool. . update the weight requests with Weight Spec and var, grad and var32 tensors which created already. . add clone Tensor with specific type in tensor.h Resolves: **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/graph/network_graph.cpp | 7 ++++--- nntrainer/layers/layer_context.cpp | 13 +++++++++++++ nntrainer/layers/layer_context.h | 9 +++++++++ nntrainer/layers/layer_node.h | 8 ++++---- nntrainer/tensor/manager.cpp | 6 +++--- nntrainer/tensor/manager.h | 3 ++- nntrainer/tensor/tensor.cpp | 12 ++++++++++++ nntrainer/tensor/tensor.h | 7 +++++++ nntrainer/tensor/weight.cpp | 28 +++++++++------------------- nntrainer/tensor/weight.h | 15 ++++++++++++--- 10 files changed, 75 insertions(+), 33 deletions(-) diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index 370c18964f..c0ee126c93 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -768,6 +768,7 @@ NetworkGraph::finalizeContext(const std::shared_ptr &lnode, * node is going to be used with in-place optimizations. 
*/ auto out_specs = init_context.getOutSpecs(); + /// @note try move inplace control to finalize bool shared_var = false, shared_grad = false; if (lnode->executeInPlace() != InPlace::NONE) { @@ -1556,16 +1557,16 @@ void NetworkGraph::requestOptimizerVariable( const TensorDim &dim = w->getDim(); std::vector dims = cb(dim); w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables( - dims, w->getName(), TensorLifespan::MAX_LIFESPAN, + dims, w->getName(), ":opt", TensorLifespan::MAX_LIFESPAN, w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), Tensor::Initializer::ZEROS)); - if (dim.getDataType() != ml::train::TensorDim::DataType::FP32) { + if (w->isMixedPrecision()) { for (auto &dim : dims) dim.setDataType(ml::train::TensorDim::DataType::FP32); w->setOptimizerVariables32( tensor_manager->requestWeightOptimizerVariables( - dims, w->getName(), TensorLifespan::MAX_LIFESPAN, + dims, w->getName(), ":opt32:", TensorLifespan::MAX_LIFESPAN, w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), Tensor::Initializer::ZEROS)); } diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp index fff2eb15ec..f0856c1dbb 100644 --- a/nntrainer/layers/layer_context.cpp +++ b/nntrainer/layers/layer_context.cpp @@ -169,6 +169,19 @@ Tensor &RunLayerContext::getWeightGrad(unsigned int idx) const { return weights[idx]->getGradientRef(); } +/** + * @brief Get the Weight Gradient tensor object + * + * @param idx Identifier of the weight + * @return Tensor& Reference to the weight grad tensor + */ +Tensor &RunLayerContext::getWeightFP32(unsigned int idx) const { + if (!weights[idx]->hasGradient()) + throw std::invalid_argument( + "Requesting gradient for a non-trainable weight."); + return weights[idx]->getVariableFP32Ref(); +} + /** * @brief Get the Weight Optimizer Variable tensor object * diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h index e5c6759638..e2f428aa2c 100644 --- a/nntrainer/layers/layer_context.h +++ b/nntrainer/layers/layer_context.h @@ -463,6 +463,15 @@ class RunLayerContext { Tensor &getWeightGrad(unsigned int idx) const; /** + * @brief Get the Weight Gradient tensor object + * + * @param idx Identifier of the weight + * @return Tensor& Reference to the weight grad tensor + */ + Tensor &getWeightFP32(unsigned int idx) const; + + /** + * @brief Get the Weight Optimizer Variable tensor object * * @param idx Identifier of the weight diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h index f8e5b7c4e9..7dfb1bd1a0 100644 --- a/nntrainer/layers/layer_node.h +++ b/nntrainer/layers/layer_node.h @@ -496,11 +496,11 @@ class LayerNode final : public ml::train::Layer, public GraphNode { NNTR_THROW_IF(!run_context, std::runtime_error) << __func__ << " layer needs to be finalized first!"; if (run_context->weightHasGradient(idx)) { - return Weight(run_context->getWeight(idx), - run_context->getWeightGrad(idx), - run_context->getWeightName(idx)); + return Weight( + run_context->getWeight(idx), run_context->getWeightGrad(idx), + run_context->getWeightFP32(idx), run_context->getWeightName(idx)); } else { - return Weight(run_context->getWeight(idx), Tensor(), + return Weight(run_context->getWeight(idx), Tensor(), Tensor(), run_context->getWeightName(idx)); } } diff --git a/nntrainer/tensor/manager.cpp b/nntrainer/tensor/manager.cpp index 572bd217cf..b4ac106b12 100644 --- a/nntrainer/tensor/manager.cpp +++ b/nntrainer/tensor/manager.cpp @@ -689,8 +689,8 @@ bool Manager::isSecondLastAccess(const std::string &name, */ 
std::vector Manager::requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, - const TensorLifespan &lifespan, bool is_grad_clip, bool is_mixed_precision, - Tensor::Initializer initializer) { + const std::string &suffix, const TensorLifespan &lifespan, bool is_grad_clip, + bool is_mixed_precision, Tensor::Initializer initializer) { std::vector ret; ret.reserve(dims.size()); @@ -706,7 +706,7 @@ std::vector Manager::requestWeightOptimizerVariables( /// @note this is assuming weight optimizer variables is treated as weight, if /// not, there is room to optimize below behavior for (unsigned int idx = 0; idx < dims.size(); idx++) - ret.push_back(weight_pool.request(name + ":opt" + std::to_string(idx), + ret.push_back(weight_pool.request(name + suffix + std::to_string(idx), dims[idx], exec, lifespan, initializer)); return ret; diff --git a/nntrainer/tensor/manager.h b/nntrainer/tensor/manager.h index 1fa810a35c..80ffb9d21d 100644 --- a/nntrainer/tensor/manager.h +++ b/nntrainer/tensor/manager.h @@ -224,7 +224,8 @@ class Manager { */ std::vector requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, - const TensorLifespan &lifespan, bool is_grad_clip, bool is_mixed_type, + const std::string &suffix, const TensorLifespan &lifespan, + bool is_grad_clip, bool is_mixed_type, Tensor::Initializer initializer = Tensor::Initializer::NONE); /** diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index 4f1e8e0721..b14bbd7ae4 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -3065,6 +3065,18 @@ Tensor Tensor::clone() const { return t; } +Tensor Tensor::clone(ml::train::TensorDim::DataType type) const { + if (getDataType() == type) + return clone(); + + TensorDim dim = getDim(); + dim.setDataType(type); + Tensor t(dim, true); + t.copyData(*this); + t.name = name; + return t; +} + void Tensor::reshape(const TensorDim &d) { NNTR_THROW_IF(!contiguous, std::invalid_argument) diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h index 211334da40..2ea0393e66 100644 --- a/nntrainer/tensor/tensor.h +++ b/nntrainer/tensor/tensor.h @@ -1680,6 +1680,13 @@ class Tensor { */ Tensor clone() const; + /** + * @brief Convient wrapper for inplace copy of @a this. + * @param[in] type output tensor data type + * @retval Copied version of this + */ + Tensor clone(ml::train::TensorDim::DataType type) const; + /** * @brief Save the Tensor into file * @param[in] file output file stream diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp index f86c327842..904948491f 100644 --- a/nntrainer/tensor/weight.cpp +++ b/nntrainer/tensor/weight.cpp @@ -90,34 +90,24 @@ Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g, } } -Weight::Weight(const Tensor &v, const Tensor &g, const std::string &n, - bool is_dependent, unsigned int output_axis_) : +Weight::Weight(const Tensor &v, const Tensor &g, const Tensor &v32, + const std::string &n, bool is_dependent, + unsigned int output_axis_) : Var_Grad(v, g, n, is_dependent), regularizer(WeightRegularizer::NONE), regularizer_constant(1.0f), decay(0.0f), clip_by_global_norm(0.0f), output_axis(output_axis_), - loss_scale(0.0) { + loss_scale(0.0), + var32(std::make_shared(n + ":fp32")) { - std::string var32_suffix = ":fp32"; - std::string var32_name = n + var32_suffix; - - /** - * @note We assume here that Weight is created with variable and gradient - * tensor. It is not copy or clone and, therefore, we do need create var32 if - * it is trainable. 
For now, We haven't seen the case create wieght with var, - * grad and var32. But we will add weight constructor if there is the cases. - */ - - if (!g.empty() && v.getDataType() != ml::train::TensorDim::DataType::FP32) { + if (!g.empty() && isMixedPrecision()) { TensorDim var32_dim(v.getDim()); var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); - - var32 = std::make_shared(var32_dim, true, Tensor::Initializer::NONE, - var32_name); - } else { - var32 = std::make_shared(var32_name); + if (!v32.empty()) + var32 = std::make_shared( + v32.getSharedDataTensor(var32_dim, 0, false, n + ":fp32")); } } diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h index 3a81d1f58e..1d69d7c72d 100644 --- a/nntrainer/tensor/weight.h +++ b/nntrainer/tensor/weight.h @@ -114,6 +114,7 @@ class Weight : public Var_Grad { * * @param v Already created variable object * @param g Already created gradient object + * @param v32 Already created gradient object * @param n Name for this Weight * * @note This is primarily used to created wrapper of variable extracted from @@ -123,8 +124,9 @@ class Weight : public Var_Grad { * uses only, as Weight does not own the tensors v and g, and can go invalid * if the owner of these tensors free the tensors. */ - explicit Weight(const Tensor &v, const Tensor &g, const std::string &n = "", - bool is_dependent = false, unsigned int output_axis_ = 3); + explicit Weight(const Tensor &v, const Tensor &g, const Tensor &v32, + const std::string &n = "", bool is_dependent = false, + unsigned int output_axis_ = 3); /** * @brief Construct a new Weight object @@ -324,7 +326,7 @@ class Weight : public Var_Grad { * @return false otherwise */ bool isMixedPrecision() const { - return var->getDataType() == ml::train::TensorDim::DataType::FP32; + return var->getDataType() != ml::train::TensorDim::DataType::FP32; } /** @@ -337,6 +339,13 @@ class Weight : public Var_Grad { grad->multiply_i(clip_by_global_norm / (global_norm + epsilon)); } + /** + * @brief Get the variable FP32 tensor (by reference) + * + * @return Tensor Variable FP32 tensor + */ + Tensor &getVariableFP32Ref() { return *var32.get(); } + private: static constexpr float epsilon = 1e-6; /**< epsilon for zero comparison */ static constexpr float epsilon_decay = From 6b71b42ca1d42de4f0825729a8953da51cdcc0f0 Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Tue, 7 May 2024 13:24:49 +0900 Subject: [PATCH 03/14] [ Layers ] Update Layers to support FP16 This PR enables the FP16 support for the layers below: . input layer . mse loss layer Resolves: **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. 
Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/layers/input_layer.cpp | 19 ++++++++++++++++--- nntrainer/layers/input_layer.h | 3 ++- nntrainer/layers/loss/mse_loss_layer.cpp | 11 ++++++++++- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/nntrainer/layers/input_layer.cpp b/nntrainer/layers/input_layer.cpp index eabd40b297..d9f058d8ce 100644 --- a/nntrainer/layers/input_layer.cpp +++ b/nntrainer/layers/input_layer.cpp @@ -33,8 +33,7 @@ namespace nntrainer { static constexpr size_t SINGLE_INOUT_IDX = 0; InputLayer::InputLayer() : - Layer(), - input_props(props::Normalization(), props::Standardization()) {} + Layer(), input_props(props::Normalization(), props::Standardization()) {} void InputLayer::setProperty(const std::vector &values) { auto remain_props = loadProperties(values, input_props); @@ -47,7 +46,7 @@ void InputLayer::forwarding(RunLayerContext &context, bool training) { Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX); if (!context.executeInPlace()) { Tensor &input_ = context.getInput(SINGLE_INOUT_IDX); - hidden_.copy(input_); + hidden_.copyData(input_); } if (std::get(input_props)) @@ -70,7 +69,21 @@ void InputLayer::finalize(InitLayerContext &context) { std::vector output_dims = context.getInputDimensions(); + for (auto &d : output_dims) { + d.setDataType(context.getActivationDataType()); + } + context.setOutputDimensions(output_dims); + + is_inplace = true; + + /** + * @note Input Layer assuems that the FP32 IN Tensor always. Therefore, if the + * activation data type is not fp32, then it does not support in-place + * operation. + */ + if (context.getActivationDataType() != ml::train::TensorDim::DataType::FP32) + is_inplace = false; } } /* namespace nntrainer */ diff --git a/nntrainer/layers/input_layer.h b/nntrainer/layers/input_layer.h index f6728d676b..e9183e23d1 100644 --- a/nntrainer/layers/input_layer.h +++ b/nntrainer/layers/input_layer.h @@ -82,7 +82,7 @@ class InputLayer : public Layer { /** * @copydoc Layer::supportInPlace() */ - bool supportInPlace() const override { return true; } + bool supportInPlace() const override { return is_inplace; } /** * @copydoc Layer::exportTo(Exporter &exporter, ml::train::ExportMethods @@ -105,6 +105,7 @@ class InputLayer : public Layer { private: std::tuple input_props; + bool is_inplace; }; } // namespace nntrainer diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp index 7f7bd1626f..ec9bc9b844 100644 --- a/nntrainer/layers/loss/mse_loss_layer.cpp +++ b/nntrainer/layers/loss/mse_loss_layer.cpp @@ -20,7 +20,16 @@ static constexpr size_t SINGLE_INOUT_IDX = 0; void MSELossLayer::forwarding(RunLayerContext &context, bool training) { Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX); - Tensor &y = context.getInput(SINGLE_INOUT_IDX); + + Tensor empty_tensor; + Tensor &y = context.getInput(SINGLE_INOUT_IDX).getDataType() == + ml::train::TensorDim::DataType::FP32 + ? context.getInput(SINGLE_INOUT_IDX) + : empty_tensor; + + if (y.empty()) + y = context.getInput(SINGLE_INOUT_IDX) + .clone(ml::train::TensorDim::DataType::FP32); // hidden_ <- y2 - y; if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { From 544de98e10ae11b0a667588d679d38694b5a9ebc Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Tue, 7 May 2024 13:26:42 +0900 Subject: [PATCH 04/14] [ Test ] Mixed Precision Test Case This PR includes the mixed precision test case. . Input - FC - MSE : "batch_size=2", "model_tensor_type=FP16-FP16", "loss_scale=128" **Self evaluation:** 1. 
Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- Applications/KNN/jni/meson.build | 2 +- nntrainer/graph/network_graph.cpp | 2 +- test/unittest/models/meson.build | 4 ++ .../unittest_models_mixed_precision.cpp | 54 +++++++++++++++++++ 4 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 test/unittest/models/unittest_models_mixed_precision.cpp diff --git a/Applications/KNN/jni/meson.build b/Applications/KNN/jni/meson.build index bc50dc0214..58ca099d75 100644 --- a/Applications/KNN/jni/meson.build +++ b/Applications/KNN/jni/meson.build @@ -15,4 +15,4 @@ e = executable('knn_sample', install_dir: application_install_dir ) -test('app_knn', e, args: [nntr_app_resdir / 'KNN']) +test('app_knn', e, args: [nntr_app_resdir / 'KNN/']) diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index c0ee126c93..97e20f6a24 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -771,7 +771,7 @@ NetworkGraph::finalizeContext(const std::shared_ptr &lnode, /// @note try move inplace control to finalize bool shared_var = false, shared_grad = false; - if (lnode->executeInPlace() != InPlace::NONE) { + if (lnode->executeInPlace() != InPlace::NONE && lnode->supportInPlace()) { setInplaceSharedMemoryConfigByLayer(lnode, shared_var, shared_grad); for (unsigned int i = 0; i < out_specs.size(); ++i) { auto &s = out_specs.at(i); diff --git a/test/unittest/models/meson.build b/test/unittest/models/meson.build index 7166fc41ff..4a6e81e65d 100644 --- a/test/unittest/models/meson.build +++ b/test/unittest/models/meson.build @@ -11,6 +11,10 @@ models_targets = [ # disable temperally ] +if get_option('enable-fp16') + models_targets += 'unittest_models_mixed_precision.cpp' +endif + test_target += models_targets exe = executable( test_name, diff --git a/test/unittest/models/unittest_models_mixed_precision.cpp b/test/unittest/models/unittest_models_mixed_precision.cpp new file mode 100644 index 0000000000..becf11ff44 --- /dev/null +++ b/test/unittest/models/unittest_models_mixed_precision.cpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Jijoong Moon + * + * @file unittest_models_mixed_precision.cpp + * @date 3 May 2024 + * @brief unittest models to cover mixed precision + * @see https://github.com/nnstreamer/nntrainer + * @author Jijoong Moon + * @bug No known bugs except for NYI items + */ + +#include + +#include + +#include +#include +#include + +#include + +using namespace nntrainer; + +static std::unique_ptr fc_mixed_training() { + std::unique_ptr nn(new NeuralNetwork()); + nn->setProperty( + {"batch_size=2", "model_tensor_type=FP16-FP16", "loss_scale=128"}); + + auto graph = makeGraph({ + {"input", {"name=in", "input_shape=1:1:3"}}, + {"Fully_connected", {"name=fc", "input_layers=in", "unit=10"}}, + {"mse", {"name=loss", "input_layers=fc"}}, + }); + for (auto &node : graph) { + nn->addLayer(node); + } + + nn->setOptimizer(ml::train::createOptimizer("adam", {"learning_rate = 0.1"})); + + return nn; +} + +GTEST_PARAMETER_TEST( + MixedPrecision, nntrainerModelTest, + ::testing::ValuesIn({ + mkModelTc_V2(fc_mixed_training, "fc_mixed_training", + ModelTestOption::NO_THROW_RUN_V2), + /** ModelTestOption::ALL_V2), + * Disabled for now to check + */ + }), + [](const testing::TestParamInfo &info) + -> const auto & { return std::get<1>(info.param); }); From 26ea6af4430f7bae48d37c9ce9490cbaa81b3ec6 Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: 
Thu, 9 May 2024 14:52:07 +0900 Subject: [PATCH 05/14] [ Optimizer ] Update Optimizer / Adam to support Mixed training This commit modify apply gradient in optimizer. We do not need to save optimizer variables in weight type. Only Optimizer needs the optimizer variables and we should update the weight with full precision to maintain the accuracy. Therefore, remove the var32 tensors for optimizer variables. Resolves: **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/graph/network_graph.cpp | 10 ------ nntrainer/optimizers/adam.cpp | 27 +++++++++++++--- nntrainer/optimizers/optimizer_context.cpp | 7 ++++ nntrainer/optimizers/optimizer_context.h | 14 ++++++-- nntrainer/tensor/manager.cpp | 6 ++-- nntrainer/tensor/weight.cpp | 37 ++++++++++++++++++++++ nntrainer/tensor/weight.h | 35 ++++++++++---------- 7 files changed, 99 insertions(+), 37 deletions(-) diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index 97e20f6a24..b7f4d1cffd 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -1560,16 +1560,6 @@ void NetworkGraph::requestOptimizerVariable( dims, w->getName(), ":opt", TensorLifespan::MAX_LIFESPAN, w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), Tensor::Initializer::ZEROS)); - - if (w->isMixedPrecision()) { - for (auto &dim : dims) - dim.setDataType(ml::train::TensorDim::DataType::FP32); - w->setOptimizerVariables32( - tensor_manager->requestWeightOptimizerVariables( - dims, w->getName(), ":opt32:", TensorLifespan::MAX_LIFESPAN, - w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), - Tensor::Initializer::ZEROS)); - } } } } diff --git a/nntrainer/optimizers/adam.cpp b/nntrainer/optimizers/adam.cpp index 18c0a0fcc1..530e7fdf31 100644 --- a/nntrainer/optimizers/adam.cpp +++ b/nntrainer/optimizers/adam.cpp @@ -36,7 +36,15 @@ Adam::~Adam() {} enum AdamParams { wm, wv }; std::vector Adam::getOptimizerVariableDim(const TensorDim &dim) { - return {dim, dim}; + /** + * @note We assume the optimizer parameters should be full precsion to + * maintain the accuracy even in mixed precision training. + */ + TensorDim wm_dim(dim); + TensorDim wv_dim(dim); + wm_dim.setDataType(ml::train::TensorDim::DataType::FP32); + wv_dim.setDataType(ml::train::TensorDim::DataType::FP32); + return {wm_dim, wv_dim}; } void Adam::exportTo(Exporter &exporter, @@ -64,7 +72,15 @@ double Adam::getUpdatedLearningRate(unsigned int iteration, double ll) const { } void Adam::applyGradient(RunOptimizerContext &context) { - Tensor &x_grad = context.getGradient(); + Tensor empty_tensor; + + Tensor &x_grad = + context.getGradient().getDataType() == ml::train::TensorDim::DataType::FP32 + ? 
context.getGradient() + : empty_tensor; + + if (x_grad.empty()) + x_grad = context.getGradient().clone(ml::train::TensorDim::DataType::FP32); auto &beta1 = std::get(adam_props).get(); auto &beta2 = std::get(adam_props).get(); @@ -91,7 +107,7 @@ void Adam::applyGradient(RunOptimizerContext &context) { denom.add_i(epsilon); wm.divide(denom, x_grad); - context.applyGradient(context.getLearningRate() / biasCorrection1); + context.applyGradient(context.getLearningRate() / biasCorrection1, x_grad); } else { std::function sqrtEps = [epsilon](double f) { @@ -100,8 +116,9 @@ void Adam::applyGradient(RunOptimizerContext &context) { x_grad = wv.apply(sqrtEps, x_grad); x_grad.multiply_i(wm); - context.applyGradient(getUpdatedLearningRate(context.getIteration(), - context.getLearningRate())); + context.applyGradient( + getUpdatedLearningRate(context.getIteration(), context.getLearningRate()), + x_grad); } } diff --git a/nntrainer/optimizers/optimizer_context.cpp b/nntrainer/optimizers/optimizer_context.cpp index da4cd1f7e9..f70ab773a9 100644 --- a/nntrainer/optimizers/optimizer_context.cpp +++ b/nntrainer/optimizers/optimizer_context.cpp @@ -42,4 +42,11 @@ Tensor &RunOptimizerContext::getOptimizerVariable(unsigned int idx) const { void RunOptimizerContext::applyGradient(double lr) const { weight->applyGradient(lr); } + +/** + * @brief Apply the gradient with the given learning rate and gradient + */ +void RunOptimizerContext::applyGradient(double lr, Tensor &updated_grad) const { + weight->applyGradient(lr, updated_grad); +} } // namespace nntrainer diff --git a/nntrainer/optimizers/optimizer_context.h b/nntrainer/optimizers/optimizer_context.h index 62f9e0945d..6b4b983e35 100644 --- a/nntrainer/optimizers/optimizer_context.h +++ b/nntrainer/optimizers/optimizer_context.h @@ -35,9 +35,7 @@ class RunOptimizerContext { * */ RunOptimizerContext(Weight *w = nullptr, size_t iter = 0, double lr = 0.0) : - weight(w), - iteration(iter), - learning_rate(lr) {} + weight(w), iteration(iter), learning_rate(lr) {} /** * @brief Get the Weight tensor object @@ -75,6 +73,16 @@ class RunOptimizerContext { */ void applyGradient(double lr) const; + /** + * @brief Apply the gradient with the given learning rate and updated + * gradient + * + * @param lr learning rate + * @param updated_grad gradient tensor which is updated. 
(usually it could be + * fp32) + */ + void applyGradient(double lr, Tensor &updated_grad) const; + /** * @brief Get the current iteration value * diff --git a/nntrainer/tensor/manager.cpp b/nntrainer/tensor/manager.cpp index b4ac106b12..14d710b3c0 100644 --- a/nntrainer/tensor/manager.cpp +++ b/nntrainer/tensor/manager.cpp @@ -471,9 +471,9 @@ std::vector Manager::requestWeights( } } - weights_v2.emplace_back( - std::make_unique(var, grad, var32, w_reg, w_reg_const, decay, - is_dependent, clip_by_global_norm)); + weights_v2.emplace_back(std::make_unique( + var, grad, var32, w_reg, w_reg_const, decay, is_dependent, + clip_by_global_norm, axis, loss_scale)); } std::transform(weights_v2.begin() + current_size, weights_v2.end(), diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp index 904948491f..d8db5ba094 100644 --- a/nntrainer/tensor/weight.cpp +++ b/nntrainer/tensor/weight.cpp @@ -127,4 +127,41 @@ Weight::Weight(Tensor *v, Tensor *g, Tensor *v32, const WeightRegularizer reg, var32 = std::make_shared(); } +void Weight::applyGradient(double lr, Tensor &updated_grad) { + if (isMixedPrecision() && + updated_grad.getDataType() == ml::train::TensorDim::DataType::FP32) { + updated_grad.divide(loss_scale); + var32->add_i(updated_grad, -lr); + quantizeWeight(); + return; + } + + return applyGradient(lr); +} + +void Weight::quantizeWeight() { + if (!isMixedPrecision()) + return; + + Tensor &var = getVariableRef(); + ml::train::TensorDim::DataType type = var.getDataType(); + switch (type) { + case ml::train::TensorDim::DataType::QINT4: + // NYI + break; + case ml::train::TensorDim::DataType::QINT8: + // NYI + break; + case ml::train::TensorDim::DataType::FP16: + getVariableRef().copy(getVariableFP32Ref()); + break; + case ml::train::TensorDim::DataType::FP32: + break; + default: + break; + } + + return; +} + } // namespace nntrainer diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h index 1d69d7c72d..5382c686e1 100644 --- a/nntrainer/tensor/weight.h +++ b/nntrainer/tensor/weight.h @@ -46,7 +46,7 @@ class Weight : public Var_Grad { decay(0.0f), clip_by_global_norm(0.0f), output_axis(3), - loss_scale(0.0) {} + loss_scale(1.0) {} /** * @brief Construct a new Weight object @@ -66,7 +66,7 @@ class Weight : public Var_Grad { const float reg_const = 1.0f, const float decay = 0.0f, const float clip_by_global_norm = 0.0f, bool ng = true, bool alloc_now = false, std::string name = "", unsigned int axis = 3, - float loss_scale_ = 0.0); + float loss_scale_ = 1.0); /** * @brief Construct a new Weight object @@ -87,7 +87,7 @@ class Weight : public Var_Grad { const float reg_const = 1.0f, const float decay = 0.0f, const float clip_by_global_norm = 0.0f, bool ng = true, bool alloc_now = false, std::string name = "", unsigned int axis = 3, - float loss_scale_ = 0.0); + float loss_scale_ = 1.0); /** * @brief Construct a new Weight object @@ -141,7 +141,7 @@ class Weight : public Var_Grad { const WeightRegularizer reg, const float reg_const, const float decay, bool is_dependent = false, const float max_norm = 0.0f, unsigned int output_axis_ = 3, - float loss_scale_ = 0.0f); + float loss_scale_ = 1.0f); /** * @brief Swap for weight @@ -223,16 +223,6 @@ class Weight : public Var_Grad { opt_vars = tensors; } - /** - * @brief Add optimizer variables32 - * We assume if the datatype of weight is not FP32, then it needs to set - * OptmizerVarialbe32 to maintain acccuracy. 
- * @param tensors OptimizerVariable32 Tensor list - */ - void setOptimizerVariables32(std::vector tensors) { - opt_vars32 = tensors; - } - /** * @brief Get optimizer variable reference * @param idx Index of the optimizer variable to get @@ -297,6 +287,13 @@ class Weight : public Var_Grad { */ void applyGradient(double lr) { var->add_i(*grad.get(), -lr); } + /** + * @brief Apply the gradient to the weight with updated gradient + * @param[in] updated_grad gradient tensor which is updated in optimizer + * it might be different data type with gradient in weight. .eg : FP32 + */ + void applyGradient(double lr, Tensor &updated_grad); + /** * @brief Check if the gradient is supposed to be clipped by global norm with * the given max_norm value @@ -346,6 +343,12 @@ class Weight : public Var_Grad { */ Tensor &getVariableFP32Ref() { return *var32.get(); } + /** + * @brief Quantize var32 to var + * + */ + void quantizeWeight(); + private: static constexpr float epsilon = 1e-6; /**< epsilon for zero comparison */ static constexpr float epsilon_decay = @@ -357,8 +360,8 @@ class Weight : public Var_Grad { float clip_by_global_norm; /**< constant factor to clip gradient by L2 norm */ unsigned int output_axis; float loss_scale; - std::vector opt_vars; /**< optimizer variables */ - std::vector opt_vars32; + std::vector + opt_vars; /**< optimizer variables : We assume it is always full-precsion*/ std::shared_ptr var32; /** From 738ee93089f762ce49836ef83786c5d979f1c057 Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Wed, 8 May 2024 19:04:18 +0900 Subject: [PATCH 06/14] [ Tensor ] add is_NaN check in Tensor This PR add is_NaN function to check if the tensor has NaN value. This is for the check NaN during mixed precision training. **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- meson.build | 17 +++-- nntrainer/tensor/blas_avx.cpp | 96 +++++++++++++++++++++++++++++ nntrainer/tensor/blas_avx.h | 20 ++++++ nntrainer/tensor/blas_interface.cpp | 36 +++++++++++ nntrainer/tensor/blas_interface.h | 10 +++ nntrainer/tensor/blas_neon.cpp | 39 ++++++++++++ nntrainer/tensor/blas_neon.h | 18 ++++++ nntrainer/tensor/meson.build | 9 ++- nntrainer/tensor/tensor.cpp | 12 ++++ nntrainer/tensor/tensor.h | 6 ++ packaging/nntrainer.spec | 13 +++- test/unittest/models/meson.build | 23 ++++++- 12 files changed, 288 insertions(+), 11 deletions(-) diff --git a/meson.build b/meson.build index d4aea330a4..7ae692e6d9 100644 --- a/meson.build +++ b/meson.build @@ -64,9 +64,19 @@ warning_c_flags = [ '-Wno-error=varargs' ] +arch = host_machine.cpu_family() + +if get_option('enable-avx') + extra_defines += '-DUSE_AVX=1' + if get_option('platform') == 'tizen' + add_project_arguments(['-mavx2'], language: ['c','cpp']) + else + add_project_arguments(['-march=native'], language: ['c','cpp']) + endif + message('-march=native added for AVX hardware acceleration.') +endif if get_option('enable-fp16') - arch = host_machine.cpu_family() if get_option('platform') == 'android' add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp']) extra_defines += '-DENABLE_FP16=1' @@ -105,11 +115,6 @@ if get_option('enable-fp16') if cc.version().version_compare('>=12.1.0') message ('Float16 for x86_64 enabled. 
Modern gcc-x64 generally supports float16 with _Float16.') extra_defines += '-DENABLE_FP16=1' - if get_option('enable-avx') - extra_defines += '-DUSE_AVX=1' - add_project_arguments(['-march=native'], language: ['c','cpp']) - message('-march=native added for AVX hardware acceleration.') - endif else warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.') endif diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp index ce59583d6f..2fd4908463 100644 --- a/nntrainer/tensor/blas_avx.cpp +++ b/nntrainer/tensor/blas_avx.cpp @@ -20,6 +20,7 @@ namespace nntrainer::avx { +#ifdef ENABLE_FP16 void vcvt_f16_f32(size_t N, const void *input, float *output) { assert(N != 0); assert(input != NULL); @@ -114,4 +115,99 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) { } } +bool hasNaN(const size_t N, const _Float16 *input) { + assert(N != 0); + assert(input != NULL); + + int temp = 0; + size_t idx = 0; + + // 16 single-precision check : ( X != X ) + for (; N - idx >= 16; idx += 16) { + const __m256 vec0 = + _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + const __m256 vec1 = + _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input + 8)); + + input += 16; + + __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + + if (temp) + return true; + + __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res1); + + if (temp) + return true; + } + + // 8 single-precision check : ( X != X ) + for (; N - idx >= 8; idx += 8) { + const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + input += 8; + __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + + if (temp) + return true; + } + + // remain check : ( X != X ) + while (idx < N) { + if (*input != *input) { + return true; + } + ++input; + } + + return false; +} +#endif + +bool hasNaN(const size_t N, const float *input) { + assert(N != 0); + assert(input != NULL); + + int temp = 0; + size_t idx = 0; + + // 16 single-precision check : ( X != X ) + for (; N - idx >= 16; idx += 16) { + const __m256 vec0 = _mm256_loadu_ps(input); + const __m256 vec1 = _mm256_loadu_ps(input + 8); + input += 16; + __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res1); + + if (temp) + return true; + } + + // 8 single-precision check : ( X != X ) + for (; N - idx >= 8; idx += 8) { + const __m256 vec = _mm256_loadu_ps(input); + input += 8; + __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + + if (temp) + return true; + } + + // remain check : ( X != X ) + while (idx < N) { + if (*input != *input) { + return true; + } + ++input; + } + + return false; +} + } // namespace nntrainer::avx diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/blas_avx.h index ab1270a208..d25ded103f 100644 --- a/nntrainer/tensor/blas_avx.h +++ b/nntrainer/tensor/blas_avx.h @@ -20,6 +20,7 @@ namespace nntrainer::avx { +#ifdef ENABLE_FP16 /** * @brief Converts half-precision floating point values to single-precision * floating point values. 
@@ -40,6 +41,25 @@ void vcvt_f16_f32(size_t N, const void *input, float *output); */ void vcvt_f32_f16(size_t N, const float *input, void *output); +/** + * @brief check if the X has NaN value + * @note it compare !(x==x) + * @param[in] N length of the vector + * @param[in] X half-precision * for Vector X + * @param[out] true if it has NaN + */ +bool hasNaN(const size_t N, const _Float16 *X); +#endif + +/** + * @brief check if the X has NaN value + * @note it compare !(x==x) + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[out] true if it has NaN + */ +bool hasNaN(const size_t N, const float *X); + } // namespace nntrainer::avx #endif /* __cplusplus */ diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index 9be6fb9911..78cb708e53 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -1038,6 +1038,16 @@ static void ele_div_fallback(const unsigned int N, const float *X, } } +static bool has_nan_fallback(const size_t N, const float *X) { + for (size_t i = 0; i < N; ++i) { + if (*X != *X) + return true; + ++X; + } + + return false; +} + void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, float alpha, float beta, unsigned int i_stride, unsigned int o_stride) { @@ -1090,4 +1100,30 @@ void ele_div(const unsigned int N, const float *X, const float *Y, float *Z, ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); } +bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type, + const void *X) { + if (d_type == ml::train::TensorDim::DataType::FP16) { +#ifdef ENABLE_FP16 + const _FP16 *vec = (const _FP16 *)X; +#ifdef USE_NEON + return nntrainer::neon::hasNaN(N, vec); +#elif defined(USE_AVX) + return nntrainer::avx::hasNaN(N, vec); +#else + throw std::invalid_argument("Error: enable-fp16 is not enabled"); +#endif +#endif + } else if (d_type == ml::train::TensorDim::DataType::FP32) { + const float *vec = (const float *)X; +#ifdef USE_NEON + return nntrainer::neon::hasNaN(N, vec); +#elif defined(USE_AVX) + return nntrainer::avx::hasNaN(N, vec); +#endif + + return has_nan_fallback(N, vec); + } + return false; +} + } // namespace nntrainer diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h index 04a8a23018..bcd557111e 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/blas_interface.h @@ -478,6 +478,16 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, void ele_div(const unsigned N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, unsigned int o_stride = 1); + +/** + * @brief check if X array has NaN + * @param[in] N length of the vector + * @param[in] X float/fp16 * for Vector X + * @param[out] bool true if NaN else false + */ +bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type, + const void *X); + } /* namespace nntrainer */ #endif /* __cplusplus */ #endif /* __BLAS_INTERFACE_H__ */ diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index 6f02978e1f..5062bd7c35 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -546,6 +546,25 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, } } +bool hasNaN(const size_t N, const float *X) { + bool temp = false; + size_t i = 0; + for (; N - i >= 4; i += 4) { + float32x4_t vec = vld1q_f32(&X[i]); + uint32x4_t vcmp = vceqq_f32(vec, vec); + if (vaddvq_u32(vcmp)) + return true; + } + + 
while (i < N) { + if (X[i] != X[i]) + return true; + ++i; + } + + return temp; +} + #ifdef ENABLE_FP16 void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t M, uint32_t N, @@ -1972,5 +1991,25 @@ void inv_sqrt_inplace(const unsigned int N, __fp16 *X) { } } +bool hasNaN(const size_t N, const __fp16 *input) { + bool temp = 0; + size_t i = 0; + for (; N - i >= 8; i += 8) { + float16x8_t vec = vld1q_f16(&input[i]); + uint16x8_t vcmp = vceqq_f16(vec, vec); + + if (vaddvq_u16(vcmp)) + return true; + } + + while (i < N) { + if (input[i] != input[i]) + return true; + ++i; + } + + return temp; +} + #endif } // namespace nntrainer::neon diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h index db1b6a5ccc..6da5e952e1 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/blas_neon.h @@ -148,6 +148,15 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, void ele_div(const unsigned N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f); +/** + * @brief check if the X has NaN value + * @note it compare !(x==x) + * @param[in] N length of the vector + * @param[in] input float * for Vector X + * @param[out] true if it has NaN + */ +bool hasNaN(const size_t N, const float *input); + #ifdef ENABLE_FP16 /** * @brief hgemv computation with neon : Y = alpha*A*X + beta*Y @@ -380,6 +389,15 @@ void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, uint32_t M, * @param X __fp16 * for Vector X */ void inv_sqrt_inplace(const unsigned int N, __fp16 *X); + +/** + * @brief check if the X has NaN value + * @note it compare !(x==x) + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[out] true if it has NaN + */ +bool hasNaN(const size_t N, const __fp16 *X); #endif } // namespace nntrainer::neon diff --git a/nntrainer/tensor/meson.build b/nntrainer/tensor/meson.build index 0884dbd3b4..b14fa0ee85 100644 --- a/nntrainer/tensor/meson.build +++ b/nntrainer/tensor/meson.build @@ -44,6 +44,12 @@ cl_headers = [ arch = host_machine.cpu_family() + +if get_option('enable-avx') + tensor_sources += 'blas_avx.cpp' + tensor_headers += 'blas_avx.h' +endif + if get_option('enable-fp16') if arch == 'arm' error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. 
armv7 is not supported.') @@ -55,9 +61,6 @@ if get_option('enable-fp16') nntrainer_inc += include_directories('hgemm') nntrainer_inc_abs += meson.current_source_dir() / 'hgemm' endif - elif get_option('enable-avx') - tensor_sources += 'blas_avx.cpp' - tensor_headers += 'blas_avx.h' endif endif diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index b14bbd7ae4..f9db2e2ab0 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -3820,6 +3820,18 @@ void Tensor::dequantize(Tensor &output, unsigned int axis) const { return; } +bool Tensor::hasNaN() const { + if (getDataType() == Tdatatype::FP16) { +#ifdef ENABLE_FP16 + return has_nan(dim.getDataLen(), Tdatatype::FP16, getData<_FP16>()); +#else + throw std::invalid_argument("enble-fp16 is not set"); +#endif + } else { + return has_nan(dim.getDataLen(), Tdatatype::FP32, getData()); + } +} + // namespace nntrainer } /* namespace nntrainer */ diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h index 2ea0393e66..968ec4d502 100644 --- a/nntrainer/tensor/tensor.h +++ b/nntrainer/tensor/tensor.h @@ -2038,6 +2038,12 @@ class Tensor { static constexpr float epsilon = 1e-5; + /** + * @brief check if there is NaN element + * @param[out] bool true if there is NaN else false + */ + bool hasNaN() const; + private: /**< handle the data as a std::shared_ptr type */ TensorDim dim; diff --git a/packaging/nntrainer.spec b/packaging/nntrainer.spec index 36ba371d22..2f1dc57f68 100644 --- a/packaging/nntrainer.spec +++ b/packaging/nntrainer.spec @@ -65,6 +65,13 @@ %define neon_support -Denable-neon=false %endif # arch aarch64 +%ifarch x86_64 +%define enable_avx 1 +%define avx_support -Denable-avx=true +%else +%define avx_support -Denable-avx=false +%endif # arch aarch64 + Name: nntrainer Summary: Software framework for training neural networks @@ -410,7 +417,7 @@ meson --buildtype=plain --prefix=%{_prefix} --sysconfdir=%{_sysconfdir} \ %{enable_reduce_tolerance} %{configure_subplugin_install_path} %{enable_debug} \ -Dml-api-support=enabled -Denable-nnstreamer-tensor-filter=enabled \ -Denable-nnstreamer-tensor-trainer=enabled -Denable-capi=enabled \ - %{fp16_support} %{neon_support} build + %{fp16_support} %{neon_support} %{avx_support} build ninja -C build %{?_smp_mflags} @@ -563,6 +570,10 @@ cp -r result %{buildroot}%{_datadir}/nntrainer/unittest/ %{_includedir}/nntrainer/util_simd_neon.h %endif +%if 0%{?enable_avx} +%{_includedir}/nntrainer/blas_avx.h +%endif + %files devel-static %{_libdir}/libnntrainer*.a %exclude %{_libdir}/libcapi*.a diff --git a/test/unittest/models/meson.build b/test/unittest/models/meson.build index 4a6e81e65d..3f17369f94 100644 --- a/test/unittest/models/meson.build +++ b/test/unittest/models/meson.build @@ -1,4 +1,5 @@ test_name = 'unittest_models' +mixed_test_name = 'unittest_mixed_models' test_target = [] @@ -11,8 +12,28 @@ models_targets = [ # disable temperally ] +mixed_test_targets = [ + 'models_test_utils.cpp', + 'models_golden_test.cpp', + 'unittest_models_mixed_precision.cpp', +] + if get_option('enable-fp16') - models_targets += 'unittest_models_mixed_precision.cpp' + mixed_exe = executable( + mixed_test_name, + mixed_test_targets, + include_directories: include_directories('.'), + dependencies: [ + nntrainer_test_main_deps, nntrainer_ccapi_dep + ], + install: get_option('enable-test'), + install_dir: application_install_dir + ) + + test(mixed_test_name, mixed_exe, + args: '--gtest_output=xml:@0@/@1@.xml'.format(meson.build_root(), mixed_test_name), + timeout: test_timeout 
+ ) endif test_target += models_targets From 364d80affafca442ba72582aa0d12b0202c79783 Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Sat, 11 May 2024 14:00:04 +0900 Subject: [PATCH 07/14] [ Context ] Add loss scale in Context & using mse loss This PR add loss scale parameter in runcontext and use it to update mse loss. . Add Loss Scale Parameter in RunLayerContext Constructor . Add applyLossScale func to update return derivitive in Loss Layer . Change MSE Loss Layer to apply the loss scale to return derivitive **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/graph/network_graph.cpp | 6 ++-- nntrainer/layers/layer_context.cpp | 3 +- nntrainer/layers/layer_context.h | 36 +++++++++++++++++--- nntrainer/layers/layer_node.cpp | 9 ++--- nntrainer/layers/layer_node.h | 3 +- nntrainer/layers/loss/loss_layer.cpp | 9 ++++- nntrainer/layers/loss/loss_layer.h | 7 ++++ nntrainer/layers/loss/mse_loss_layer.cpp | 13 ++++++- nntrainer/layers/time_dist.cpp | 16 ++++----- nntrainer/models/model_common_properties.h | 2 +- nntrainer/tensor/weight.cpp | 2 +- test/unittest/layers/layers_golden_tests.cpp | 2 +- test/unittest/layers/unittest_layer_node.cpp | 4 +-- 13 files changed, 85 insertions(+), 27 deletions(-) diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index b7f4d1cffd..297cd3e881 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -880,7 +880,8 @@ NetworkGraph::finalizeContext(const std::shared_ptr &lnode, lnode->getTrainable(), shared_weight_names), inputs, outputs, tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(), - lnode->getTrainable(), shared_tensor_names)); + lnode->getTrainable(), shared_tensor_names), + init_context.getLossScale()); return outputs; } @@ -1028,7 +1029,8 @@ NetworkGraph::refinalizeContext(const std::shared_ptr &lnode, // TODO: update weights spec for trainable based on layer trainable prop weights, inputs, outputs, tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(), - lnode->getTrainable(), shared_tensor_names)); + lnode->getTrainable(), shared_tensor_names), + init_context.getLossScale()); return outputs; } diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp index f0856c1dbb..fbbc9ecaff 100644 --- a/nntrainer/layers/layer_context.cpp +++ b/nntrainer/layers/layer_context.cpp @@ -126,13 +126,14 @@ const std::vector &InitLayerContext::getOutSpecs() const { } RunLayerContext::RunLayerContext(const std::string &name, bool trainable, - float l, bool in_place_, + float l, bool in_place_, float loss_scale_, const std::vector &w, const std::vector &in, const std::vector &out, const std::vector &t) : loss(l), in_place(in_place_), + loss_scale(loss_scale_), weights(w), inputs(in), outputs(out), diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h index e2f428aa2c..09bccc2c73 100644 --- a/nntrainer/layers/layer_context.h +++ b/nntrainer/layers/layer_context.h @@ -63,7 +63,7 @@ class InitLayerContext { const float max_norm = 0.0, std::array tensor_type_ = {"NCHW", "FP32", "FP32"}, - const float loss_scale = 0.0); + const float loss_scale = 1.0); /** * @brief get Tensor Format of Layer * @@ -348,6 +348,14 @@ class InitLayerContext { */ bool executeInPlace() const { return in_place; } + /** + * @brief get Initial value of Loss_Scale. 
This is set to RunLayerContext + * and updated + * + * @return loss_scale + */ + float getLossScale() const { return loss_scale; } + private: std::vector input_dim; /**< Input dimensions for the layer */ bool in_place; /**< if the layer is expected to run in-place */ @@ -385,7 +393,7 @@ class RunLayerContext { * @brief Construct a new Run Layer Context object * */ - RunLayerContext() : loss(0.0), in_place(false) {} + RunLayerContext() : loss(0.0), in_place(false), loss_scale(1.0) {} /** * @brief Construct a new Run Layer Context object @@ -396,6 +404,17 @@ class RunLayerContext { std::get(props).set(name); } + /** + * @brief Construct a new Run Layer Context object + * + */ + RunLayerContext(const std::string &name, bool in_place_, float loss_scale_) : + RunLayerContext() { + in_place = in_place_; + std::get(props).set(name); + loss_scale = loss_scale_; + } + /** * @brief Construct a new Run Layer Context object * @@ -403,13 +422,15 @@ class RunLayerContext { * @param trainable if the layer is trainable * @param l loss of the layer * @param in_place_ execution in-place of the layer + * @param loss_scale loss_scale of the layer * @param w weights of the layer * @param in inputs of the layer * @param out outputs of the layer * @param t extra tensors of the layer */ RunLayerContext(const std::string &name, bool trainable, float l, - bool in_place_, const std::vector &w, + bool in_place_, float loss_scale_, + const std::vector &w, const std::vector &in, const std::vector &out, const std::vector &t); @@ -883,10 +904,17 @@ class RunLayerContext { */ ml::train::LayerComputeEngine getComputeEngine() { return compute_engine; } + /** + * @brief get loss scale + * @return loss scale + */ + float getLossScale() { return loss_scale; } + private: std::tuple props; /**< props of the layer */ float loss; /**< loss of the layer */ - bool in_place; /**< if the layer is expected to run in-place */ + bool in_place; /**< if the layer is expected to run in-place */ + float loss_scale; /**< loss_scale of the layer */ std::vector weights; /**< weights of the layer */ std::vector inputs; /**< inputs of the layer */ diff --git a/nntrainer/layers/layer_node.cpp b/nntrainer/layers/layer_node.cpp index 8b18d80762..f41752a4d8 100644 --- a/nntrainer/layers/layer_node.cpp +++ b/nntrainer/layers/layer_node.cpp @@ -599,7 +599,7 @@ InitLayerContext LayerNode::finalize(const std::vector &input_dims, const auto &scope = getSharedFrom().empty() ? 
getName() : getSharedFrom(); float max_norm = 0.0; - float loss_scale = 0.0; + float loss_scale = 1.0; if (!std::get(*layer_node_props).empty()) max_norm = std::get(*layer_node_props).get(); @@ -864,10 +864,11 @@ float LayerNode::getLoss() const { return *loss; } void LayerNode::configureRunContext(const std::vector &weights, const std::vector &inputs, const std::vector &outputs, - const std::vector &tensors) { + const std::vector &tensors, + float loss_scale) { run_context = std::make_unique( - getName(), getTrainable(), 0.0f, executeInPlace() != InPlace::NONE, weights, - inputs, outputs, tensors); + getName(), getTrainable(), 0.0f, executeInPlace() != InPlace::NONE, + loss_scale, weights, inputs, outputs, tensors); } /** diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h index 7dfb1bd1a0..3fd2d55b97 100644 --- a/nntrainer/layers/layer_node.h +++ b/nntrainer/layers/layer_node.h @@ -820,7 +820,8 @@ class LayerNode final : public ml::train::Layer, public GraphNode { void configureRunContext(const std::vector &weights, const std::vector &inputs, const std::vector &outputs, - const std::vector &tensors); + const std::vector &tensors, + float loss_scale); /** * @brief Preset modes for printing summary for the layer diff --git a/nntrainer/layers/loss/loss_layer.cpp b/nntrainer/layers/loss/loss_layer.cpp index 40f74717f8..ab2ccf8be2 100644 --- a/nntrainer/layers/loss/loss_layer.cpp +++ b/nntrainer/layers/loss/loss_layer.cpp @@ -22,7 +22,7 @@ void LossLayer::finalize(InitLayerContext &context) { d.setDataType( str_converter::from_string("FP32")); - + context.setOutputDimensions(output_dim); } @@ -36,6 +36,13 @@ void LossLayer::updateLoss(RunLayerContext &context, const Tensor &l) { context.setLoss(loss_sum / (float)l.batch()); } +void LossLayer::applyLossScale(RunLayerContext &context, Tensor &ret_deriv) { + + float loss_scale = context.getLossScale(); + if (loss_scale != 1.0) + ret_deriv.multiply_i(loss_scale); +} + /** * @copydoc Layer::setProperty(const std::vector &values) */ diff --git a/nntrainer/layers/loss/loss_layer.h b/nntrainer/layers/loss/loss_layer.h index 00b520f6e6..581e9477a8 100644 --- a/nntrainer/layers/loss/loss_layer.h +++ b/nntrainer/layers/loss/loss_layer.h @@ -60,6 +60,13 @@ class LossLayer : public Layer { */ void updateLoss(RunLayerContext &context, const Tensor &l); + /** + * @brief update return derivative with loss scale + * @param context Run context to update + * @param return_dev Tensor data to calculate + */ + void applyLossScale(RunLayerContext &context, Tensor &l); + Tensor l; /**< loss tensor to store intermediate value to calculate loss value */ }; diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp index ec9bc9b844..3aed8125e0 100644 --- a/nntrainer/layers/loss/mse_loss_layer.cpp +++ b/nntrainer/layers/loss/mse_loss_layer.cpp @@ -50,8 +50,17 @@ void MSELossLayer::forwarding(RunLayerContext &context, bool training) { } void MSELossLayer::calcDerivative(RunLayerContext &context) { + Tensor empty_tensor; Tensor &ret_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX); - const Tensor &y2 = context.getIncomingDerivative(SINGLE_INOUT_IDX); + const Tensor &y2_ = context.getIncomingDerivative(SINGLE_INOUT_IDX); + Tensor &y2 = empty_tensor; + + if (ret_derivative.getDataType() == ml::train::TensorDim::DataType::FP32) + y2 = y2_; + + if (y2.empty()) + y2 = y2_.clone(ret_derivative.getDataType()); + Tensor &y = context.getInput(SINGLE_INOUT_IDX); y.subtract(y2, ret_derivative); @@ -60,6 +69,8 
@@ void MSELossLayer::calcDerivative(RunLayerContext &context) { throw std::runtime_error( "[MSELossLayer::calcDerivative] Error when calculating loss"); } + + LossLayer::applyLossScale(context, ret_derivative); } } // namespace nntrainer diff --git a/nntrainer/layers/time_dist.cpp b/nntrainer/layers/time_dist.cpp index 80451416df..779010065a 100644 --- a/nntrainer/layers/time_dist.cpp +++ b/nntrainer/layers/time_dist.cpp @@ -256,8 +256,8 @@ void TimeDistLayer::forwarding(RunLayerContext &context, bool training) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->forwarding(dist_context, training); } @@ -303,8 +303,8 @@ void TimeDistLayer::calcDerivative(RunLayerContext &context) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->calcDerivative(dist_context); } @@ -354,8 +354,8 @@ void TimeDistLayer::calcGradient(RunLayerContext &context) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->calcGradient(dist_context); } @@ -396,8 +396,8 @@ void TimeDistLayer::setBatch(RunLayerContext &context, unsigned int batch) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->setBatch(dist_context, batch); diff --git a/nntrainer/models/model_common_properties.h b/nntrainer/models/model_common_properties.h index 3776afefca..3435d18e96 100644 --- a/nntrainer/models/model_common_properties.h +++ b/nntrainer/models/model_common_properties.h @@ -217,7 +217,7 @@ class ModelTensorDataType final : public EnumProperty { */ class LossScale : public Property { public: - LossScale(float value = 0.0f); + LossScale(float value = 1.0f); static constexpr const char *key = "loss_scale"; /**< unique key to access */ using prop_tag = float_prop_tag; /**< property type */ }; diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp index d8db5ba094..df262f50d9 100644 --- a/nntrainer/tensor/weight.cpp +++ b/nntrainer/tensor/weight.cpp @@ -99,7 +99,7 @@ Weight::Weight(const Tensor &v, const Tensor &g, const Tensor &v32, decay(0.0f), clip_by_global_norm(0.0f), output_axis(output_axis_), - loss_scale(0.0), + loss_scale(1.0), var32(std::make_shared(n + ":fp32")) { if (!g.empty() && isMixedPrecision()) { diff --git a/test/unittest/layers/layers_golden_tests.cpp b/test/unittest/layers/layers_golden_tests.cpp index 64400e6ecd..c71d653c05 100644 --- a/test/unittest/layers/layers_golden_tests.cpp +++ b/test/unittest/layers/layers_golden_tests.cpp @@ -156,7 +156,7 @@ static RunLayerContext prepareRunContext(const TensorPacks &packs) { }; auto rc = - RunLayerContext("golden", true, 0.0f, false, create_view(weights), + 
RunLayerContext("golden", true, 0.0f, false, 1.0, create_view(weights), create_view(ins), create_view(outs), create_view(tensors)); auto num_outputs = rc.getNumOutputs(); diff --git a/test/unittest/layers/unittest_layer_node.cpp b/test/unittest/layers/unittest_layer_node.cpp index 3b41f02f30..37287f7ce5 100644 --- a/test/unittest/layers/unittest_layer_node.cpp +++ b/test/unittest/layers/unittest_layer_node.cpp @@ -131,7 +131,7 @@ TEST(nntrainer_LayerNode, finalize_05_n) { nntrainer::createLayerNode(nntrainer::IdentityLayer::type)); EXPECT_NO_THROW(lnode->setProperty({"input_shape=1:1:1", "name=abc"})); EXPECT_NO_THROW(lnode->finalize()); - EXPECT_NO_THROW(lnode->configureRunContext({}, {&input}, {}, {})); + EXPECT_NO_THROW(lnode->configureRunContext({}, {&input}, {}, {}, 1.0)); EXPECT_THROW(lnode->finalize(), std::runtime_error); } @@ -298,7 +298,7 @@ TEST(nntrainer_LayerNode, setWeights_02_n) { EXPECT_NO_THROW(lnode = nntrainer::createLayerNode(nntrainer::IdentityLayer::type)); EXPECT_NO_THROW(lnode->setProperty({"input_shape=1:1:1", "name=abc"})); - EXPECT_NO_THROW(lnode->configureRunContext({&weight}, {&input}, {}, {})); + EXPECT_NO_THROW(lnode->configureRunContext({&weight}, {&input}, {}, {}, 1.0)); EXPECT_THROW(lnode->setWeights(new_weights), std::runtime_error); } From 77741a2341e0b7e0dbfcd8c8fd9e6d8db09c7df7 Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Mon, 13 May 2024 17:09:00 +0900 Subject: [PATCH 08/14] [ Mixed Precision ] Enable Mixed Precision This PR enables the Mixed Precision Training. For now only FP16-FP32 is considered. Additional Test cases will be added. . add getSortedLayerIdx to set the graph order for fowarding. . change clip_weights to lazy_apply_weights to use both cases. . add fowarding_op to run forwarding from that layer which has a gradient with nan. . add while loop for re-run backwarding after reset the loss scale. . add setLossScale in RunLayerContext . add check the gradient if mixed precsion enable. **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. 
Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/graph/graph_core.cpp | 9 +++ nntrainer/graph/graph_core.h | 8 ++ nntrainer/graph/network_graph.cpp | 113 ++++++++++++++++++++++------- nntrainer/graph/network_graph.h | 28 +++++-- nntrainer/layers/layer_context.cpp | 11 +++ nntrainer/layers/layer_context.h | 26 +++++++ nntrainer/models/neuralnet.cpp | 43 +++++++++-- nntrainer/tensor/blas_avx.cpp | 4 +- nntrainer/tensor/weight.cpp | 2 +- nntrainer/tensor/weight.h | 7 ++ 10 files changed, 207 insertions(+), 44 deletions(-) diff --git a/nntrainer/graph/graph_core.cpp b/nntrainer/graph/graph_core.cpp index b624e066e4..3eafbb9261 100644 --- a/nntrainer/graph/graph_core.cpp +++ b/nntrainer/graph/graph_core.cpp @@ -35,6 +35,10 @@ GraphCore::getSortedNode(unsigned int ith) const { return Sorted.at(ith); } +const unsigned int GraphCore::getSortedNodeIdx(const std::string &name) const { + return sorted_node_map.at(name); +} + void GraphCore::makeAdjacencyList( std::vector>> &adj) { /** initialize the adj list */ @@ -93,6 +97,11 @@ void GraphCore::topologicalSort() { if (Sorted.size() != node_list.size()) throw std::runtime_error("Internal error in topologicalSort"); + unsigned int idx = 0; + for (auto n : Sorted) { + sorted_node_map[n->getName()] = idx; + idx++; + } } const std::shared_ptr & diff --git a/nntrainer/graph/graph_core.h b/nntrainer/graph/graph_core.h index 83d3ce7c39..77aa63666a 100644 --- a/nntrainer/graph/graph_core.h +++ b/nntrainer/graph/graph_core.h @@ -91,6 +91,13 @@ class GraphCore { */ const std::shared_ptr &getSortedNode(unsigned int ith) const; + /** + * @brief getter of Sorted GraphNode index with name + * @param[in] layer name + * @ret index + */ + const unsigned int getSortedNodeIdx(const std::string &name) const; + /** * @brief getter of GraphNode with node name * @param[in] node name @@ -252,6 +259,7 @@ class GraphCore { std::vector> node_list; /**< Unordered Node List */ std::unordered_map node_map; /**< Unordered Node map */ + std::unordered_map sorted_node_map; /**< Unordered Node map */ std::vector> Sorted; /**< Ordered Node List */ bool sorted; /** if the node_list is sorted */ diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index 297cd3e881..ac703e490b 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -337,7 +337,7 @@ void NetworkGraph::applyGradients( continue; } - if (rc.isGradientClipByGlobalNorm(i)) { + if (rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) { /** * @note the weights whose gradient are to be clipped by global norm will * be clipped at once at the end of iteration and applied then. 
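The backwarding() rework below implements dynamic loss scaling: when any gradient comes back NaN, the scale is reduced, forwarding is re-run from the failing layer, and the whole backward pass is retried (the while loop added in neuralnet.cpp); after a run of clean iterations the scale is nudged back up. A rough, self-contained sketch of that policy (the type name, starting value, and retry threshold here are illustrative stand-ins, not the actual NetworkGraph members):

struct LossScaleState {
  float scale = 65536.0f;       // illustrative starting scale
  unsigned int clean_steps = 0; // iterations since the last overflow

  // a gradient overflowed: shrink the scale; the caller redoes the step
  void onOverflow() {
    scale = scale > 1.5f ? scale - 0.5f : 1.0f;
    clean_steps = 0;
  }

  // the step applied cleanly: after enough clean steps, grow the scale again
  void onCleanStep() {
    if (++clean_steps > 10) {
      scale += 2.0f;
      clean_steps = 0;
    }
  }
};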
@@ -393,56 +393,100 @@ sharedConstTensors NetworkGraph::incremental_forwarding( return out; } -void NetworkGraph::backwarding( +bool NetworkGraph::backwarding( int iteration, - std::function, int)> &backwarding_op, - std::function &apply_grad_clip_op, - std::function stop_cb, void *userdata) const { + std::function, bool)> &forwarding_op, + std::function, int)> &backwarding_op, + std::function &lazy_apply_grad_op, + std::function stop_cb, void *userdata) { /** * last layer backwarding is run out of this loop */ auto iter_begin = getBackwardingBeginIter(); auto iter_end = getBackwardingEndIter(); + bool has_nan = false; /// there is no layer to train, so backwarding is essentially noop if (iter_begin == iter_end) { - return; + return true; } auto const &lptr_begin = (*iter_begin); + // graph_const_reverse_iterator + auto iter_ = iter_begin; if (lptr_begin->requireLabel() == false) throw std::runtime_error( "Error: last layer does not accept label, we can't train"); - for (auto iter = iter_begin; iter != iter_end && !stop_cb(userdata); iter++) { - auto &ln = *iter; + for (iter_ = iter_begin; iter_ != iter_end && !stop_cb(userdata); iter_++) { + auto &ln = *iter_; PROFILE_TIME_START(profile_keys.at(ln->getType())); - backwarding_op(ln, iteration); + has_nan = backwarding_op(ln, iteration); PROFILE_TIME_END(profile_keys.at(ln->getType())); + + if (has_nan) { + std::cout << "Gradient has NaN" << std::endl; + break; + } } - /** perform clipping of the gradients by global norm if any */ - if (clip_weights.empty()) - return; + if (has_nan) { + /** if has NaN + * 1. reset the loss scale. + * 2. run forwarding from cur_iter to cend() && !stop_cb(userdata); + * 3. return false --> run backwarding again; + */ + float scale = (*iter_)->getRunContext().getLossScale(); + float s = scale > 1.5f ? 
scale - 0.5f : 1.0f; + + resetLossScale(s); - /** calculate the global norm */ - Tensor global_norm_t( - TensorDim({1u, 1u, 1u, (unsigned int)clip_weights.size()})); - float *global_norm_data = global_norm_t.getData(); - for (unsigned int idx = 0; idx < clip_weights.size(); idx++) { - auto const &w = clip_weights[idx]; - global_norm_data[idx] = w->getGradientNorm(); + auto f_iter = cbegin() + graph.getSortedNodeIdx((*iter_)->getName()); + + for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) { + auto &ln = *iter; + PROFILE_TIME_START(profile_keys.at(ln->getType())); + forwarding_op(*iter, true); + PROFILE_TIME_END(profile_keys.at(ln->getType())); + } + + return false; } - float global_norm = global_norm_t.l2norm(); - /** apply the gradient with the above global norm */ - for (auto w : clip_weights) { - w->clipGradientByGlobalNorm(global_norm); + + /** perform clipping of the gradients by global norm if any */ + if (lazy_weights.empty()) + return true; + + if (is_clip_grad) { + /** calculate the global norm */ + Tensor global_norm_t( + TensorDim({1u, 1u, 1u, (unsigned int)lazy_weights.size()})); + float *global_norm_data = global_norm_t.getData(); + for (unsigned int idx = 0; idx < lazy_weights.size(); idx++) { + auto const &w = lazy_weights[idx]; + global_norm_data[idx] = w->getGradientNorm(); + } + float global_norm = global_norm_t.l2norm(); + /** apply the gradient with the above global norm */ + for (auto w : lazy_weights) { + w->clipGradientByGlobalNorm(global_norm); + } } /** apply the gradient with the above global norm */ - for (auto w : clip_weights) { - apply_grad_clip_op(*w, iteration); + for (auto w : lazy_weights) { + lazy_apply_grad_op(*w, iteration); + } + nan_count++; + + if (nan_count > 10) { + float scale = (*iter_)->getRunContext().getLossScale(); + float s = scale + 2.0f; + resetLossScale(s); + nan_count = 0; } + + return true; } LayerNode *NetworkGraph::computeBackwardEnd() { @@ -1290,11 +1334,19 @@ int NetworkGraph::initialize(ExecutionMode mode, /** select weights which would require clipping of the gradients by global * norm if any */ - clip_weights = tensor_manager->getWeights([](const Weight *w) { + lazy_weights = tensor_manager->getWeights([](const Weight *w) { return w->hasGradient() && w->isGradientLastAccess() && - w->isGradientClipByGlobalNorm(); + (w->isGradientClipByGlobalNorm() || w->isMixedPrecision()); }); + is_clip_grad = false; + for (auto w : lazy_weights) { + if (w->isGradientClipByGlobalNorm()) { + is_clip_grad = true; + break; + } + } + return ML_ERROR_NONE; } @@ -1566,4 +1618,11 @@ void NetworkGraph::requestOptimizerVariable( } } +void NetworkGraph::resetLossScale(float scale) { + for (auto iter = cbegin(); iter != cend(); iter++) { + auto &ln = *iter; + ln->getRunContext().setLossScale(scale); + } +} + } /* namespace nntrainer */ diff --git a/nntrainer/graph/network_graph.h b/nntrainer/graph/network_graph.h index 5c9adf0363..22f14e1b73 100644 --- a/nntrainer/graph/network_graph.h +++ b/nntrainer/graph/network_graph.h @@ -51,7 +51,9 @@ class NetworkGraph { optimize_memory(true), exec_mode(ExecutionMode::TRAIN), tensor_format("NCHW"), - tensor_dtype(split("FP32-FP32", getRegex("\\-"))) {} + tensor_dtype(split("FP32-FP32", getRegex("\\-"))) { + nan_count = 0; + } /** * @brief Constructor of NeuralNetwork Graph Class @@ -73,7 +75,9 @@ class NetworkGraph { optimize_memory(true), exec_mode(ExecutionMode::TRAIN), tensor_format(tensor_format_), - tensor_dtype(split(tensor_dtype_, getRegex("\\-"))) {} + tensor_dtype(split(tensor_dtype_, 
getRegex("\\-"))) { + nan_count = 0; + } /** * @brief Destructor of the NeuralNetwork Graph class @@ -206,13 +210,14 @@ class NetworkGraph { * @param[in] backwarding_op operation for the backwarding * @param[in] apply_grad_clip_op operation for applying the clip gradients */ - void backwarding( + bool backwarding( int iteration, - std::function, int)> &backwarding_op, - std::function &apply_grad_clip_op, + std::function, bool)> &forwarding_op, + std::function, int)> &backwarding_op, + std::function &lazy_apply_grad_op, std::function stop_cb = [](void *user_data) { return false; }, - void *user_data = nullptr) const; + void *user_data = nullptr); /** * @brief get begin iterator for the graph @@ -444,6 +449,12 @@ class NetworkGraph { getLayerExecutionOrders(const std::shared_ptr &lnode); #endif // ENABLE_TEST + /** + * @brief reset the loss scale + * @param[in] scale + */ + void resetLossScale(float scale); + private: std::map sub_in_out; /** This is map to identify input and output layer name of subgraph */ @@ -480,7 +491,10 @@ class NetworkGraph { std::unordered_map profile_keys; /**< profile keys based on the layer type */ std::vector - clip_weights; /**< weights with global norm based clipping enabled */ + lazy_weights; /**< weights with global norm based clipping enabled */ + bool is_clip_grad; + + unsigned int nan_count; /** * @brief topological sort diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp index fbbc9ecaff..5862e6af14 100644 --- a/nntrainer/layers/layer_context.cpp +++ b/nntrainer/layers/layer_context.cpp @@ -416,6 +416,17 @@ bool RunLayerContext::isGradientClipByGlobalNorm(unsigned int idx) const { return weights[idx]->isGradientClipByGlobalNorm(); } +bool RunLayerContext::isMixedPrecision(unsigned int idx) const { + return weights[idx]->isMixedPrecision(); +} + +bool RunLayerContext::isMixedPrecision() const { + for (auto w : weights) + if (w->isMixedPrecision()) + return true; + return false; +} + /** * @brief Get the tensor name * diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h index 09bccc2c73..c68c42f11d 100644 --- a/nntrainer/layers/layer_context.h +++ b/nntrainer/layers/layer_context.h @@ -689,6 +689,20 @@ class RunLayerContext { */ bool isGradientClipByGlobalNorm(unsigned int idx) const; + /** + * @brief check if the weight is mixed precsion + * + * @param idx index + * @return bool true if it is mixed precision + */ + bool isMixedPrecision(unsigned int idx) const; + + /** + * @brief check if the weight is mixed precsion + * @return bool true if it is mixed precision + */ + bool isMixedPrecision() const; + /** * @brief Get the tensor name * @@ -910,6 +924,18 @@ class RunLayerContext { */ float getLossScale() { return loss_scale; } + /** + * @brief set Loss_Scale. 
+ * + * @return loss_scale + */ + void setLossScale(float scale) { + loss_scale = scale; + for (auto w : weights) { + w->setLossScale(scale); + } + } + private: std::tuple props; /**< props of the layer */ float loss; /**< loss of the layer */ diff --git a/nntrainer/models/neuralnet.cpp b/nntrainer/models/neuralnet.cpp index d0e542825f..f7c0914d32 100644 --- a/nntrainer/models/neuralnet.cpp +++ b/nntrainer/models/neuralnet.cpp @@ -412,9 +412,21 @@ void NeuralNetwork::backwarding(int iteration, NNTR_THROW_IF(!opt, std::invalid_argument) << "optimizer is null!"; #endif - std::function, int)> backwarding_op = + std::function, bool)> forwarding_op = + [this, stop_cb, userdata](std::shared_ptr node, + bool training) -> void { + (void)this; + PROFILE_MEM_ANNOTATE("Forwarding for layer: " + node->getName()); + + auto f = std::get<0>(node->getExecutionOrder()); + model_graph.flushCacheExcept(f); + + node->forwarding(training); + }; + + std::function, int)> backwarding_op = [this, stop_cb, userdata](std::shared_ptr node, - int iteration) -> void { + int iteration) -> bool { /** * Do not change this order: * 1. calcGradient @@ -448,19 +460,29 @@ void NeuralNetwork::backwarding(int iteration, /** If gradient must be applied and its not gradient mode, calculate * gradient */ - if (!dynamic_training_opt.isGradientMode() && apply_gradient) + if (!dynamic_training_opt.isGradientMode() && apply_gradient) { node->calcGradient(); + + RunLayerContext &rc = node->getRunContext(); + if (rc.isMixedPrecision()) { + for (auto w : rc.getWeights()) { + if (w->getGradientRef().hasNaN()) + return true; + } + } + } } model_graph.flushCacheExcept(std::get<2>(node->getExecutionOrder())); PROFILE_MEM_ANNOTATE("CalcDerivative: " + node->getName()); if (stop_cb(userdata)) { - return; + return false; } - if (node->needsCalcDerivative()) + if (node->needsCalcDerivative()) { node->calcDerivative(); + } model_graph.flushCacheExcept(std::get<3>(node->getExecutionOrder())); PROFILE_MEM_ANNOTATE("ApplyGradient: " + node->getName()); @@ -476,9 +498,10 @@ void NeuralNetwork::backwarding(int iteration, opt_->applyGradient(opt_context); }); } + return false; }; - std::function apply_grad_clip_op = + std::function lazy_apply_grad_op = [opt_ = opt.get()](Weight &w, int iteration) -> void { w.calcRegularizationGradient(); w.calcWeightDecayGradient(); @@ -487,8 +510,12 @@ void NeuralNetwork::backwarding(int iteration, opt_->applyGradient(opt_context); }; - model_graph.backwarding(iteration, backwarding_op, apply_grad_clip_op, - stop_cb, userdata); + bool ret = false; + + while (!ret) { + ret = model_graph.backwarding(iteration, forwarding_op, backwarding_op, + lazy_apply_grad_op, stop_cb, userdata); + } } void NeuralNetwork::save(const std::string &file_path, diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp index 2fd4908463..ce4d8de47f 100644 --- a/nntrainer/tensor/blas_avx.cpp +++ b/nntrainer/tensor/blas_avx.cpp @@ -127,7 +127,7 @@ bool hasNaN(const size_t N, const _Float16 *input) { const __m256 vec0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); const __m256 vec1 = - _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input + 8)); + _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(input + 8))); input += 16; @@ -161,6 +161,7 @@ bool hasNaN(const size_t N, const _Float16 *input) { return true; } ++input; + ++idx; } return false; @@ -205,6 +206,7 @@ bool hasNaN(const size_t N, const float *input) { return true; } ++input; + ++idx; } return false; diff --git a/nntrainer/tensor/weight.cpp 
b/nntrainer/tensor/weight.cpp index df262f50d9..0e9879540a 100644 --- a/nntrainer/tensor/weight.cpp +++ b/nntrainer/tensor/weight.cpp @@ -153,7 +153,7 @@ void Weight::quantizeWeight() { // NYI break; case ml::train::TensorDim::DataType::FP16: - getVariableRef().copy(getVariableFP32Ref()); + getVariableRef().copyData(getVariableFP32Ref()); break; case ml::train::TensorDim::DataType::FP32: break; diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h index 5382c686e1..8ac3aa0190 100644 --- a/nntrainer/tensor/weight.h +++ b/nntrainer/tensor/weight.h @@ -349,6 +349,13 @@ class Weight : public Var_Grad { */ void quantizeWeight(); + /** + * @brief set loss scale + * param[in] scale + * + */ + void setLossScale(float scale) { loss_scale = scale; }; + private: static constexpr float epsilon = 1e-6; /**< epsilon for zero comparison */ static constexpr float epsilon_decay = From a19b7df205b4b8845ca48aacd7c9541b9c41f2aa Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Tue, 14 May 2024 09:57:38 +0900 Subject: [PATCH 09/14] [ Tensor ] Add inifinity check in Tensor This PR add inifinity value check in Tensor data. . rename the hasNaN to isValid . add infinity check in isValid Function and now it check NaN and Inf . modify to check the blas_avx and blas_neon . modify graph and model check is_valid rather than has_nan . add unittest of isValid Function **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/graph/network_graph.cpp | 8 +- nntrainer/models/neuralnet.cpp | 9 +- nntrainer/tensor/blas_avx.cpp | 104 ++++++++++++++---- nntrainer/tensor/blas_avx.h | 12 +- nntrainer/tensor/blas_interface.cpp | 22 ++-- nntrainer/tensor/blas_interface.h | 8 +- nntrainer/tensor/blas_neon.cpp | 52 ++++++--- nntrainer/tensor/blas_neon.h | 16 +-- nntrainer/tensor/tensor.cpp | 6 +- nntrainer/tensor/tensor.h | 6 +- test/unittest/unittest_nntrainer_tensor.cpp | 24 ++++ .../unittest_nntrainer_tensor_fp16.cpp | 28 +++++ .../unittest_nntrainer_tensor_neon_fp16.cpp | 32 ++++++ 13 files changed, 250 insertions(+), 77 deletions(-) diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index ac703e490b..821731e949 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -404,7 +404,7 @@ bool NetworkGraph::backwarding( */ auto iter_begin = getBackwardingBeginIter(); auto iter_end = getBackwardingEndIter(); - bool has_nan = false; + bool is_valid = true; /// there is no layer to train, so backwarding is essentially noop if (iter_begin == iter_end) { @@ -422,16 +422,16 @@ bool NetworkGraph::backwarding( for (iter_ = iter_begin; iter_ != iter_end && !stop_cb(userdata); iter_++) { auto &ln = *iter_; PROFILE_TIME_START(profile_keys.at(ln->getType())); - has_nan = backwarding_op(ln, iteration); + is_valid = backwarding_op(ln, iteration); PROFILE_TIME_END(profile_keys.at(ln->getType())); - if (has_nan) { + if (!is_valid) { std::cout << "Gradient has NaN" << std::endl; break; } } - if (has_nan) { + if (!is_valid) { /** if has NaN * 1. reset the loss scale. * 2. 
run forwarding from cur_iter to cend() && !stop_cb(userdata); diff --git a/nntrainer/models/neuralnet.cpp b/nntrainer/models/neuralnet.cpp index f7c0914d32..afc560603e 100644 --- a/nntrainer/models/neuralnet.cpp +++ b/nntrainer/models/neuralnet.cpp @@ -466,8 +466,8 @@ void NeuralNetwork::backwarding(int iteration, RunLayerContext &rc = node->getRunContext(); if (rc.isMixedPrecision()) { for (auto w : rc.getWeights()) { - if (w->getGradientRef().hasNaN()) - return true; + if (!w->getGradientRef().isValid()) + return false; } } } @@ -477,7 +477,7 @@ void NeuralNetwork::backwarding(int iteration, PROFILE_MEM_ANNOTATE("CalcDerivative: " + node->getName()); if (stop_cb(userdata)) { - return false; + return true; } if (node->needsCalcDerivative()) { @@ -498,7 +498,7 @@ void NeuralNetwork::backwarding(int iteration, opt_->applyGradient(opt_context); }); } - return false; + return true; }; std::function lazy_apply_grad_op = @@ -510,6 +510,7 @@ void NeuralNetwork::backwarding(int iteration, opt_->applyGradient(opt_context); }; + // return false if the gradient is not valid bool ret = false; while (!ret) { diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp index ce4d8de47f..411dbcbb5d 100644 --- a/nntrainer/tensor/blas_avx.cpp +++ b/nntrainer/tensor/blas_avx.cpp @@ -115,101 +115,163 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) { } } -bool hasNaN(const size_t N, const _Float16 *input) { +bool isValid(const size_t N, const _Float16 *input) { assert(N != 0); assert(input != NULL); int temp = 0; size_t idx = 0; + const __m256 SIGN_MASK = _mm256_set1_ps(-0.0); + const __m256 INF = _mm256_set1_ps(std::numeric_limits::infinity()); + // 16 single-precision check : ( X != X ) for (; N - idx >= 16; idx += 16) { - const __m256 vec0 = - _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); - const __m256 vec1 = + __m256 vec0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + __m256 vec1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(input + 8))); input += 16; + // check NaN in vec0 __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res); + if (temp) + return false; + + // check infinity in vec0 + vec0 = _mm256_andnot_ps(SIGN_MASK, vec0); + vec0 = _mm256_cmp_ps(vec0, INF, _CMP_EQ_OQ); + temp = temp | _mm256_movemask_ps(vec0); if (temp) - return true; + return false; + // check NaN in vec1 __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res1); if (temp) - return true; + return false; + + // check infinity in vec1 + vec1 = _mm256_andnot_ps(SIGN_MASK, vec1); + vec1 = _mm256_cmp_ps(vec1, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec1); + + if (temp) + return false; } // 8 single-precision check : ( X != X ) for (; N - idx >= 8; idx += 8) { - const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); input += 8; __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res); if (temp) - return true; + return false; + + // check infinity in vec1 + vec = _mm256_andnot_ps(SIGN_MASK, vec); + vec = _mm256_cmp_ps(vec, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec); + + if (temp) + return false; } - // remain check : ( X != X ) + // remain check : ( X != X || X == Inf ) while (idx < N) { - if (*input != *input) { - return true; + if (*input != *input || *input == std::numeric_limits::infinity()) { + return false; } ++input; ++idx; } - return false; 
+ return true; } #endif -bool hasNaN(const size_t N, const float *input) { +bool isValid(const size_t N, const float *input) { assert(N != 0); assert(input != NULL); int temp = 0; size_t idx = 0; + const __m256 SIGN_MASK = _mm256_set1_ps(-0.0); + const __m256 INF = _mm256_set1_ps(std::numeric_limits::infinity()); + // 16 single-precision check : ( X != X ) for (; N - idx >= 16; idx += 16) { - const __m256 vec0 = _mm256_loadu_ps(input); - const __m256 vec1 = _mm256_loadu_ps(input + 8); + __m256 vec0 = _mm256_loadu_ps(input); + __m256 vec1 = _mm256_loadu_ps(input + 8); input += 16; __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res); + + if (temp) + return false; + + // check infinity in vec0 + vec0 = _mm256_andnot_ps(SIGN_MASK, vec0); + vec0 = _mm256_cmp_ps(vec0, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec0); + if (temp) + return false; + __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res1); if (temp) - return true; + return false; + + // check infinity in vec1 + vec1 = _mm256_andnot_ps(SIGN_MASK, vec1); + vec1 = _mm256_cmp_ps(vec1, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec1); + + if (temp) + return false; } // 8 single-precision check : ( X != X ) for (; N - idx >= 8; idx += 8) { - const __m256 vec = _mm256_loadu_ps(input); + __m256 vec = _mm256_loadu_ps(input); input += 8; __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res); if (temp) - return true; + return false; + + // check infinity in vec + vec = _mm256_andnot_ps(SIGN_MASK, vec); + vec = _mm256_cmp_ps(vec, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec); + + if (temp) + return false; } // remain check : ( X != X ) while (idx < N) { - if (*input != *input) { - return true; + if (*input != *input || *input == std::numeric_limits::infinity()) { + return false; } ++input; ++idx; } - return false; + return true; } } // namespace nntrainer::avx diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/blas_avx.h index d25ded103f..5eabcbdb2c 100644 --- a/nntrainer/tensor/blas_avx.h +++ b/nntrainer/tensor/blas_avx.h @@ -43,22 +43,22 @@ void vcvt_f32_f16(size_t N, const float *input, void *output); /** * @brief check if the X has NaN value - * @note it compare !(x==x) + * @note it compare (x!=x || x == inf) * @param[in] N length of the vector * @param[in] X half-precision * for Vector X - * @param[out] true if it has NaN + * @param[out] false if it has NaN or inf */ -bool hasNaN(const size_t N, const _Float16 *X); +bool isValid(const size_t N, const _Float16 *X); #endif /** * @brief check if the X has NaN value - * @note it compare !(x==x) + * @note it compare (x!=x || x == inf) * @param[in] N length of the vector * @param[in] X float * for Vector X - * @param[out] true if it has NaN + * @param[out] false if it has NaN or inf */ -bool hasNaN(const size_t N, const float *X); +bool isValid(const size_t N, const float *X); } // namespace nntrainer::avx diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index 78cb708e53..6219919fd8 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -1038,14 +1038,14 @@ static void ele_div_fallback(const unsigned int N, const float *X, } } -static bool has_nan_fallback(const size_t N, const float *X) { +static bool is_valid_fallback(const size_t N, const float *X) { for (size_t i = 0; i < N; ++i) { - if (*X != *X) - return true; + if (*X != *X || *X == 
std::numeric_limits::infinity()) + return false; ++X; } - return false; + return true; } void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, @@ -1100,15 +1100,15 @@ void ele_div(const unsigned int N, const float *X, const float *Y, float *Z, ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); } -bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type, - const void *X) { +bool is_valid(const size_t N, ml::train::TensorDim::DataType d_type, + const void *X) { if (d_type == ml::train::TensorDim::DataType::FP16) { #ifdef ENABLE_FP16 const _FP16 *vec = (const _FP16 *)X; #ifdef USE_NEON - return nntrainer::neon::hasNaN(N, vec); + return nntrainer::neon::isValid(N, vec); #elif defined(USE_AVX) - return nntrainer::avx::hasNaN(N, vec); + return nntrainer::avx::isValid(N, vec); #else throw std::invalid_argument("Error: enable-fp16 is not enabled"); #endif @@ -1116,12 +1116,12 @@ bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type, } else if (d_type == ml::train::TensorDim::DataType::FP32) { const float *vec = (const float *)X; #ifdef USE_NEON - return nntrainer::neon::hasNaN(N, vec); + return nntrainer::neon::isValid(N, vec); #elif defined(USE_AVX) - return nntrainer::avx::hasNaN(N, vec); + return nntrainer::avx::isValid(N, vec); #endif - return has_nan_fallback(N, vec); + return is_valid_fallback(N, vec); } return false; } diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h index bcd557111e..2b5ef72922 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/blas_interface.h @@ -480,13 +480,13 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, unsigned int o_stride = 1); /** - * @brief check if X array has NaN + * @brief check if X array has NaN or inf * @param[in] N length of the vector * @param[in] X float/fp16 * for Vector X - * @param[out] bool true if NaN else false + * @param[out] bool false if not valide else true */ -bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type, - const void *X); +bool is_valid(const size_t N, ml::train::TensorDim::DataType d_type, + const void *X); } /* namespace nntrainer */ #endif /* __cplusplus */ diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index 5062bd7c35..20f4d102ec 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -546,23 +546,34 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, } } -bool hasNaN(const size_t N, const float *X) { - bool temp = false; +bool isValid(const size_t N, const float *X) { size_t i = 0; + float inf_s = std::numeric_limits::infinity(); + float32x4_t inf = vdupq_n_f32(inf_s); + uint16x8_t zero = vdupq_n_f32(0); + for (; N - i >= 4; i += 4) { float32x4_t vec = vld1q_f32(&X[i]); uint32x4_t vcmp = vceqq_f32(vec, vec); + + vcmp = vceqq_f32(vcmp, zero); + if (vaddvq_u32(vcmp)) - return true; + return false; + + vcmp = vceqq_f32(vec, inf); + + if (vaddvq_u16(vcmp)) + return false; } while (i < N) { - if (X[i] != X[i]) - return true; + if (X[i] != X[i] || X[i] == std::numeric_limits::infinity()) + return false; ++i; } - return temp; + return true; } #ifdef ENABLE_FP16 @@ -1991,24 +2002,39 @@ void inv_sqrt_inplace(const unsigned int N, __fp16 *X) { } } -bool hasNaN(const size_t N, const __fp16 *input) { +bool isValid(const size_t N, const __fp16 *input) { bool temp = 0; size_t i = 0; + __fp16 inf_s = std::numeric_limits::infinity(); + float16x8_t inf = vdupq_n_f16(inf_s); + uint16x8_t zero = vdupq_n_f16(0); + 
for (; N - i >= 8; i += 8) { float16x8_t vec = vld1q_f16(&input[i]); + uint16x8_t vcmp = vceqq_f16(vec, vec); - if (vaddvq_u16(vcmp)) - return true; + vcmp = vceqq_f16(vcmp, zero); + + if (vaddvq_u16(vcmp)) { + return false; + } + + vcmp = vceqq_f16(vec, inf); + + if (vaddvq_u16(vcmp)) { + return false; + } } while (i < N) { - if (input[i] != input[i]) - return true; + if (input[i] != input[i] || + input[i] == std::numeric_limits::infinity()) { + return false; + } ++i; } - - return temp; + return true; } #endif diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h index 6da5e952e1..978d3428f7 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/blas_neon.h @@ -149,13 +149,13 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f); /** - * @brief check if the X has NaN value - * @note it compare !(x==x) + * @brief check if the X has NaN value or Inf + * @note it compare (x!=x || x == inf) * @param[in] N length of the vector * @param[in] input float * for Vector X - * @param[out] true if it has NaN + * @param[out] false if it has NaN or Inf */ -bool hasNaN(const size_t N, const float *input); +bool isValid(const size_t N, const float *input); #ifdef ENABLE_FP16 /** @@ -391,13 +391,13 @@ void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, uint32_t M, void inv_sqrt_inplace(const unsigned int N, __fp16 *X); /** - * @brief check if the X has NaN value - * @note it compare !(x==x) + * @brief check if the X is valid: Check NaN or Inf + * @note it compare (x!=x || x == inf) * @param[in] N length of the vector * @param[in] X float * for Vector X - * @param[out] true if it has NaN + * @param[out] false if it has NaN or Inf */ -bool hasNaN(const size_t N, const __fp16 *X); +bool isValid(const size_t N, const __fp16 *X); #endif } // namespace nntrainer::neon diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index f9db2e2ab0..827ba7e979 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -3820,15 +3820,15 @@ void Tensor::dequantize(Tensor &output, unsigned int axis) const { return; } -bool Tensor::hasNaN() const { +bool Tensor::isValid() const { if (getDataType() == Tdatatype::FP16) { #ifdef ENABLE_FP16 - return has_nan(dim.getDataLen(), Tdatatype::FP16, getData<_FP16>()); + return is_valid(dim.getDataLen(), Tdatatype::FP16, getData<_FP16>()); #else throw std::invalid_argument("enble-fp16 is not set"); #endif } else { - return has_nan(dim.getDataLen(), Tdatatype::FP32, getData()); + return is_valid(dim.getDataLen(), Tdatatype::FP32, getData()); } } diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h index 968ec4d502..ad3781526f 100644 --- a/nntrainer/tensor/tensor.h +++ b/nntrainer/tensor/tensor.h @@ -2039,10 +2039,10 @@ class Tensor { static constexpr float epsilon = 1e-5; /** - * @brief check if there is NaN element - * @param[out] bool true if there is NaN else false + * @brief check if there is NaN or Inf element + * @param[out] bool false if there is NaN or Inf else false */ - bool hasNaN() const; + bool isValid() const; private: /**< handle the data as a std::shared_ptr type */ diff --git a/test/unittest/unittest_nntrainer_tensor.cpp b/test/unittest/unittest_nntrainer_tensor.cpp index 94aa01836d..d5b6a028f9 100644 --- a/test/unittest/unittest_nntrainer_tensor.cpp +++ b/test/unittest/unittest_nntrainer_tensor.cpp @@ -4704,6 +4704,30 @@ TEST(nntrainer_Tensor, inv_sqrt_i_uncontiguous_p) { } } +/** + * @brief fp16 tensor has NaN + */ 
+TEST(nntrainer_Tensor, is_valid_01) { + size_t batch = 1; + size_t channel = 3; + size_t height = 4; + size_t width = 5; + + nntrainer::Tensor input( + {batch, + channel, + height, + width, + {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}}, + true, nntrainer::Tensor::Initializer::ZEROS); + + EXPECT_EQ(input.isValid(), true); + + input.setValue(0, 0, 0, 0, std::nan("1")); + + EXPECT_EQ(input.isValid(), false); +} + int main(int argc, char **argv) { int result = -1; diff --git a/test/unittest/unittest_nntrainer_tensor_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_fp16.cpp index 2b0d9c040d..58455757c5 100644 --- a/test/unittest/unittest_nntrainer_tensor_fp16.cpp +++ b/test/unittest/unittest_nntrainer_tensor_fp16.cpp @@ -6196,6 +6196,34 @@ TEST(nntrainer_Tensor, dequantize_06_p) { EXPECT_EQ(output, answer3); } +/** + * @brief fp16 tensor has NaN + */ +TEST(nntrainer_Tensor, is_valid_01) { + size_t batch = 1; + size_t channel = 3; + size_t height = 4; + size_t width = 5; + + nntrainer::Tensor input( + {batch, + channel, + height, + width, + {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, + true, nntrainer::Tensor::Initializer::ZEROS); + + EXPECT_EQ(input.isValid(), true); + + input.setValue(0, 0, 0, 0, std::nan("1")); + + EXPECT_EQ(input.isValid(), false); + + input.setValue(0, 0, 0, 0, std::numeric_limits::infinity()); + + EXPECT_EQ(input.isValid(), false); +} + GTEST_API_ int main(int argc, char **argv) { int result = -1; diff --git a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp index d62be87aaf..799a910273 100644 --- a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp +++ b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp @@ -1180,6 +1180,38 @@ TEST(nntrainer_Tensor, inv_sqrt_i_p) { EXPECT_EQ(flag, true); } +/** + * @brief fp16 tensor has NaN + */ +TEST(nntrainer_Tensor, is_valid_01) { + size_t batch = 1; + size_t channel = 3; + size_t height = 4; + size_t width = 5; + + nntrainer::Tensor input( + {batch, + channel, + height, + width, + {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, + true, nntrainer::Tensor::Initializer::ZEROS); + + EXPECT_EQ(input.isValid(), true); + + input.setValue(0, 0, 0, 0, std::nan("1")); + + EXPECT_EQ(input.isValid(), false); + + input.setValue(0, 0, 0, 0, std::numeric_limits::infinity()); + + EXPECT_EQ(input.isValid(), false); + + input.setValue(0, 0, 0, 0, 1); + + EXPECT_EQ(input.isValid(), true); +} + GTEST_API_ int main(int argc, char **argv) { int result = -1; From 139ee5a6eb43ddf707ec990e821f9d17192bbb1a Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Fri, 17 May 2024 13:31:55 +0900 Subject: [PATCH 10/14] [ MSE ] Fix for better MSE loss precision This PR chage the loss computation using full precsion rather than half precsion to maintain accuracy. **Changes proposed in this PR:** - Added TOC generator for README.md Resolves: **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. 
Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/layers/loss/mse_loss_layer.cpp | 34 ++++++++++++++++++------ 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp index 3aed8125e0..ed4390655d 100644 --- a/nntrainer/layers/loss/mse_loss_layer.cpp +++ b/nntrainer/layers/loss/mse_loss_layer.cpp @@ -51,17 +51,27 @@ void MSELossLayer::forwarding(RunLayerContext &context, bool training) { void MSELossLayer::calcDerivative(RunLayerContext &context) { Tensor empty_tensor; - Tensor &ret_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX); - const Tensor &y2_ = context.getIncomingDerivative(SINGLE_INOUT_IDX); - Tensor &y2 = empty_tensor; - if (ret_derivative.getDataType() == ml::train::TensorDim::DataType::FP32) - y2 = y2_; + Tensor &ret_derivative = + context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType() == + ml::train::TensorDim::DataType::FP32 + ? context.getOutgoingDerivative(SINGLE_INOUT_IDX) + : empty_tensor; - if (y2.empty()) - y2 = y2_.clone(ret_derivative.getDataType()); + if (ret_derivative.empty()) + ret_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX) + .clone(ml::train::TensorDim::DataType::FP32); - Tensor &y = context.getInput(SINGLE_INOUT_IDX); + Tensor &y = context.getInput(SINGLE_INOUT_IDX).getDataType() == + ml::train::TensorDim::DataType::FP32 + ? context.getInput(SINGLE_INOUT_IDX) + : empty_tensor; + + if (y.empty()) + y = context.getInput(SINGLE_INOUT_IDX) + .clone(ml::train::TensorDim::DataType::FP32); + + const Tensor &y2 = context.getIncomingDerivative(SINGLE_INOUT_IDX); y.subtract(y2, ret_derivative); float divider = ((float)y.size()) / 2; @@ -70,7 +80,15 @@ void MSELossLayer::calcDerivative(RunLayerContext &context) { "[MSELossLayer::calcDerivative] Error when calculating loss"); } + // Loss Scale needs Full precsiion of ret_derivative. Therefore, + // ret_derivateive should be FP32 when applying scale, and after applying it + // need to convert original type for backpropagating. + LossLayer::applyLossScale(context, ret_derivative); + + if (context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType() != + ml::train::TensorDim::DataType::FP32) + context.getOutgoingDerivative(SINGLE_INOUT_IDX).copyData(ret_derivative); } } // namespace nntrainer From 0d52e0de8bac71996f8bc0f73df7546369b76dee Mon Sep 17 00:00:00 2001 From: Jiho Chu Date: Wed, 6 Mar 2024 11:12:02 +0900 Subject: [PATCH 11/14] [Test] Add conv2d test for fp16 It adds tests for conv2d fp16 test. 
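Alongside the conv2d fp16 golden tests, the adam.cpp hunk below computes the Adam moments on FP32 master copies when they exist and copies the results back into the low-precision tensors. A minimal stand-alone sketch of that master-copy pattern (plain arrays instead of nntrainer::Tensor, bias correction dropped, all names illustrative; in the patch the master weight itself is handled by the Weight class, not by the optimizer):

#include <cmath>
#include <cstddef>
#include <vector>

// half_t stands in for the FP16 storage type; kept as float so the sketch
// compiles everywhere.
using half_t = float;

// One Adam step: moments and the weight live in FP32 masters, and the
// low-precision copies are refreshed at the end of the step.
void adam_step_with_master(std::vector<half_t> &w, const std::vector<half_t> &g,
                           std::vector<float> &m, std::vector<float> &v,
                           std::vector<float> &w_master, float lr, float beta1,
                           float beta2, float eps) {
  for (std::size_t i = 0; i < w.size(); ++i) {
    const float grad = static_cast<float>(g[i]); // widen the gradient first
    m[i] = beta1 * m[i] + (1.0f - beta1) * grad;
    v[i] = beta2 * v[i] + (1.0f - beta2) * grad * grad;
    w_master[i] -= lr * m[i] / (std::sqrt(v[i]) + eps);
    w[i] = static_cast<half_t>(w_master[i]); // quantize back for the next forward
  }
}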
Signed-off-by: Jiho Chu --- nntrainer/optimizers/adam.cpp | 56 ++++-- nntrainer/optimizers/optimizer_context.cpp | 15 ++ nntrainer/optimizers/optimizer_context.h | 15 ++ .../layers/unittest_layers_convolution2d.cpp | 182 ++++++++++++++++++ 4 files changed, 257 insertions(+), 11 deletions(-) diff --git a/nntrainer/optimizers/adam.cpp b/nntrainer/optimizers/adam.cpp index 530e7fdf31..e2d611b7fd 100644 --- a/nntrainer/optimizers/adam.cpp +++ b/nntrainer/optimizers/adam.cpp @@ -95,26 +95,60 @@ void Adam::applyGradient(RunOptimizerContext &context) { Tensor &wm = context.getOptimizerVariable(AdamParams::wm); Tensor &wv = context.getOptimizerVariable(AdamParams::wv); - wm.multiply_i(beta1); - wm.add_i(x_grad, 1.0f - beta1); + if (context.getNumOptMasterVariable() != 0) { + Tensor &wm_m = context.getOptimizerMasterVariable(AdamParams::wm); + Tensor &wv_m = context.getOptimizerMasterVariable(AdamParams::wv); + Tensor x_grad_ = x_grad.clone(wm_m.getDataType()); - wv.multiply_i(beta2); - wv.add_i(x_grad.multiply(x_grad), 1.0f - beta2); + wm_m.multiply_i(beta1); + wm_m.add_i(x_grad_, 1.0f - beta1); + + wv_m.multiply_i(beta2); + wv_m.add_i(x_grad_.multiply(x_grad_), 1.0f - beta2); + + wm.copyData(wm_m); + wv.copyData(wv_m); + } else { + wm.multiply_i(beta1); + wm.add_i(x_grad, 1.0f - beta1); + + wv.multiply_i(beta2); + wv.add_i(x_grad.multiply(x_grad), 1.0f - beta2); + } if (torch_ref) { - Tensor denom = wv.apply(sqrtFloat); - denom.divide_i(sqrtFloat(biasCorrection2)); - denom.add_i(epsilon); - wm.divide(denom, x_grad); + if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP32) { + Tensor denom = wv.apply(sqrtFloat); + denom.divide_i(sqrtFloat(biasCorrection2)); + denom.add_i(epsilon); + wm.divide(denom, x_grad); +#ifdef ENABLE_FP16 + } else if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP16) { + Tensor denom = wv.apply<_FP16>(sqrtFloat<_FP16>); + denom.divide_i(sqrtFloat(biasCorrection2)); + denom.add_i(epsilon); + wm.divide(denom, x_grad); +#endif + } else { + throw std::runtime_error("Not supported datatype"); + } context.applyGradient(context.getLearningRate() / biasCorrection1, x_grad); } else { - std::function sqrtEps = [epsilon](double f) { - return 1 / (sqrtDouble(f) + epsilon); + auto sqrtEps = [epsilon](T f) -> T { + return 1 / (static_cast(sqrtDouble(f)) + static_cast(epsilon)); }; - x_grad = wv.apply(sqrtEps, x_grad); + if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP32) + x_grad = wv.apply(sqrtEps, x_grad); +#ifdef ENABLE_FP16 + else if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP16) + x_grad = wv.apply<_FP16>(sqrtEps, x_grad); +#endif + else + throw std::runtime_error("Not supported datatype"); + x_grad.multiply_i(wm); context.applyGradient( getUpdatedLearningRate(context.getIteration(), context.getLearningRate()), diff --git a/nntrainer/optimizers/optimizer_context.cpp b/nntrainer/optimizers/optimizer_context.cpp index f70ab773a9..f0817c2a75 100644 --- a/nntrainer/optimizers/optimizer_context.cpp +++ b/nntrainer/optimizers/optimizer_context.cpp @@ -36,6 +36,21 @@ Tensor &RunOptimizerContext::getOptimizerVariable(unsigned int idx) const { return weight->getOptimizerVariableRef(idx); } +/** + * @brief Get the optimizer variable associated to this weight + */ +Tensor & +RunOptimizerContext::getOptimizerMasterVariable(unsigned int idx) const { + return weight->getOptimizerMasterVariableRef(idx); +} + +/** + * @brief Get number of optimizer master variable + */ +int RunOptimizerContext::getNumOptMasterVariable() { + return 
weight->getNumOptMasterVariable(); +} + /** * @brief Apply the gradient with the given learning rate */ diff --git a/nntrainer/optimizers/optimizer_context.h b/nntrainer/optimizers/optimizer_context.h index 6b4b983e35..ca30c36b94 100644 --- a/nntrainer/optimizers/optimizer_context.h +++ b/nntrainer/optimizers/optimizer_context.h @@ -59,6 +59,21 @@ class RunOptimizerContext { */ Tensor &getOptimizerVariable(unsigned int idx) const; + /** + * @brief Get the optimizer Master variable associated to this weight + * + * @param idx Identifier of the associated weight + * @return Tensor& Reference to the optimizer variable + */ + Tensor &getOptimizerMasterVariable(unsigned int idx) const; + + /** + * @brief Get number of the optimizer Master variable + * + * @return number of optimizer master variable + */ + int getNumOptMasterVariable(); + /** * @brief Check if run context is set and is ready to use * diff --git a/test/unittest/layers/unittest_layers_convolution2d.cpp b/test/unittest/layers/unittest_layers_convolution2d.cpp index 724c79079b..92d9c593e7 100644 --- a/test/unittest/layers/unittest_layers_convolution2d.cpp +++ b/test/unittest/layers/unittest_layers_convolution2d.cpp @@ -198,3 +198,185 @@ GTEST_PARAMETER_TEST( conv2d_mb_valid_drop_last, conv2d_sb_no_overlap, conv2d_mb_no_overlap, conv2d_sb_1x1_kernel, conv2d_mb_1x1_kernel, conv2d_sb_dilation, conv2d_mb_dilation, conv2d_sb_same_dilation, conv2d_mb_same_dilation)); + +#ifdef ENABLE_FP16 +auto conv2d_sb_minimum_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=2,2"}, "1:1:4:4", + "conv2d_sb_minimum_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_minimum_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=2,2"}, "3:1:4:4", + "conv2d_mb_minimum_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_remain_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=2", "kernel_size=3,3", "padding=same"}, "1:1:4:4", + "conv2d_sb_same_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_remain_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=2", "kernel_size=3,3", "padding=same"}, "3:1:4:4", + "conv2d_mb_same_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_uneven_remain_1_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=same", + }, + "1:3:4:4", "conv2d_sb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_uneven_remain_2_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=0,1,0,1", + }, + "1:3:4:4", "conv2d_sb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_uneven_remain_1_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=same", + }, + "3:3:4:4", "conv2d_mb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_uneven_remain_2_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", 
+ "stride=2,2", + "padding=0,1,0,1", + }, + "3:3:4:4", "conv2d_mb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_valid_drop_last_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=valid", + }, + "1:3:7:7", "conv2d_sb_valid_drop_last_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_valid_drop_last_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=valid", + }, + "3:3:7:7", "conv2d_mb_valid_drop_last_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_no_overlap_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=2,2", "stride=3,3"}, "1:2:5:5", + "conv2d_sb_no_overlap_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_no_overlap_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=3", + "kernel_size=2,2", + "stride=3,3", + }, + "3:2:5:5", "conv2d_mb_no_overlap_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_1x1_kernel_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=1,1", "stride=2,2"}, "1:2:5:5", + "conv2d_sb_1x1_kernel_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_1x1_kernel_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=3", + "kernel_size=1,1", + "stride=2,2", + }, + "3:2:5:5", "conv2d_mb_1x1_kernel_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "dilation=2,2", + }, + "1:3:11:11", "conv2d_sb_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "dilation=2,2", + }, + "3:3:11:11", "conv2d_mb_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "padding=same", + "dilation=2,2", + }, + "1:3:11:11", "conv2d_sb_same_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "padding=same", + "dilation=2,2", + }, + "3:3:11:11", "conv2d_mb_same_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +GTEST_PARAMETER_TEST( + Convolution2D16, LayerGoldenTest, + ::testing::Values(conv2d_sb_minimum_w16a16, conv2d_mb_minimum_w16a16, + conv2d_sb_same_remain_w16a16, conv2d_mb_same_remain_w16a16, + conv2d_sb_same_uneven_remain_1_w16a16, + conv2d_sb_same_uneven_remain_2_w16a16, + conv2d_mb_same_uneven_remain_1_w16a16, + conv2d_mb_same_uneven_remain_2_w16a16, + conv2d_sb_valid_drop_last_w16a16, + conv2d_mb_valid_drop_last_w16a16, + conv2d_sb_no_overlap_w16a16, conv2d_mb_no_overlap_w16a16, + conv2d_sb_1x1_kernel_w16a16, 
conv2d_mb_1x1_kernel_w16a16, + conv2d_sb_dilation_w16a16, conv2d_mb_dilation_w16a16, + conv2d_sb_same_dilation_w16a16, + conv2d_mb_same_dilation_w16a16)); +#endif

From 72917b84bdcf1860f944e43a909c1dbf93f849ba Mon Sep 17 00:00:00 2001 From: Jiho Chu Date: Thu, 14 Mar 2024 19:31:50 +0900 Subject: [PATCH 12/14] [Layers] Modify layers for data type

It is assumed that activations and weights are fully compatible, so no conversion between them is needed. Input and loss layers are handled differently, because input data and label data are assumed to always be float32 for now.

Signed-off-by: Jiho Chu --- nntrainer/layers/bn_layer.cpp | 12 +- nntrainer/layers/conv2d_layer.cpp | 160 +++++---- nntrainer/layers/layer_context.cpp | 51 +++ nntrainer/layers/layer_context.h | 38 +++ nntrainer/layers/layer_devel.h | 5 + nntrainer/layers/layer_node.cpp | 27 +- nntrainer/layers/layer_node.h | 5 + .../loss/cross_entropy_sigmoid_loss_layer.cpp | 3 + .../loss/cross_entropy_softmax_loss_layer.cpp | 36 +- nntrainer/layers/loss/loss_layer.cpp | 3 + nntrainer/layers/loss/loss_layer.h | 13 + nntrainer/layers/loss/meson.build | 4 +- nntrainer/layers/loss/mse_loss_layer.cpp | 75 ++++- nntrainer/layers/lstm.cpp | 42 ++- nntrainer/layers/lstm.h | 1 - nntrainer/layers/pooling2d_layer.cpp | 310 +++++++++++------- nntrainer/layers/reshape_layer.cpp | 1 + 17 files changed, 570 insertions(+), 216 deletions(-)

diff --git a/nntrainer/layers/bn_layer.cpp b/nntrainer/layers/bn_layer.cpp index 1723ac677f..e978b1ef59 100644 --- a/nntrainer/layers/bn_layer.cpp +++ b/nntrainer/layers/bn_layer.cpp @@ -111,6 +111,12 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { context.requestWeight(dim, bnparams_beta, WeightRegularizer::NONE, 1.0f, bias_decay, "beta", true); + /** + * @note declare weight dimension with activation datatype + */ + TensorDim w_dim = dim; + w_dim.setDataType(in_dim.getDataType()); + /** * caches the deviation -> input - avg(input) * @todo check if avoiding this storage and adding dependency on input (no @@ -121,7 +127,7 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { TensorLifespan::ITERATION_LIFESPAN); /** caches the inverse standard deviation */ wt_idx[BNParams::invstd] = - context.requestTensor(dim, "invstd", Tensor::Initializer::NONE, false, + context.requestTensor(w_dim, "invstd", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); /** * Temporary tensor to store the full sized tensors in order to allow batch @@ -136,13 +142,13 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { * caches variance + epsilon as well. */ wt_idx[BNParams::cvar] = - context.requestTensor(dim, "cvar", Tensor::Initializer::NONE, false, + context.requestTensor(w_dim, "cvar", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); /** * Temporary tensor to store the reduced tensors along the axes_to_reduce.
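To make the data-type assumption in the commit message concrete, here is a minimal, hypothetical sketch of the guard that the loss-layer hunks in this patch repeat: the label (always FP32) is cloned to the activation data type once before it is used. The RunLayerContext and Tensor calls mirror those used elsewhere in this diff; the helper name, the fixed index 0, and the include paths are illustrative only and are not part of the patch.

    #include <layer_context.h>
    #include <tensor.h>

    /* Sketch only: subtract an FP32 label from a possibly-FP16 prediction. */
    static void subtractLabelSketch(nntrainer::RunLayerContext &context) {
      nntrainer::Tensor &ret = context.getOutgoingDerivative(0);        /* derivative buffer   */
      nntrainer::Tensor &y = context.getInput(0);                       /* prediction tensor   */
      const nntrainer::Tensor &label = context.getIncomingDerivative(0);/* assumed FP32 label  */

      if (label.getDataType() != y.getDataType())
        y.subtract(label.clone(y.getDataType()), ret); /* convert the label once */
      else
        y.subtract(label, ret);

      /* applyLossScale(ret); -- the protected LossLayer hook this patch adds;
       * every loss layer applies the loss scale here for mixed precision. */
    }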
*/ wt_idx[BNParams::t_reduced] = - context.requestTensor(dim, "tensor_reduced", Tensor::Initializer::NONE, + context.requestTensor(w_dim, "tensor_reduced", Tensor::Initializer::NONE, false, TensorLifespan::FORWARD_DERIV_LIFESPAN); } diff --git a/nntrainer/layers/conv2d_layer.cpp b/nntrainer/layers/conv2d_layer.cpp index c059ae9caf..5d9dbc1e19 100644 --- a/nntrainer/layers/conv2d_layer.cpp +++ b/nntrainer/layers/conv2d_layer.cpp @@ -38,7 +38,8 @@ namespace { static TensorDim calcCol2ImOutputDim(const TensorDim &out, const TensorDim &kdim) { - return TensorDim({kdim.getFeatureLen(), out.width() * out.height()}); + return TensorDim({kdim.getFeatureLen(), out.width() * out.height()}, + out.getTensorType()); } /** @@ -56,7 +57,10 @@ static void col2im(const Tensor &col_matrix, const TensorDim &kdim, const std::array &mstride, const std::array &dilation, Tensor &image) { - auto [pt, pb, pl, pr] = padding; + auto pt = padding[0]; + auto pb = padding[1]; + auto pl = padding[2]; + auto pr = padding[3]; unsigned k_height = kdim.height(); unsigned k_width = kdim.width(); @@ -84,32 +88,48 @@ static void col2im(const Tensor &col_matrix, const TensorDim &kdim, int h_stride_end = im_eff_height - eff_k_height - pt; int w_stride_end = im_eff_width - eff_k_width - pl; - unsigned col_w = 0; - for (int hs = -pt; hs <= h_stride_end; hs += hstride) { - for (int ws = -pl; ws <= w_stride_end; ws += wstride) { - unsigned col_h = 0; - int patch_height_end = hs + eff_k_height; - int patch_width_end = ws + eff_k_width; - for (unsigned c = 0; c < im_channel; c++) { - for (int h = hs; h < patch_height_end; h += hdilation) { - if (h < 0 || im_height <= h) { - col_h += k_width; - continue; - } - for (int w = ws; w < patch_width_end; w += wdilation) { - if (w < 0 || im_width <= w) { - col_h++; + auto apply_data = [&](T *val) { + unsigned col_w = 0; + for (int hs = -pt; hs <= h_stride_end; hs += hstride) { + for (int ws = -pl; ws <= w_stride_end; ws += wstride) { + unsigned col_h = 0; + int patch_height_end = hs + eff_k_height; + int patch_width_end = ws + eff_k_width; + for (unsigned c = 0; c < im_channel; c++) { + for (int h = hs; h < patch_height_end; h += hdilation) { + if (h < 0 || im_height <= h) { + col_h += k_width; continue; } - - float *val = image.getAddress(0, c, h, w); - *val += col_matrix.getValue(0, 0, col_h, col_w); - col_h++; + for (int w = ws; w < patch_width_end; w += wdilation) { + if (w < 0 || im_width <= w) { + col_h++; + continue; + } + + val = image.getAddress(0, c, h, w); + *val += col_matrix.getValue(0, 0, col_h, col_w); + col_h++; + } } } + col_w++; } - col_w++; } + }; + + if (image.getDataType() == nntrainer::Tdatatype::FP32) { + float val; + apply_data(&val); + } +#ifdef ENABLE_FP16 + else if (image.getDataType() == nntrainer::Tdatatype::FP16) { + _FP16 val; + apply_data(&val); + } +#endif + else { + throw std::runtime_error("Not supported datatype"); } } @@ -179,7 +199,10 @@ static void im2col(const Tensor &in, const TensorDim &kdim, // } */ - auto [pt, pb, pl, pr] = padding; + auto pt = padding[0]; + auto pb = padding[1]; + auto pl = padding[2]; + auto pr = padding[3]; unsigned int channel = in.channel(); int in_height = in.height(); @@ -198,46 +221,62 @@ static void im2col(const Tensor &in, const TensorDim &kdim, unsigned int out_width = (width - eff_k_width) / mstride[1] + 1; out.reshape( - TensorDim({out_height * out_width, in.channel() * k_height * k_width})); - float *out_data = out.getData(); - - int h_stride_end = height - eff_k_height - pt; - int w_stride_end = width - eff_k_width - 
pl; - - /// get a patch, size of kernel - /// hs is height_strided, ws is width_strided - unsigned int owidth = out.width(); - unsigned int base_im_w = 0; - for (int hs = -pt; hs <= h_stride_end; hs += mstride[0]) { - unsigned int base_im_h = 0; - int patch_height_end = eff_k_height + hs; - /// map the patch to a single line looping through channel - for (unsigned int c = 0; c < channel; ++c) { - for (int h = hs; h < patch_height_end; h += dilation[0]) { - if (h < 0 || in_height <= h) { - base_im_h += k_width; - continue; - } - - unsigned int im_w = base_im_w; - for (int ws = -pl; ws <= w_stride_end; ws += mstride[1]) { - unsigned int im_h = base_im_h; - int patch_width_end = eff_k_width + ws; + TensorDim({out_height * out_width, in.channel() * k_height * k_width}, + in.getTensorType())); + + auto apply_data = [&](T *out_data) { + int h_stride_end = height - eff_k_height - pt; + int w_stride_end = width - eff_k_width - pl; + + /// get a patch, size of kernel + /// hs is height_strided, ws is width_strided + unsigned int owidth = out.width(); + unsigned int base_im_w = 0; + for (int hs = -pt; hs <= h_stride_end; hs += mstride[0]) { + unsigned int base_im_h = 0; + int patch_height_end = eff_k_height + hs; + /// map the patch to a single line looping through channel + for (unsigned int c = 0; c < channel; ++c) { + for (int h = hs; h < patch_height_end; h += dilation[0]) { + if (h < 0 || in_height <= h) { + base_im_h += k_width; + continue; + } - for (int w = ws; w < patch_width_end; w += dilation[1]) { - if (w < 0 || in_width <= w) { + unsigned int im_w = base_im_w; + for (int ws = -pl; ws <= w_stride_end; ws += mstride[1]) { + unsigned int im_h = base_im_h; + int patch_width_end = eff_k_width + ws; + + for (int w = ws; w < patch_width_end; w += dilation[1]) { + if (w < 0 || in_width <= w) { + im_h++; + continue; + } + out_data[im_w * owidth + im_h] = in.getValue(0, c, h, w); im_h++; - continue; } - out_data[im_w * owidth + im_h] = in.getValue(0, c, h, w); - im_h++; + im_w++; } - im_w++; + base_im_h += k_width; } - base_im_h += k_width; } + base_im_w += out_width; } - base_im_w += out_width; + }; + + if (out.getDataType() == nntrainer::Tdatatype::FP32) { + float *out_data = out.getData(); + apply_data(out_data); + } +#ifdef ENABLE_FP16 + else if (out.getDataType() == nntrainer::Tdatatype::FP16) { + _FP16 *out_data = out.getData<_FP16>(); + apply_data(out_data); + } +#endif + else { + throw std::runtime_error("Not supported datatype"); } } @@ -279,9 +318,11 @@ void Conv2DLayer::finalize(InitLayerContext &context) { auto &dilation = std::get>(conv_props); - TensorDim kernel_dim = - TensorDim(filter_size, in_dim.channel(), kernel_size[0], kernel_size[1]); - TensorDim bias_dim = TensorDim(1, filter_size, 1, 1); + auto in_t_type = in_dim.getTensorType(); + in_t_type.data_type = context.getWeightDataType(); + TensorDim kernel_dim = TensorDim(filter_size, in_dim.channel(), + kernel_size[0], kernel_size[1], in_t_type); + TensorDim bias_dim = TensorDim(1, filter_size, 1, 1, in_t_type); padding = std::get(conv_props) .compute(in_dim, kernel_dim, {stride[0], stride[1]}, @@ -309,6 +350,7 @@ void Conv2DLayer::finalize(InitLayerContext &context) { out_dim.channel(filter_size); out_dim.height((eff_in_height - eff_k_height) / stride[0] + 1); out_dim.width((eff_in_width - eff_k_width) / stride[1] + 1); + out_dim.setTensorType(in_dim.getTensorType()); context.setOutputDimensions({out_dim}); NNTR_THROW_IF(eff_in_height < kernel_size[0] || eff_in_width < kernel_size[1], diff --git 
a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp index 5862e6af14..648eae30ee 100644 --- a/nntrainer/layers/layer_context.cpp +++ b/nntrainer/layers/layer_context.cpp @@ -157,6 +157,16 @@ Tensor &RunLayerContext::getWeight(unsigned int idx) const { return weights[idx]->getVariableRef(); } +/** + * @brief Get the Weight tensor object + * + * @param idx Identifier of the weight + * @return Tensor& Reference to the weight tensor + */ +Tensor *RunLayerContext::getWeightMaster(unsigned int idx) const { + return weights[idx]->getVariableMasterRef(); +} + /** * @brief Get the Weight Gradient tensor object * @@ -195,6 +205,18 @@ Tensor &RunLayerContext::getWeightOptVar(unsigned int idx, return weights[idx]->getOptimizerVariableRef(jdx); } +/** + * @brief Get the Weight Optimizer Variable tensor object + * + * @param idx Identifier of the weight + * @param jdx Identifier of the optimizer variables + * @return Tensor& Reference to the weight optimizer variable tensor + */ +Tensor &RunLayerContext::getWeightOptMasterVar(unsigned int idx, + unsigned int jdx) const { + return weights[idx]->getOptimizerMasterVariableRef(jdx); +} + /** * @brief Get the Number of Weight Optimizer Variable tensor object * @@ -205,6 +227,16 @@ unsigned int RunLayerContext::getNumWeightOptVar(unsigned int idx) const { return weights[idx]->getNumOptVariable(); } +/** + * @brief Get the Number of Weight Optimizer Variable tensor object + * + * @param idx Identifier of the weight + * @return int Number of the weight optimizer variable + */ +unsigned int RunLayerContext::getNumWeightOptMasterVar(unsigned int idx) const { + return weights[idx]->getNumOptMasterVariable(); +} + /** * @brief Get regularization loss for the weight * @@ -344,6 +376,25 @@ Tensor &RunLayerContext::getOutgoingDerivative(unsigned int idx) { return getInputGrad(idx); } +bool RunLayerContext::validateDerivatives() { + auto num_in = getNumInputs(); + auto num_out = getNumOutputs(); + + for (unsigned int i = 0; i < num_in; ++i) { + auto deriv = getIncomingDerivative(i); + if (deriv.checkDataValidation(false) == false) + return false; + } + + for (unsigned int i = 0; i < num_out; ++i) { + auto deriv = getOutgoingDerivative(i); + if (deriv.checkDataValidation(false) == false) + return false; + } + + return true; +} + /** * @brief Get the Tensor object * diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h index c68c42f11d..3e2e3d0339 100644 --- a/nntrainer/layers/layer_context.h +++ b/nntrainer/layers/layer_context.h @@ -474,6 +474,14 @@ class RunLayerContext { */ Tensor &getWeight(unsigned int idx) const; + /** + * @brief Get the Weight master tensor object + * + * @param idx Identifier of the weight + * @return Tensor& Reference to the weight tensor + */ + Tensor *getWeightMaster(unsigned int idx) const; + /** * @brief Get the Weight Gradient tensor object * @@ -501,6 +509,15 @@ class RunLayerContext { */ Tensor &getWeightOptVar(unsigned int idx, unsigned int jdx) const; + /** + * @brief Get the Weight Optimizer Master Variable tensor object + * + * @param idx Identifier of the weight + * @param jdx Identifier of the weight optimizer master variable + * @return Tensor& Reference to the weight optimizer tensor + */ + Tensor &getWeightOptMasterVar(unsigned int idx, unsigned int jdx) const; + /** * @brief Get the Weight name * @@ -611,6 +628,11 @@ class RunLayerContext { */ Tensor &getOutgoingDerivative(unsigned int idx); + /** + * @brief validate input/output derivatives of the layer + */ + bool 
validateDerivatives(); + /** * @brief Get the Tensor object * @@ -740,6 +762,14 @@ class RunLayerContext { */ unsigned int getNumWeightOptVar(unsigned int idx) const; + /** + * @brief Get the Number of Weight Optimizer Variable tensor object + * + * @param idx Identifier of the weight + * @return unsigned int Number of the weight optimizer variable + */ + unsigned int getNumWeightOptMasterVar(unsigned int idx) const; + /** * @brief Get the number of requested tensors objects * @@ -747,6 +777,14 @@ class RunLayerContext { */ unsigned int getNumTensors() const { return tensors.size(); } + /** + * @brief Set the Weight Optimizer Variable tensor object + * + * @param idx Identifier of the weight + * @param jdx Identifier of the weight optimizer variable + */ + void setWeightOptVars(unsigned int idx, std::vector opts); + /** * @brief Set the batch for the run context * diff --git a/nntrainer/layers/layer_devel.h b/nntrainer/layers/layer_devel.h index 54ce1a0ee9..44a87cc7e9 100644 --- a/nntrainer/layers/layer_devel.h +++ b/nntrainer/layers/layer_devel.h @@ -259,6 +259,11 @@ class Layer { * @return true if supports backwarding, else false */ virtual bool supportBackwarding() const = 0; + + /** + * @brief Set loss scale factor + */ + virtual void setLossScale(float scale) {} }; /// @todo Decide where to put and how to implement(#986) diff --git a/nntrainer/layers/layer_node.cpp b/nntrainer/layers/layer_node.cpp index f41752a4d8..6eb4b279de 100644 --- a/nntrainer/layers/layer_node.cpp +++ b/nntrainer/layers/layer_node.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -465,8 +466,12 @@ void LayerNode::read(std::ifstream &file, bool opt_var) { for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { if (run_context->isGradientLastAccess(i) && getTrainable()) { /// @note read optimizer variables + auto num_w_opt_m = run_context->getNumWeightOptMasterVar(i); for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i); ++j) { - run_context->getWeightOptVar(i, j).read(file); + if (num_w_opt_m > 0) + run_context->getWeightOptMasterVar(i, j).read(file); + else + run_context->getWeightOptVar(i, j).read(file); } } } @@ -474,7 +479,11 @@ void LayerNode::read(std::ifstream &file, bool opt_var) { for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { /// @note shared weights are only be read at the first acecss if (run_context->isGradientLastAccess(i)) { - run_context->getWeight(i).read(file); + auto w = run_context->getWeightMaster(i); + if (w) + w->read(file); + else + run_context->getWeight(i).read(file); } } } @@ -489,9 +498,13 @@ void LayerNode::save(std::ofstream &file, bool opt_var) const { if (run_context->isGradientLastAccess(i) && getTrainable()) { // @note save optimizer variables if (run_context->weightHasGradient(i)) { + auto num_w_opt_m = run_context->getNumWeightOptMasterVar(i); for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i); ++j) { - run_context->getWeightOptVar(i, j).save(file); + if (num_w_opt_m > 0) + run_context->getWeightOptMasterVar(i, j).save(file); + else + run_context->getWeightOptVar(i, j).save(file); } } } @@ -500,7 +513,13 @@ void LayerNode::save(std::ofstream &file, bool opt_var) const { // @note shared weights are only be saved at the first access for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { if (run_context->isGradientLastAccess(i)) { - run_context->getWeight(i).save(file); + if (run_context->getNumWeights()) { + auto w = run_context->getWeightMaster(i); + if (w) + w->save(file); + else + 
run_context->getWeight(i).save(file); + } } } } diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h index 3fd2d55b97..6be31f536e 100644 --- a/nntrainer/layers/layer_node.h +++ b/nntrainer/layers/layer_node.h @@ -900,6 +900,11 @@ class LayerNode final : public ml::train::Layer, public GraphNode { */ bool needsCalcGradient() { return needs_calc_gradient; } + /** + * @brief Set loss scale factor + */ + void setLossScale(float scale) { layer->setLossScale(scale); } + private: /** * @brief Get the Input Layers object diff --git a/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp b/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp index 60ea113418..feeff2b3d8 100644 --- a/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp +++ b/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp @@ -61,6 +61,9 @@ void CrossEntropySigmoidLossLayer::calcDerivative(RunLayerContext &context) { Tensor &y = context.getInput(SINGLE_INOUT_IDX); y.apply(ActiFunc::sigmoid, ret_derivative); + + applyLossScale(ret_derivative); + ret_derivative.subtract_i(y2); if (ret_derivative.divide_i(ret_derivative.size()) != ML_ERROR_NONE) { throw std::runtime_error("[CrossEntropySigmoidLossLayer::calcDerivative] " diff --git a/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp b/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp index 53854662ae..c181c60b9a 100644 --- a/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp +++ b/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp @@ -30,9 +30,14 @@ void CrossEntropySoftmaxLossLayer::forwarding(RunLayerContext &context, Tensor &y = context.getInput(SINGLE_INOUT_IDX); // fill the output - auto dataType = y.getDataType(); - if (dataType == ml::train::TensorDim::DataType::FP32) { - hidden_ = y.apply(ActiFunc::softmax, hidden_); + auto out_type = hidden_.getDataType(); + if (out_type == ml::train::TensorDim::DataType::FP32) { + if (y.getDataType() != out_type) { + Tensor y_ = y.clone(out_type); + hidden_ = y_.apply(ActiFunc::softmax, hidden_); + } else { + hidden_ = y.apply(ActiFunc::softmax, hidden_); + } if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); @@ -43,9 +48,14 @@ void CrossEntropySoftmaxLossLayer::forwarding(RunLayerContext &context, // update the loss value LossLayer::updateLoss(context, l); } - } else if (dataType == ml::train::TensorDim::DataType::FP16) { + } else if (out_type == ml::train::TensorDim::DataType::FP16) { #ifdef ENABLE_FP16 - hidden_ = y.apply(ActiFunc::softmax<_FP16>, hidden_); + if (y.getDataType() != out_type) { + Tensor y_ = y.clone(out_type); + hidden_ = y_.apply(ActiFunc::softmax<_FP16>, hidden_); + } else { + hidden_ = y.apply(ActiFunc::softmax<_FP16>, hidden_); + } if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); @@ -68,7 +78,8 @@ void CrossEntropySoftmaxLossLayer::calcDerivative(RunLayerContext &context) { Tensor &y = context.getInput(SINGLE_INOUT_IDX); auto dataType = y.getDataType(); - Tensor ret = Tensor("ret", y.getFormat(), y.getDataType()); + + Tensor ret(y.getDim()); if (dataType == ml::train::TensorDim::DataType::FP32) { y.apply(ActiFunc::softmax, ret); } else if (dataType == ml::train::TensorDim::DataType::FP16) { @@ -83,7 +94,18 @@ void CrossEntropySoftmaxLossLayer::calcDerivative(RunLayerContext &context) { /// operation // TODO: verify y and ret_derivative must not be same as loss layer is not // working in-place - ret.subtract(y2, ret_derivative); + if 
(ret.getDataType() != y2.getDataType()) { + ret.subtract(y2.clone(ret.getDataType()), ret_derivative); + } else { + ret.subtract(y2, ret_derivative); + } + + /** + * loss scale is applied for mixed precision + * every loss layers need to specify this applying code. + */ + applyLossScale(ret_derivative); + if (ret_derivative.divide_i(ret.batch()) != ML_ERROR_NONE) { throw std::runtime_error("[CrossEntropySoftmaxLossLayer::calcDerivative] " "Error when calculating loss"); diff --git a/nntrainer/layers/loss/loss_layer.cpp b/nntrainer/layers/loss/loss_layer.cpp index ab2ccf8be2..422037b9e9 100644 --- a/nntrainer/layers/loss/loss_layer.cpp +++ b/nntrainer/layers/loss/loss_layer.cpp @@ -15,6 +15,9 @@ #include namespace nntrainer { + +LossLayer::LossLayer() : Layer(), loss_scale(0.0f) {} + void LossLayer::finalize(InitLayerContext &context) { std::vector input_dim = context.getInputDimensions(); std::vector output_dim = input_dim; diff --git a/nntrainer/layers/loss/loss_layer.h b/nntrainer/layers/loss/loss_layer.h index 581e9477a8..84a1112864 100644 --- a/nntrainer/layers/loss/loss_layer.h +++ b/nntrainer/layers/loss/loss_layer.h @@ -27,6 +27,11 @@ namespace nntrainer { */ class LossLayer : public Layer { public: + /** + * @brief Constructor of Loss Layer + */ + LossLayer(); + /** * @brief Destructor of Loss Layer */ @@ -47,11 +52,19 @@ class LossLayer : public Layer { */ virtual bool supportBackwarding() const override { return true; } + /** + * @brief Set loss scale factor + */ + virtual void setLossScale(float scale) override { loss_scale = scale; } + +private: /** * @copydoc Layer::requireLabel() */ bool requireLabel() const override { return true; } + float loss_scale; /**< loss scale factor */ + protected: /** * @brief update loss diff --git a/nntrainer/layers/loss/meson.build b/nntrainer/layers/loss/meson.build index 9fccd0290d..8ec9928101 100644 --- a/nntrainer/layers/loss/meson.build +++ b/nntrainer/layers/loss/meson.build @@ -7,7 +7,9 @@ loss_layer_sources = [ 'constant_derivative_loss_layer.cpp' ] -loss_layer_headers = [] +loss_layer_headers = [ + 'loss_layer.h' +] loss_layer_deps = [] diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp index ed4390655d..e6b6f068f6 100644 --- a/nntrainer/layers/loss/mse_loss_layer.cpp +++ b/nntrainer/layers/loss/mse_loss_layer.cpp @@ -11,6 +11,7 @@ * */ +#include "tensor.h" #include #include @@ -32,21 +33,39 @@ void MSELossLayer::forwarding(RunLayerContext &context, bool training) { .clone(ml::train::TensorDim::DataType::FP32); // hidden_ <- y2 - y; - if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { - Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); - y2.subtract(y, hidden_); - - /** calculate sum of squares normalized by size */ - float l2norm = hidden_.l2norm(); - l2norm *= l2norm / hidden_.size(); - - /** wrap in tensor for update loss */ - Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm); - LossLayer::updateLoss(context, l); + auto out_type = hidden_.getDataType(); + if (out_type != y_.getDataType()) { + Tensor y = y_.clone(out_type); + if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { + Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); + y2.subtract(y, hidden_); + + /** calculate sum of squares normalized by size */ + float l2norm = hidden_.l2norm(); + l2norm *= l2norm / hidden_.size(); + + /** wrap in tensor for update loss */ + Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm); + LossLayer::updateLoss(context, l); + } + // fill the output + hidden_.fill(y); + } else { + if 
(context.isLabelAvailable(SINGLE_INOUT_IDX)) { + Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); + y2.subtract(y_, hidden_); + + /** calculate sum of squares normalized by size */ + float l2norm = hidden_.l2norm(); + l2norm *= l2norm / hidden_.size(); + + /** wrap in tensor for update loss */ + Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm); + LossLayer::updateLoss(context, l); + } + // fill the output + hidden_.fill(y_); } - - // fill the output - hidden_.fill(y); } void MSELossLayer::calcDerivative(RunLayerContext &context) { @@ -73,9 +92,33 @@ void MSELossLayer::calcDerivative(RunLayerContext &context) { const Tensor &y2 = context.getIncomingDerivative(SINGLE_INOUT_IDX); - y.subtract(y2, ret_derivative); + const auto &in_type = y.getDataType(); + if (in_type != y2.getDataType()) { + Tensor y2_ = y2.clone(in_type); + y.subtract(y2_, ret_derivative); + } else { + y.subtract(y2, ret_derivative); + } + + applyLossScale(ret_derivative); + float divider = ((float)y.size()) / 2; - if (ret_derivative.divide_i(divider) != ML_ERROR_NONE) { + + /** + * ret_derivative may be eliminated by big divider with fp16 calculation. + * So, it calcuated with larger precision. + */ + int ret; + if (ret_derivative.getDataType() != ml::train::TensorDim::DataType::FP32) { + Tensor ret_derivative_ = + ret_derivative.clone(ml::train::TensorDim::DataType::FP32); + ret = ret_derivative_.divide_i(divider); + ret_derivative.copyData(ret_derivative_); + } else { + ret = ret_derivative.divide_i(divider); + } + + if (ret != ML_ERROR_NONE) { throw std::runtime_error( "[MSELossLayer::calcDerivative] Error when calculating loss"); } diff --git a/nntrainer/layers/lstm.cpp b/nntrainer/layers/lstm.cpp index d5f13a1fc5..be313a0aca 100644 --- a/nntrainer/layers/lstm.cpp +++ b/nntrainer/layers/lstm.cpp @@ -509,21 +509,27 @@ void LSTMLayer::finalize(InitLayerContext &context) { } // hidden_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + hidden_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::hidden_state] = context.requestTensor( hidden_state_dim, "hidden_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // cell_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim cell_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim cell_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + cell_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::cell_state] = context.requestTensor( cell_state_dim, "cell_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // ifgo_dim : [ batch_size, 1, max_timestep, NUM_GATE * unit ] - const TensorDim ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, - weight_tensor_type); + TensorDim ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, + weight_tensor_type); + ifgo_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::ifgo] = context.requestTensor(ifgo_dim, "ifgo", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); @@ -576,21 +582,27 @@ void LSTMLayer::finalize(InitLayerContext &context) { } // reverse_hidden_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim reverse_hidden_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim reverse_hidden_state_dim(batch_size, 1, 
max_timestep, unit, + weight_tensor_type); + reverse_hidden_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::reverse_hidden_state] = context.requestTensor( reverse_hidden_state_dim, "reverse_hidden_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // reverse_cell_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim reverse_cell_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim reverse_cell_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + reverse_cell_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::reverse_cell_state] = context.requestTensor( reverse_cell_state_dim, "reverse_cell_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // reverse_ifgo_dim : [ batch_size, 1, max_timestep, NUM_GATE * unit ] - const TensorDim reverse_ifgo_dim(batch_size, 1, max_timestep, - NUM_GATE * unit, weight_tensor_type); + TensorDim reverse_ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, + weight_tensor_type); + reverse_ifgo_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::reverse_ifgo] = context.requestTensor( reverse_ifgo_dim, "reverse_ifgo", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); @@ -598,8 +610,10 @@ void LSTMLayer::finalize(InitLayerContext &context) { if (dropout_rate > epsilon) { // dropout_mask_dim = [ batch, 1, time_iteration, unit ] - const TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + dropout_mask_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::dropout_mask] = context.requestTensor( dropout_mask_dim, "dropout_mask", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); diff --git a/nntrainer/layers/lstm.h b/nntrainer/layers/lstm.h index f35fdf8815..a9b2cac7d7 100644 --- a/nntrainer/layers/lstm.h +++ b/nntrainer/layers/lstm.h @@ -99,7 +99,6 @@ class LSTMLayer : public LSTMCore { inline static const std::string type = "lstm"; -private: static constexpr unsigned int NUM_GATE = 4; /** common properties like Unit, IntegrateBias, HiddenStateActivation and diff --git a/nntrainer/layers/pooling2d_layer.cpp b/nntrainer/layers/pooling2d_layer.cpp index a68e42e8d0..b53ca354f2 100644 --- a/nntrainer/layers/pooling2d_layer.cpp +++ b/nntrainer/layers/pooling2d_layer.cpp @@ -6,6 +6,7 @@ * @date 12 June 2020 * @see https://github.com/nnstreamer/nntrainer * @author Jijoong Moon + * @author Jiho Chu * @bug No known bugs except for NYI items * @brief This is 2 Dimensional Pooling Layer Class for Neural Network * @@ -26,6 +27,13 @@ namespace nntrainer { static constexpr size_t SINGLE_INOUT_IDX = 0; +/** + * @brief help function for Pooling handler + */ +template struct PoolFunc { + typedef std::function Type; +}; + Pooling2DLayer::Pooling2DLayer( const std::array &padding_) : Layer(), @@ -96,6 +104,7 @@ void Pooling2DLayer::finalize(InitLayerContext &context) { out_dim.channel(in_dim.channel()); out_dim.height((eff_in_height - pool_size[0]) / stride[0] + 1); out_dim.width((eff_in_width - pool_size[1]) / stride[1] + 1); + out_dim.setDataType(in_dim.getDataType()); context.setOutputDimensions({out_dim}); /** @@ -111,13 +120,17 @@ void Pooling2DLayer::finalize(InitLayerContext &context) { * // clang-format on */ if (pooling_type == props::PoolingTypeInfo::Enum::global_max) { + auto helper_dim = in_dim; + 
helper_dim.setDataType(ml::train::TensorDim::DataType::FP32); pool_helper_idx = - context.requestTensor(in_dim, "helper_idx", Tensor::Initializer::NONE, + context.requestTensor(helper_dim, "helper_idx", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); - pool_helper_size.resize(in_dim.batch() * in_dim.channel()); + pool_helper_size.resize(helper_dim.batch() * helper_dim.channel()); } else { + auto helper_dim = out_dim; + helper_dim.setDataType(ml::train::TensorDim::DataType::FP32); pool_helper_idx = - context.requestTensor(out_dim, "helper_idx", Tensor::Initializer::NONE, + context.requestTensor(helper_dim, "helper_idx", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); } } @@ -172,15 +185,13 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { unsigned int J, K; result.setZero(); - float *result_data = result.getData(); unsigned int out_map_size = deriv.height() * deriv.width(); unsigned int in_map_size = height * width; - switch (pooling_type) { - case props::PoolingTypeInfo::Enum::max: { + auto apply_max = [&](T *result_data) { const int *iter = pool_helper.getData(); - const float *deriv_data = deriv.getData(); + const T *deriv_data = deriv.getData(); for (unsigned int b = 0; b < batch; ++b) { for (unsigned int c = 0; c < channel; ++c) { for (unsigned int i = 0; i < out_map_size; ++i) { @@ -195,9 +206,9 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { result_data += in_map_size; } } - } break; - case props::PoolingTypeInfo::Enum::global_average: - case props::PoolingTypeInfo::Enum::average: { + }; + + auto apply_average = [&](T *result_data) { int height_stride_end = height - p_height + pt; int width_stride_end = width - p_width + pl; const int *iter = pool_helper.getData(); @@ -207,7 +218,7 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { for (int j = -pt; j <= height_stride_end; j += stride[0]) { K = 0; for (int k = -pl; k <= width_stride_end; k += stride[1]) { - float del = deriv.getValue(b, i, J, K) / *iter; + T del = deriv.getValue(b, i, J, K) / *iter; int patch_height_end = std::min(static_cast(j + p_height), height); int patch_width_end = @@ -217,7 +228,7 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { for (int h = start_h; h < patch_height_end; ++h) { for (int w = start_w; w < patch_width_end; ++w) { result.setValue(b, i, h, w, - result.getValue(b, i, h, w) + del); + result.getValue(b, i, h, w) + del); } } iter++; @@ -227,15 +238,16 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { } } } - } break; - case props::PoolingTypeInfo::Enum::global_max: { - const float *deriv_data = deriv.getData(); + }; + + auto apply_global_max = [&](T *result_data) { + const T *deriv_data = deriv.getData(); for (unsigned int b = 0; b < batch; b++) { for (unsigned int c = 0; c < channel; c++) { const int *iter = pool_helper.getData() + pool_helper.getIndex(b, c, 0, 0); unsigned int helper_size = pool_helper_size[b * channel + c]; - float der = *deriv_data / helper_size; + T der = *deriv_data / static_cast(helper_size); for (unsigned int idx = 0; idx < helper_size; idx++) result_data[iter[idx]] += der; @@ -244,7 +256,40 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { result_data += in_map_size; } } - } break; + }; + + switch (pooling_type) { + case props::PoolingTypeInfo::Enum::max: + if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP32) + apply_max(result.getData()); +#ifdef ENABLE_FP16 + else if (in_dim.getDataType() == 
ml::train::TensorDim::DataType::FP16) + apply_max(result.getData<_FP16>()); +#endif + else + throw std::runtime_error("Not supported datatype"); + break; + case props::PoolingTypeInfo::Enum::global_average: + case props::PoolingTypeInfo::Enum::average: + if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP32) + apply_average(result.getData()); +#ifdef ENABLE_FP16 + else if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP16) + apply_average(result.getData<_FP16>()); +#endif + else + throw std::runtime_error("Not supported datatype"); + break; + case props::PoolingTypeInfo::Enum::global_max: + if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP32) + apply_global_max(result.getData()); +#ifdef ENABLE_FP16 + else if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP16) + apply_global_max(result.getData<_FP16>()); +#endif + else + throw std::runtime_error("Not supported datatype"); + break; default: throw std::runtime_error("Error: Unknown Pooling Type"); } @@ -290,124 +335,167 @@ void Pooling2DLayer::pooling2d(Tensor &in, bool training, Tensor &output, * @param start_w (width index pointing the start of the patch) * @return result value of pooling */ - std::function pool_fn; + PoolFunc::Type pool_fn_fp32; +#ifdef ENABLE_FP16 + PoolFunc<_FP16>::Type pool_fn_fp16; +#endif unsigned int max_idx_count = 0; - switch (pooling_type) { - case props::PoolingTypeInfo::Enum::max: { - pool_fn = [&](const float *in_data, int channel_idx, int start_h, - int start_w) { - int end_h = start_h + patch_height; - int end_w = start_w + patch_width; - - float max_val = std::numeric_limits::lowest(); - - int cur_max_idx = -1; - int eff_end_h = std::min(end_h, in_height); - int eff_end_w = std::min(end_w, in_width); - start_w = std::max(0, start_w); - for (int h = std::max(0, start_h); h < eff_end_h; ++h) { - for (int w = start_w; w < eff_end_w; ++w) { - int cur_idx = h * in_width + w; - float val = in_data[cur_idx]; - if (max_val < val) { - max_val = val; - if (training) { - cur_max_idx = cur_idx; - } + + auto pool_fn_max = [&](const T *in_data, int channel_idx, + int start_h, int start_w) { + int end_h = start_h + patch_height; + int end_w = start_w + patch_width; + + T max_val = std::numeric_limits::lowest(); + + int cur_max_idx = -1; + int eff_end_h = std::min(end_h, in_height); + int eff_end_w = std::min(end_w, in_width); + start_w = std::max(0, start_w); + for (int h = std::max(0, start_h); h < eff_end_h; ++h) { + for (int w = start_w; w < eff_end_w; ++w) { + int cur_idx = h * in_width + w; + T val = in_data[cur_idx]; + if (max_val < val) { + max_val = val; + if (training) { + cur_max_idx = cur_idx; } } } + } - if (training) { - pool_helper.setValueInt(max_idx_count++, cur_max_idx); - } + if (training) { + pool_helper.setValueInt(max_idx_count++, cur_max_idx); + } - return max_val; - }; - break; - } - case props::PoolingTypeInfo::Enum::global_max: { - pool_fn = [&, this](const float *in_data, int channel_idx, int start_h, - int start_w) { - int end_h = start_h + patch_height; - int end_w = start_w + patch_width; - - float max_val = std::numeric_limits::lowest(); - int *helper_data = pool_helper.getData(); - helper_data += channel_idx * in_height * in_width; - - for (int h = start_h; h < end_h; ++h) { - for (int w = start_w; w < end_w; ++w) { - int cur_idx = h * in_width + w; - float val = in_data[cur_idx]; - if (max_val < val) { - max_val = val; - max_idx_count = 0; - } + return max_val; + }; - if (training && max_val == val) { - *(helper_data + max_idx_count++) = 
cur_idx; - } + auto pool_fn_global_max = [&, this](const T *in_data, + int channel_idx, int start_h, + int start_w) { + int end_h = start_h + patch_height; + int end_w = start_w + patch_width; + + T max_val = std::numeric_limits::lowest(); + int *helper_data = pool_helper.getData(); + helper_data += channel_idx * in_height * in_width; + + for (int h = start_h; h < end_h; ++h) { + for (int w = start_w; w < end_w; ++w) { + int cur_idx = h * in_width + w; + T val = in_data[cur_idx]; + if (max_val < val) { + max_val = val; + max_idx_count = 0; } - } - pool_helper_size[batch_idx * in.channel() + channel_idx] = max_idx_count; - return max_val; - }; - break; - } - case props::PoolingTypeInfo::Enum::global_average: - case props::PoolingTypeInfo::Enum::average: { - pool_fn = [&](const float *in_data, int channel_idx, int start_h, - int start_w) { - int end_h = start_h + patch_height; - int end_w = start_w + patch_width; - float total = 0.0f; - - int eff_end_h = std::min(end_h, in_height); - int eff_end_w = std::min(end_w, in_width); - int eff_start_h = std::max(0, start_h); - int eff_start_w = std::max(0, start_w); - - int cnt = (eff_end_h - eff_start_h) * (eff_end_w - eff_start_w); - for (int h = eff_start_h; h < eff_end_h; ++h) { - for (int w = eff_start_w; w < eff_end_w; ++w) { - float val = in_data[h * in_width + w]; - total += val; + if (training && max_val == val) { + *(helper_data + max_idx_count++) = cur_idx; } } + } - if (training) { - pool_helper.setValueInt(max_idx_count++, cnt); + pool_helper_size[batch_idx * in.channel() + channel_idx] = max_idx_count; + return max_val; + }; + + auto pool_fn_average = [&](const T *in_data, int channel_idx, + int start_h, int start_w) { + int end_h = start_h + patch_height; + int end_w = start_w + patch_width; + T total = static_cast(0.0f); + + int eff_end_h = std::min(end_h, in_height); + int eff_end_w = std::min(end_w, in_width); + int eff_start_h = std::max(0, start_h); + int eff_start_w = std::max(0, start_w); + + int cnt = (eff_end_h - eff_start_h) * (eff_end_w - eff_start_w); + for (int h = eff_start_h; h < eff_end_h; ++h) { + for (int w = eff_start_w; w < eff_end_w; ++w) { + T val = in_data[h * in_width + w]; + total += val; } - return total / cnt; - }; + } + + if (training) { + pool_helper.setValueInt(max_idx_count++, cnt); + } + return total / cnt; + }; + + switch (pooling_type) { + case props::PoolingTypeInfo::Enum::max: + pool_fn_fp32 = pool_fn_max; +#ifdef ENABLE_FP16 + pool_fn_fp16 = pool_fn_max; +#endif + break; + case props::PoolingTypeInfo::Enum::global_max: + pool_fn_fp32 = pool_fn_global_max; +#ifdef ENABLE_FP16 + pool_fn_fp16 = pool_fn_global_max; +#endif + break; + case props::PoolingTypeInfo::Enum::global_average: + case props::PoolingTypeInfo::Enum::average: + pool_fn_fp32 = pool_fn_average; +#ifdef ENABLE_FP16 + pool_fn_fp16 = pool_fn_average; +#endif break; - } case props::PoolingTypeInfo::Enum::unknown: default: throw std::invalid_argument("unknown pooling type given"); break; } - const float *in_data = in.getData(); - float *out_data = output.getData(); - - unsigned int map_size = in_height * in_width; - - int height_stride_end = height - patch_height - pt; - int width_stride_end = width - patch_width - pl; - for (unsigned int i = 0; i < channel; ++i) { - const float *in_data_channel_sliced = in_data + i * map_size; - for (int j = -pt; j <= height_stride_end; j += stride[0]) { - for (int k = -pl; k <= width_stride_end; k += stride[1]) { - float pool_value = pool_fn(in_data_channel_sliced, i, j, k); - *out_data = pool_value; - 
out_data++; + if (in.getDataType() == ml::train::TensorDim::DataType::FP32) { + const float *in_data = in.getData(); + float *out_data = output.getData(); + + unsigned int map_size = in_height * in_width; + + int height_stride_end = height - patch_height - pt; + int width_stride_end = width - patch_width - pl; + for (unsigned int i = 0; i < channel; ++i) { + const float *in_data_channel_sliced = in_data + i * map_size; + for (int j = -pt; j <= height_stride_end; j += stride[0]) { + for (int k = -pl; k <= width_stride_end; k += stride[1]) { + float pool_value = pool_fn_fp32(in_data_channel_sliced, i, j, k); + *out_data = pool_value; + out_data++; + } + } + } + } +#ifdef ENABLE_FP16 + else if (in.getDataType() == ml::train::TensorDim::DataType::FP16) { + const _FP16 *in_data = in.getData<_FP16>(); + _FP16 *out_data = output.getData<_FP16>(); + + unsigned int map_size = in_height * in_width; + + int height_stride_end = height - patch_height - pt; + int width_stride_end = width - patch_width - pl; + for (unsigned int i = 0; i < channel; ++i) { + const _FP16 *in_data_channel_sliced = in_data + i * map_size; + for (int j = -pt; j <= height_stride_end; j += stride[0]) { + for (int k = -pl; k <= width_stride_end; k += stride[1]) { + _FP16 pool_value = pool_fn_fp16(in_data_channel_sliced, i, j, k); + *out_data = pool_value; + out_data++; + } } } } +#endif + else { + throw std::runtime_error("Not supported datatype"); + } } void Pooling2DLayer::setBatch(RunLayerContext &context, unsigned int batch) { diff --git a/nntrainer/layers/reshape_layer.cpp b/nntrainer/layers/reshape_layer.cpp index 0f82d84f3a..07564b3970 100644 --- a/nntrainer/layers/reshape_layer.cpp +++ b/nntrainer/layers/reshape_layer.cpp @@ -42,6 +42,7 @@ void ReshapeLayer::finalize(InitLayerContext &context) { } out_dim.batch(in_dim.batch()); + out_dim.setDataType(in_dim.getDataType()); context.setOutputDimensions({out_dim}); } From 40a523378b7a83239ac90c337c0319a181558440 Mon Sep 17 00:00:00 2001 From: Donghak PARK Date: Tue, 7 May 2024 16:27:54 +0900 Subject: [PATCH 13/14] [Mixed] Mixed Precision Layer update This PR is to update the mixed precision layer. - integrate #2568 & #2455 - will update more test **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. 
Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: Donghak PARK --- nntrainer/layers/layer_context.cpp | 51 ------------- nntrainer/layers/layer_context.h | 38 ---------- nntrainer/layers/layer_node.cpp | 27 +------ nntrainer/layers/layer_node.h | 5 -- .../loss/cross_entropy_sigmoid_loss_layer.cpp | 3 - .../loss/cross_entropy_softmax_loss_layer.cpp | 36 ++------- nntrainer/layers/loss/loss_layer.cpp | 3 - nntrainer/layers/loss/loss_layer.h | 13 ---- nntrainer/layers/loss/meson.build | 4 +- nntrainer/layers/loss/mse_loss_layer.cpp | 75 ++++--------------- nntrainer/optimizers/adam.cpp | 56 +++----------- nntrainer/optimizers/optimizer_context.cpp | 15 ---- nntrainer/optimizers/optimizer_context.h | 15 ---- 13 files changed, 39 insertions(+), 302 deletions(-) diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp index 648eae30ee..5862e6af14 100644 --- a/nntrainer/layers/layer_context.cpp +++ b/nntrainer/layers/layer_context.cpp @@ -157,16 +157,6 @@ Tensor &RunLayerContext::getWeight(unsigned int idx) const { return weights[idx]->getVariableRef(); } -/** - * @brief Get the Weight tensor object - * - * @param idx Identifier of the weight - * @return Tensor& Reference to the weight tensor - */ -Tensor *RunLayerContext::getWeightMaster(unsigned int idx) const { - return weights[idx]->getVariableMasterRef(); -} - /** * @brief Get the Weight Gradient tensor object * @@ -205,18 +195,6 @@ Tensor &RunLayerContext::getWeightOptVar(unsigned int idx, return weights[idx]->getOptimizerVariableRef(jdx); } -/** - * @brief Get the Weight Optimizer Variable tensor object - * - * @param idx Identifier of the weight - * @param jdx Identifier of the optimizer variables - * @return Tensor& Reference to the weight optimizer variable tensor - */ -Tensor &RunLayerContext::getWeightOptMasterVar(unsigned int idx, - unsigned int jdx) const { - return weights[idx]->getOptimizerMasterVariableRef(jdx); -} - /** * @brief Get the Number of Weight Optimizer Variable tensor object * @@ -227,16 +205,6 @@ unsigned int RunLayerContext::getNumWeightOptVar(unsigned int idx) const { return weights[idx]->getNumOptVariable(); } -/** - * @brief Get the Number of Weight Optimizer Variable tensor object - * - * @param idx Identifier of the weight - * @return int Number of the weight optimizer variable - */ -unsigned int RunLayerContext::getNumWeightOptMasterVar(unsigned int idx) const { - return weights[idx]->getNumOptMasterVariable(); -} - /** * @brief Get regularization loss for the weight * @@ -376,25 +344,6 @@ Tensor &RunLayerContext::getOutgoingDerivative(unsigned int idx) { return getInputGrad(idx); } -bool RunLayerContext::validateDerivatives() { - auto num_in = getNumInputs(); - auto num_out = getNumOutputs(); - - for (unsigned int i = 0; i < num_in; ++i) { - auto deriv = getIncomingDerivative(i); - if (deriv.checkDataValidation(false) == false) - return false; - } - - for (unsigned int i = 0; i < num_out; ++i) { - auto deriv = getOutgoingDerivative(i); - if (deriv.checkDataValidation(false) == false) - return false; - } - - return true; -} - /** * @brief Get the Tensor object * diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h index 3e2e3d0339..c68c42f11d 100644 --- a/nntrainer/layers/layer_context.h +++ b/nntrainer/layers/layer_context.h @@ -474,14 +474,6 @@ class RunLayerContext { */ Tensor &getWeight(unsigned int idx) const; - /** - * @brief Get the Weight master tensor object - * - * @param idx Identifier of the weight - * @return Tensor& Reference to the 
weight tensor - */ - Tensor *getWeightMaster(unsigned int idx) const; - /** * @brief Get the Weight Gradient tensor object * @@ -509,15 +501,6 @@ class RunLayerContext { */ Tensor &getWeightOptVar(unsigned int idx, unsigned int jdx) const; - /** - * @brief Get the Weight Optimizer Master Variable tensor object - * - * @param idx Identifier of the weight - * @param jdx Identifier of the weight optimizer master variable - * @return Tensor& Reference to the weight optimizer tensor - */ - Tensor &getWeightOptMasterVar(unsigned int idx, unsigned int jdx) const; - /** * @brief Get the Weight name * @@ -628,11 +611,6 @@ class RunLayerContext { */ Tensor &getOutgoingDerivative(unsigned int idx); - /** - * @brief validate input/output derivatives of the layer - */ - bool validateDerivatives(); - /** * @brief Get the Tensor object * @@ -762,14 +740,6 @@ class RunLayerContext { */ unsigned int getNumWeightOptVar(unsigned int idx) const; - /** - * @brief Get the Number of Weight Optimizer Variable tensor object - * - * @param idx Identifier of the weight - * @return unsigned int Number of the weight optimizer variable - */ - unsigned int getNumWeightOptMasterVar(unsigned int idx) const; - /** * @brief Get the number of requested tensors objects * @@ -777,14 +747,6 @@ class RunLayerContext { */ unsigned int getNumTensors() const { return tensors.size(); } - /** - * @brief Set the Weight Optimizer Variable tensor object - * - * @param idx Identifier of the weight - * @param jdx Identifier of the weight optimizer variable - */ - void setWeightOptVars(unsigned int idx, std::vector opts); - /** * @brief Set the batch for the run context * diff --git a/nntrainer/layers/layer_node.cpp b/nntrainer/layers/layer_node.cpp index 6eb4b279de..f41752a4d8 100644 --- a/nntrainer/layers/layer_node.cpp +++ b/nntrainer/layers/layer_node.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -466,12 +465,8 @@ void LayerNode::read(std::ifstream &file, bool opt_var) { for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { if (run_context->isGradientLastAccess(i) && getTrainable()) { /// @note read optimizer variables - auto num_w_opt_m = run_context->getNumWeightOptMasterVar(i); for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i); ++j) { - if (num_w_opt_m > 0) - run_context->getWeightOptMasterVar(i, j).read(file); - else - run_context->getWeightOptVar(i, j).read(file); + run_context->getWeightOptVar(i, j).read(file); } } } @@ -479,11 +474,7 @@ void LayerNode::read(std::ifstream &file, bool opt_var) { for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { /// @note shared weights are only be read at the first acecss if (run_context->isGradientLastAccess(i)) { - auto w = run_context->getWeightMaster(i); - if (w) - w->read(file); - else - run_context->getWeight(i).read(file); + run_context->getWeight(i).read(file); } } } @@ -498,13 +489,9 @@ void LayerNode::save(std::ofstream &file, bool opt_var) const { if (run_context->isGradientLastAccess(i) && getTrainable()) { // @note save optimizer variables if (run_context->weightHasGradient(i)) { - auto num_w_opt_m = run_context->getNumWeightOptMasterVar(i); for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i); ++j) { - if (num_w_opt_m > 0) - run_context->getWeightOptMasterVar(i, j).save(file); - else - run_context->getWeightOptVar(i, j).save(file); + run_context->getWeightOptVar(i, j).save(file); } } } @@ -513,13 +500,7 @@ void LayerNode::save(std::ofstream &file, bool opt_var) const { // @note shared weights are 
only be saved at the first access for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { if (run_context->isGradientLastAccess(i)) { - if (run_context->getNumWeights()) { - auto w = run_context->getWeightMaster(i); - if (w) - w->save(file); - else - run_context->getWeight(i).save(file); - } + run_context->getWeight(i).save(file); } } } diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h index 6be31f536e..3fd2d55b97 100644 --- a/nntrainer/layers/layer_node.h +++ b/nntrainer/layers/layer_node.h @@ -900,11 +900,6 @@ class LayerNode final : public ml::train::Layer, public GraphNode { */ bool needsCalcGradient() { return needs_calc_gradient; } - /** - * @brief Set loss scale factor - */ - void setLossScale(float scale) { layer->setLossScale(scale); } - private: /** * @brief Get the Input Layers object diff --git a/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp b/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp index feeff2b3d8..60ea113418 100644 --- a/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp +++ b/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp @@ -61,9 +61,6 @@ void CrossEntropySigmoidLossLayer::calcDerivative(RunLayerContext &context) { Tensor &y = context.getInput(SINGLE_INOUT_IDX); y.apply(ActiFunc::sigmoid, ret_derivative); - - applyLossScale(ret_derivative); - ret_derivative.subtract_i(y2); if (ret_derivative.divide_i(ret_derivative.size()) != ML_ERROR_NONE) { throw std::runtime_error("[CrossEntropySigmoidLossLayer::calcDerivative] " diff --git a/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp b/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp index c181c60b9a..53854662ae 100644 --- a/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp +++ b/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp @@ -30,14 +30,9 @@ void CrossEntropySoftmaxLossLayer::forwarding(RunLayerContext &context, Tensor &y = context.getInput(SINGLE_INOUT_IDX); // fill the output - auto out_type = hidden_.getDataType(); - if (out_type == ml::train::TensorDim::DataType::FP32) { - if (y.getDataType() != out_type) { - Tensor y_ = y.clone(out_type); - hidden_ = y_.apply(ActiFunc::softmax, hidden_); - } else { - hidden_ = y.apply(ActiFunc::softmax, hidden_); - } + auto dataType = y.getDataType(); + if (dataType == ml::train::TensorDim::DataType::FP32) { + hidden_ = y.apply(ActiFunc::softmax, hidden_); if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); @@ -48,14 +43,9 @@ void CrossEntropySoftmaxLossLayer::forwarding(RunLayerContext &context, // update the loss value LossLayer::updateLoss(context, l); } - } else if (out_type == ml::train::TensorDim::DataType::FP16) { + } else if (dataType == ml::train::TensorDim::DataType::FP16) { #ifdef ENABLE_FP16 - if (y.getDataType() != out_type) { - Tensor y_ = y.clone(out_type); - hidden_ = y_.apply(ActiFunc::softmax<_FP16>, hidden_); - } else { - hidden_ = y.apply(ActiFunc::softmax<_FP16>, hidden_); - } + hidden_ = y.apply(ActiFunc::softmax<_FP16>, hidden_); if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); @@ -78,8 +68,7 @@ void CrossEntropySoftmaxLossLayer::calcDerivative(RunLayerContext &context) { Tensor &y = context.getInput(SINGLE_INOUT_IDX); auto dataType = y.getDataType(); - - Tensor ret(y.getDim()); + Tensor ret = Tensor("ret", y.getFormat(), y.getDataType()); if (dataType == ml::train::TensorDim::DataType::FP32) { y.apply(ActiFunc::softmax, ret); } else if 
(dataType == ml::train::TensorDim::DataType::FP16) {
@@ -94,18 +83,7 @@ void CrossEntropySoftmaxLossLayer::calcDerivative(RunLayerContext &context) {
   /// operation
   // TODO: verify y and ret_derivative must not be same as loss layer is not
   // working in-place
-  if (ret.getDataType() != y2.getDataType()) {
-    ret.subtract(y2.clone(ret.getDataType()), ret_derivative);
-  } else {
-    ret.subtract(y2, ret_derivative);
-  }
-
-  /**
-   * loss scale is applied for mixed precision
-   * every loss layers need to specify this applying code.
-   */
-  applyLossScale(ret_derivative);
-
+  ret.subtract(y2, ret_derivative);
   if (ret_derivative.divide_i(ret.batch()) != ML_ERROR_NONE) {
     throw std::runtime_error("[CrossEntropySoftmaxLossLayer::calcDerivative] "
                              "Error when calculating loss");
diff --git a/nntrainer/layers/loss/loss_layer.cpp b/nntrainer/layers/loss/loss_layer.cpp
index 422037b9e9..ab2ccf8be2 100644
--- a/nntrainer/layers/loss/loss_layer.cpp
+++ b/nntrainer/layers/loss/loss_layer.cpp
@@ -15,9 +15,6 @@
 #include
 
 namespace nntrainer {
-
-LossLayer::LossLayer() : Layer(), loss_scale(0.0f) {}
-
 void LossLayer::finalize(InitLayerContext &context) {
   std::vector input_dim = context.getInputDimensions();
   std::vector output_dim = input_dim;
diff --git a/nntrainer/layers/loss/loss_layer.h b/nntrainer/layers/loss/loss_layer.h
index 84a1112864..581e9477a8 100644
--- a/nntrainer/layers/loss/loss_layer.h
+++ b/nntrainer/layers/loss/loss_layer.h
@@ -27,11 +27,6 @@ namespace nntrainer {
  */
 class LossLayer : public Layer {
 public:
-  /**
-   * @brief Constructor of Loss Layer
-   */
-  LossLayer();
-
   /**
    * @brief Destructor of Loss Layer
   */
@@ -52,19 +47,11 @@ class LossLayer : public Layer {
    */
   virtual bool supportBackwarding() const override { return true; }
 
-  /**
-   * @brief Set loss scale factor
-   */
-  virtual void setLossScale(float scale) override { loss_scale = scale; }
-
-private:
   /**
    * @copydoc Layer::requireLabel()
    */
   bool requireLabel() const override { return true; }
 
-  float loss_scale; /**< loss scale factor */
-
 protected:
   /**
    * @brief update loss
diff --git a/nntrainer/layers/loss/meson.build b/nntrainer/layers/loss/meson.build
index 8ec9928101..9fccd0290d 100644
--- a/nntrainer/layers/loss/meson.build
+++ b/nntrainer/layers/loss/meson.build
@@ -7,9 +7,7 @@ loss_layer_sources = [
   'constant_derivative_loss_layer.cpp'
 ]
 
-loss_layer_headers = [
-  'loss_layer.h'
-]
+loss_layer_headers = []
 
 loss_layer_deps = []
 
diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp
index e6b6f068f6..ed4390655d 100644
--- a/nntrainer/layers/loss/mse_loss_layer.cpp
+++ b/nntrainer/layers/loss/mse_loss_layer.cpp
@@ -11,7 +11,6 @@
  *
  */
 
-#include "tensor.h"
 #include
 #include
 
@@ -33,39 +32,21 @@ void MSELossLayer::forwarding(RunLayerContext &context, bool training) {
               .clone(ml::train::TensorDim::DataType::FP32);
 
   // hidden_ <- y2 - y;
-  auto out_type = hidden_.getDataType();
-  if (out_type != y_.getDataType()) {
-    Tensor y = y_.clone(out_type);
-    if (context.isLabelAvailable(SINGLE_INOUT_IDX)) {
-      Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX);
-      y2.subtract(y, hidden_);
-
-      /** calculate sum of squares normalized by size */
-      float l2norm = hidden_.l2norm();
-      l2norm *= l2norm / hidden_.size();
-
-      /** wrap in tensor for update loss */
-      Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm);
-      LossLayer::updateLoss(context, l);
-    }
-    // fill the output
-    hidden_.fill(y);
-  } else {
-    if (context.isLabelAvailable(SINGLE_INOUT_IDX)) {
-      Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX);
-      y2.subtract(y_, hidden_);
-
-      /** calculate sum of squares normalized by size */
-      float l2norm = hidden_.l2norm();
-      l2norm *= l2norm / hidden_.size();
-
-      /** wrap in tensor for update loss */
-      Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm);
-      LossLayer::updateLoss(context, l);
-    }
-    // fill the output
-    hidden_.fill(y_);
+  if (context.isLabelAvailable(SINGLE_INOUT_IDX)) {
+    Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX);
+    y2.subtract(y, hidden_);
+
+    /** calculate sum of squares normalized by size */
+    float l2norm = hidden_.l2norm();
+    l2norm *= l2norm / hidden_.size();
+
+    /** wrap in tensor for update loss */
+    Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm);
+    LossLayer::updateLoss(context, l);
   }
+
+  // fill the output
+  hidden_.fill(y);
 }
 
 void MSELossLayer::calcDerivative(RunLayerContext &context) {
@@ -92,33 +73,9 @@ void MSELossLayer::calcDerivative(RunLayerContext &context) {
   const Tensor &y2 = context.getIncomingDerivative(SINGLE_INOUT_IDX);
 
-  const auto &in_type = y.getDataType();
-  if (in_type != y2.getDataType()) {
-    Tensor y2_ = y2.clone(in_type);
-    y.subtract(y2_, ret_derivative);
-  } else {
-    y.subtract(y2, ret_derivative);
-  }
-
-  applyLossScale(ret_derivative);
-
+  y.subtract(y2, ret_derivative);
   float divider = ((float)y.size()) / 2;
-
-  /**
-   * ret_derivative may be eliminated by big divider with fp16 calculation.
-   * So, it calcuated with larger precision.
-   */
-  int ret;
-  if (ret_derivative.getDataType() != ml::train::TensorDim::DataType::FP32) {
-    Tensor ret_derivative_ =
-      ret_derivative.clone(ml::train::TensorDim::DataType::FP32);
-    ret = ret_derivative_.divide_i(divider);
-    ret_derivative.copyData(ret_derivative_);
-  } else {
-    ret = ret_derivative.divide_i(divider);
-  }
-
-  if (ret != ML_ERROR_NONE) {
+  if (ret_derivative.divide_i(divider) != ML_ERROR_NONE) {
     throw std::runtime_error(
       "[MSELossLayer::calcDerivative] Error when calculating loss");
diff --git a/nntrainer/optimizers/adam.cpp b/nntrainer/optimizers/adam.cpp
index e2d611b7fd..530e7fdf31 100644
--- a/nntrainer/optimizers/adam.cpp
+++ b/nntrainer/optimizers/adam.cpp
@@ -95,60 +95,26 @@ void Adam::applyGradient(RunOptimizerContext &context) {
   Tensor &wm = context.getOptimizerVariable(AdamParams::wm);
   Tensor &wv = context.getOptimizerVariable(AdamParams::wv);
 
-  if (context.getNumOptMasterVariable() != 0) {
-    Tensor &wm_m = context.getOptimizerMasterVariable(AdamParams::wm);
-    Tensor &wv_m = context.getOptimizerMasterVariable(AdamParams::wv);
-    Tensor x_grad_ = x_grad.clone(wm_m.getDataType());
+  wm.multiply_i(beta1);
+  wm.add_i(x_grad, 1.0f - beta1);
 
-    wm_m.multiply_i(beta1);
-    wm_m.add_i(x_grad_, 1.0f - beta1);
-
-    wv_m.multiply_i(beta2);
-    wv_m.add_i(x_grad_.multiply(x_grad_), 1.0f - beta2);
-
-    wm.copyData(wm_m);
-    wv.copyData(wv_m);
-  } else {
-    wm.multiply_i(beta1);
-    wm.add_i(x_grad, 1.0f - beta1);
-
-    wv.multiply_i(beta2);
-    wv.add_i(x_grad.multiply(x_grad), 1.0f - beta2);
-  }
+  wv.multiply_i(beta2);
+  wv.add_i(x_grad.multiply(x_grad), 1.0f - beta2);
 
   if (torch_ref) {
-    if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP32) {
-      Tensor denom = wv.apply(sqrtFloat);
-      denom.divide_i(sqrtFloat(biasCorrection2));
-      denom.add_i(epsilon);
-      wm.divide(denom, x_grad);
-#ifdef ENABLE_FP16
-    } else if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP16) {
-      Tensor denom = wv.apply<_FP16>(sqrtFloat<_FP16>);
-      denom.divide_i(sqrtFloat(biasCorrection2));
-      denom.add_i(epsilon);
-      wm.divide(denom, x_grad);
-#endif
-    } else {
-      throw std::runtime_error("Not supported datatype");
-    }
+    Tensor denom = wv.apply(sqrtFloat);
+    denom.divide_i(sqrtFloat(biasCorrection2));
+    denom.add_i(epsilon);
+    wm.divide(denom, x_grad);
 
     context.applyGradient(context.getLearningRate() / biasCorrection1, x_grad);
   } else {
-    auto sqrtEps = [epsilon](T f) -> T {
-      return 1 / (static_cast(sqrtDouble(f)) + static_cast(epsilon));
+    std::function sqrtEps = [epsilon](double f) {
+      return 1 / (sqrtDouble(f) + epsilon);
     };
-    if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP32)
-      x_grad = wv.apply(sqrtEps, x_grad);
-#ifdef ENABLE_FP16
-    else if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP16)
-      x_grad = wv.apply<_FP16>(sqrtEps, x_grad);
-#endif
-    else
-      throw std::runtime_error("Not supported datatype");
-
+    x_grad = wv.apply(sqrtEps, x_grad);
     x_grad.multiply_i(wm);
 
     context.applyGradient(
       getUpdatedLearningRate(context.getIteration(), context.getLearningRate()),
diff --git a/nntrainer/optimizers/optimizer_context.cpp b/nntrainer/optimizers/optimizer_context.cpp
index f0817c2a75..f70ab773a9 100644
--- a/nntrainer/optimizers/optimizer_context.cpp
+++ b/nntrainer/optimizers/optimizer_context.cpp
@@ -36,21 +36,6 @@ Tensor &RunOptimizerContext::getOptimizerVariable(unsigned int idx) const {
   return weight->getOptimizerVariableRef(idx);
 }
 
-/**
- * @brief Get the optimizer variable associated to this weight
- */
-Tensor &
-RunOptimizerContext::getOptimizerMasterVariable(unsigned int idx) const {
-  return weight->getOptimizerMasterVariableRef(idx);
-}
-
-/**
- * @brief Get number of optimizer master variable
- */
-int RunOptimizerContext::getNumOptMasterVariable() {
-  return weight->getNumOptMasterVariable();
-}
-
 /**
  * @brief Apply the gradient with the given learning rate
  */
diff --git a/nntrainer/optimizers/optimizer_context.h b/nntrainer/optimizers/optimizer_context.h
index ca30c36b94..6b4b983e35 100644
--- a/nntrainer/optimizers/optimizer_context.h
+++ b/nntrainer/optimizers/optimizer_context.h
@@ -59,21 +59,6 @@ class RunOptimizerContext {
    */
   Tensor &getOptimizerVariable(unsigned int idx) const;
 
-  /**
-   * @brief Get the optimizer Master variable associated to this weight
-   *
-   * @param idx Identifier of the associated weight
-   * @return Tensor& Reference to the optimizer variable
-   */
-  Tensor &getOptimizerMasterVariable(unsigned int idx) const;
-
-  /**
-   * @brief Get number of the optimizer Master variable
-   *
-   * @return number of optimizer master variable
-   */
-  int getNumOptMasterVariable();
-
   /**
    * @brief Check if run context is set and is ready to use
    *

From edd5778f9fc0e4188da3e5b4efd67ddced0fde3a Mon Sep 17 00:00:00 2001
From: Donghak PARK
Date: Fri, 17 May 2024 18:35:25 +0900
Subject: [PATCH 14/14] [unittest] Add mixed precision unit test

Add 2 cases of mixed precision unit tests:
- case1: FC-FC-FC
- case2: Flatten-FC

I will add more mixed precision cases (conv, lstm, etc.).

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Donghak PARK
---
 .../unittest_models_mixed_precision.cpp      | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/test/unittest/models/unittest_models_mixed_precision.cpp b/test/unittest/models/unittest_models_mixed_precision.cpp
index becf11ff44..afed06960a 100644
--- a/test/unittest/models/unittest_models_mixed_precision.cpp
+++ b/test/unittest/models/unittest_models_mixed_precision.cpp
@@ -41,11 +41,52 @@ static std::unique_ptr fc_mixed_training() {
   return nn;
 }
 
+static std::unique_ptr multi_fc_mixed_training() {
+  std::unique_ptr nn(new NeuralNetwork());
+  nn->setProperty(
+    {"batch_size=2", "model_tensor_type=FP16-FP16", "loss_scale=128"});
+
+  auto graph = makeGraph({
+    {"input", {"name=in", "input_shape=1:28:28"}},
+    {"Fully_connected", {"name=fc1", "input_layers=in", "unit=100"}},
+    {"Fully_connected", {"name=fc2", "input_layers=fc1", "unit=50"}},
+    {"Fully_connected", {"name=fc3", "input_layers=fc2", "unit=10"}},
+    {"mse", {"name=loss", "input_layers=fc3"}},
+  });
+  for (auto &node : graph) {
+    nn->addLayer(node);
+  }
+  nn->setOptimizer(ml::train::createOptimizer("adam", {"learning_rate=0.1"}));
+  return nn;
+}
+
+static std::unique_ptr flatten_fc_mixed_training() {
+  std::unique_ptr nn(new NeuralNetwork());
+  nn->setProperty(
+    {"batch_size=2", "model_tensor_type=FP16-FP16", "loss_scale=128"});
+
+  auto graph = makeGraph({
+    {"input", {"name=in", "input_shape=1:28:28"}},
+    {"flatten", {"name=flatten", "input_layers=in"}},
+    {"Fully_connected", {"name=fc", "input_layers=flatten", "unit=10"}},
+    {"mse", {"name=loss", "input_layers=fc"}},
+  });
+  for (auto &node : graph) {
+    nn->addLayer(node);
+  }
+  nn->setOptimizer(ml::train::createOptimizer("adam", {"learning_rate=0.1"}));
+  return nn;
+}
+
 GTEST_PARAMETER_TEST(
   MixedPrecision, nntrainerModelTest,
   ::testing::ValuesIn({
     mkModelTc_V2(fc_mixed_training, "fc_mixed_training",
                  ModelTestOption::NO_THROW_RUN_V2),
+    mkModelTc_V2(multi_fc_mixed_training, "multi_fc_mixed_training",
+                 ModelTestOption::NO_THROW_RUN_V2),
+    mkModelTc_V2(flatten_fc_mixed_training, "flatten_fc_mixed_training",
+                 ModelTestOption::NO_THROW_RUN_V2),
     /** ModelTestOption::ALL_V2),
      * Disabled for now to check */