From 21b7aacef7b8d5fc4805637c8b95572eb6c70c73 Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Thu, 2 May 2024 20:43:57 +0900 Subject: [PATCH 01/14] [ Weight ] Add Var32 Tensor in Weight. We will add a Var32 Tensor if the Variable Weight is not full precision (FP32). This enables weight updates in full precision, and only the Apply Gradient process uses this tensor. Therefore, the lifespan of this tensor should be "ApplyGradient". . Modify TensorPool to generate Weight considering Mixed Precision. **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/graph/network_graph.cpp | 13 ++++- nntrainer/layers/layer_node.h | 1 + nntrainer/tensor/manager.cpp | 31 ++++++++++-- nntrainer/tensor/manager.h | 2 +- nntrainer/tensor/weight.cpp | 83 +++++++++++++++++++++++++++++++ nntrainer/tensor/weight.h | 50 ++++++++++++------- 6 files changed, 154 insertions(+), 26 deletions(-) diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index 2d4cfdc769..370c18964f 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -1557,7 +1557,18 @@ void NetworkGraph::requestOptimizerVariable( std::vector dims = cb(dim); w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables( dims, w->getName(), TensorLifespan::MAX_LIFESPAN, - w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS)); + w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), + Tensor::Initializer::ZEROS)); + + if (dim.getDataType() != ml::train::TensorDim::DataType::FP32) { + for (auto &dim : dims) + dim.setDataType(ml::train::TensorDim::DataType::FP32); + w->setOptimizerVariables32( + tensor_manager->requestWeightOptimizerVariables( + dims, w->getName(), TensorLifespan::MAX_LIFESPAN, + w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), + Tensor::Initializer::ZEROS)); + } } } } diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h index 93e7ac7069..f8e5b7c4e9 100644 --- a/nntrainer/layers/layer_node.h +++ b/nntrainer/layers/layer_node.h @@ -487,6 +487,7 @@ class LayerNode final : public ml::train::Layer, public GraphNode { const std::vector getOutputDimensions() const; /** * @brief Get the Weight object + * currently, only unittests use this function.
* * @param idx Identifier of the weight * @return Weight& Reference to the weight diff --git a/nntrainer/tensor/manager.cpp b/nntrainer/tensor/manager.cpp index 9a0d235ba9..572bd217cf 100644 --- a/nntrainer/tensor/manager.cpp +++ b/nntrainer/tensor/manager.cpp @@ -414,7 +414,7 @@ std::vector Manager::requestWeights( // var_exec_order.push_back(TensorPool::PERSIST_END_ORDER); } - Tensor *var = nullptr, *grad = nullptr; + Tensor *var = nullptr, *grad = nullptr, *var32 = nullptr; bool is_dependent = !shared_names.empty(); if (is_dependent) { /// shared_name is used and the orignal name is discarded @@ -431,6 +431,17 @@ std::vector Manager::requestWeights( grad = tensor_pool.requestOrExtend(shared_name + Var_Grad::grad_suffix, dim_g, grad_exec_order, grad_ls, Tensor::Initializer::ZEROS); + + if (var->getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim_v); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + std::vector var32_exec_order; + var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER); + + var32 = weight_pool.requestOrExtend(shared_name + ":var32", var32_dim, + var32_exec_order, var_ls, + Tensor::Initializer::ZEROS); + } } } else { /** case requesting fresh weights */ @@ -448,11 +459,21 @@ std::vector Manager::requestWeights( grad = tensor_pool.request(name + Var_Grad::grad_suffix, dim_g, grad_exec_order, grad_ls, Tensor::Initializer::ZEROS, is_wgrad); + if (var->getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim_v); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + std::vector var32_exec_order; + var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER); + var32 = + weight_pool.request(name + ":var32", var32_dim, var32_exec_order, + var_ls, Tensor::Initializer::ZEROS); + } } } - weights_v2.emplace_back(std::make_unique( - var, grad, w_reg, w_reg_const, decay, is_dependent, clip_by_global_norm)); + weights_v2.emplace_back( + std::make_unique(var, grad, var32, w_reg, w_reg_const, decay, + is_dependent, clip_by_global_norm)); } std::transform(weights_v2.begin() + current_size, weights_v2.end(), @@ -668,7 +689,7 @@ bool Manager::isSecondLastAccess(const std::string &name, */ std::vector Manager::requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, - const TensorLifespan &lifespan, bool is_grad_clip, + const TensorLifespan &lifespan, bool is_grad_clip, bool is_mixed_precision, Tensor::Initializer initializer) { std::vector ret; @@ -676,7 +697,7 @@ std::vector Manager::requestWeightOptimizerVariables( std::vector exec; exec.reserve(1); - if (is_grad_clip) { + if (is_grad_clip || is_mixed_precision) { exec.emplace_back(TensorPool::PERSIST_END_ORDER); } else { exec.emplace_back(getMinMaxTensorExecutionOrder(name, true).second); diff --git a/nntrainer/tensor/manager.h b/nntrainer/tensor/manager.h index ab1c018153..1fa810a35c 100644 --- a/nntrainer/tensor/manager.h +++ b/nntrainer/tensor/manager.h @@ -224,7 +224,7 @@ class Manager { */ std::vector requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, - const TensorLifespan &lifespan, bool is_grad_clip, + const TensorLifespan &lifespan, bool is_grad_clip, bool is_mixed_type, Tensor::Initializer initializer = Tensor::Initializer::NONE); /** diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp index f98c8c8356..f86c327842 100644 --- a/nntrainer/tensor/weight.cpp +++ b/nntrainer/tensor/weight.cpp @@ -34,6 +34,28 @@ Weight::Weight(const TensorDim &dim, const Tensor::Initializer 
init, throw std::invalid_argument("Weight initializer cannot be none"); if (regularizer == WeightRegularizer::UNKNOWN) throw std::invalid_argument("Weight regularizer unknown"); + + std::string var32_suffix = ":fp32"; + std::string var32_name = name + var32_suffix; + + /** + * @note We assume if the Weight Data Type is not FP32, then FP32 Weight is + * necessary to maintain the accuracy. + * We could think it can be other data type and if there is the case to + * support other data type, then the code below needs to be udpated. + * + * Also, the loss_scale is not used in Weight but leave as it is for later + * usage. + */ + + if (train && dim.getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + + var32 = std::make_shared(var32_dim, alloc_now_, init, var32_name); + } else { + var32 = std::make_shared(var32_name); + } } Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g, @@ -52,6 +74,67 @@ Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g, throw std::invalid_argument("Weight initializer cannot be none"); if (regularizer == WeightRegularizer::UNKNOWN) throw std::invalid_argument("Weight regularizer unknown"); + + std::string var32_suffix = ":fp32"; + std::string var32_name = name + var32_suffix; + + if (train && dim_v.getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim_v); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + std::string var32_suffix = ":fp32"; + std::string var32_name = name + var32_suffix; + + var32 = std::make_shared(var32_dim, alloc_now_, init, var32_name); + } else { + var32 = std::make_shared(var32_name); + } +} + +Weight::Weight(const Tensor &v, const Tensor &g, const std::string &n, + bool is_dependent, unsigned int output_axis_) : + Var_Grad(v, g, n, is_dependent), + regularizer(WeightRegularizer::NONE), + regularizer_constant(1.0f), + decay(0.0f), + clip_by_global_norm(0.0f), + output_axis(output_axis_), + loss_scale(0.0) { + + std::string var32_suffix = ":fp32"; + std::string var32_name = n + var32_suffix; + + /** + * @note We assume here that Weight is created with variable and gradient + * tensor. It is not copy or clone and, therefore, we do need create var32 if + * it is trainable. For now, We haven't seen the case create wieght with var, + * grad and var32. But we will add weight constructor if there is the cases. 
+ */ + + if (!g.empty() && v.getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(v.getDim()); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + + var32 = std::make_shared(var32_dim, true, Tensor::Initializer::NONE, + var32_name); + } else { + var32 = std::make_shared(var32_name); + } +} + +Weight::Weight(Tensor *v, Tensor *g, Tensor *v32, const WeightRegularizer reg, + const float reg_const, const float decay, bool is_dependent, + const float max_norm, unsigned int output_axis_, + float loss_scale_) : + Var_Grad(v, g, is_dependent), + regularizer(reg), + regularizer_constant(reg_const), + decay(decay), + clip_by_global_norm(max_norm), + output_axis(output_axis_), + loss_scale(loss_scale_), + var32(std::shared_ptr(v32, [](void *) {})) { + if (!v32) + var32 = std::make_shared(); } } // namespace nntrainer diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h index 552f6d5739..3a81d1f58e 100644 --- a/nntrainer/tensor/weight.h +++ b/nntrainer/tensor/weight.h @@ -124,34 +124,22 @@ class Weight : public Var_Grad { * if the owner of these tensors free the tensors. */ explicit Weight(const Tensor &v, const Tensor &g, const std::string &n = "", - bool is_dependent = false, unsigned int output_axis_ = 3) : - Var_Grad(v, g, n, is_dependent), - regularizer(WeightRegularizer::NONE), - regularizer_constant(1.0f), - decay(0.0f), - clip_by_global_norm(0.0f), - output_axis(output_axis_), - loss_scale(0.0) {} + bool is_dependent = false, unsigned int output_axis_ = 3); /** * @brief Construct a new Weight object * * @param v ptr to already created variable tensor * @param g ptr to already created gradient tensor + * @param v32 ptr to already created variable32 tensor * @param reg Regularizer for the weight * @param reg_const Constant multiplier for regularizer */ - explicit Weight(Tensor *v, Tensor *g, const WeightRegularizer reg, - const float reg_const, const float decay, - bool is_dependent = false, const float max_norm = 0.0f, - unsigned int output_axis_ = 3, float loss_scale_ = 0.0f) : - Var_Grad(v, g, is_dependent), - regularizer(reg), - regularizer_constant(reg_const), - decay(decay), - clip_by_global_norm(max_norm), - output_axis(output_axis_), - loss_scale(loss_scale_) {} + explicit Weight(Tensor *v, Tensor *g, Tensor *v32, + const WeightRegularizer reg, const float reg_const, + const float decay, bool is_dependent = false, + const float max_norm = 0.0f, unsigned int output_axis_ = 3, + float loss_scale_ = 0.0f); /** * @brief Swap for weight @@ -170,6 +158,7 @@ class Weight : public Var_Grad { swap(lhs.output_axis, rhs.output_axis); swap(lhs.opt_vars, rhs.opt_vars); swap(lhs.loss_scale, rhs.loss_scale); + swap(lhs.var32, rhs.var32); } /** @@ -213,6 +202,8 @@ class Weight : public Var_Grad { w.var = std::make_shared(this->var->clone()); if (!this->grad->empty()) w.grad = std::make_shared(this->grad->clone()); + if (!this->var32->empty()) + w.var32 = std::make_shared(this->var32->clone()); return w; } @@ -230,6 +221,16 @@ class Weight : public Var_Grad { opt_vars = tensors; } + /** + * @brief Add optimizer variables32 + * We assume if the datatype of weight is not FP32, then it needs to set + * OptmizerVarialbe32 to maintain acccuracy. 
+ * @param tensors OptimizerVariable32 Tensor list + */ + void setOptimizerVariables32(std::vector tensors) { + opt_vars32 = tensors; + } + /** * @brief Get optimizer variable reference * @param idx Index of the optimizer variable to get @@ -316,6 +317,16 @@ class Weight : public Var_Grad { return clip_by_global_norm > epsilon; } + /** + * @brief Check if the variable type is not full precision + * + * @return true if it is not full precsion + * @return false otherwise + */ + bool isMixedPrecision() const { + return var->getDataType() == ml::train::TensorDim::DataType::FP32; + } + /** * @brief clip the gradient value based on the given global norm * @@ -338,6 +349,7 @@ class Weight : public Var_Grad { unsigned int output_axis; float loss_scale; std::vector opt_vars; /**< optimizer variables */ + std::vector opt_vars32; std::shared_ptr var32; /** From 0d899d57498003b4e92f452bd168c85426c21c3e Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Tue, 7 May 2024 13:24:00 +0900 Subject: [PATCH 02/14] [ Mixed ] Create weight with var32 tensor This pr create the variable fp32 tensor when we create the Weight and Optimizer Weight. . update the manager to create Weight with var32 tensor which requested to weight pool. . update the weight requests with Weight Spec and var, grad and var32 tensors which created already. . add clone Tensor with specific type in tensor.h Resolves: **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/graph/network_graph.cpp | 7 ++++--- nntrainer/layers/layer_context.cpp | 13 +++++++++++++ nntrainer/layers/layer_context.h | 9 +++++++++ nntrainer/layers/layer_node.h | 8 ++++---- nntrainer/tensor/manager.cpp | 6 +++--- nntrainer/tensor/manager.h | 3 ++- nntrainer/tensor/tensor.cpp | 12 ++++++++++++ nntrainer/tensor/tensor.h | 7 +++++++ nntrainer/tensor/weight.cpp | 28 +++++++++------------------- nntrainer/tensor/weight.h | 15 ++++++++++++--- 10 files changed, 75 insertions(+), 33 deletions(-) diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index 370c18964f..c0ee126c93 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -768,6 +768,7 @@ NetworkGraph::finalizeContext(const std::shared_ptr &lnode, * node is going to be used with in-place optimizations. 
*/ auto out_specs = init_context.getOutSpecs(); + /// @note try move inplace control to finalize bool shared_var = false, shared_grad = false; if (lnode->executeInPlace() != InPlace::NONE) { @@ -1556,16 +1557,16 @@ void NetworkGraph::requestOptimizerVariable( const TensorDim &dim = w->getDim(); std::vector dims = cb(dim); w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables( - dims, w->getName(), TensorLifespan::MAX_LIFESPAN, + dims, w->getName(), ":opt", TensorLifespan::MAX_LIFESPAN, w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), Tensor::Initializer::ZEROS)); - if (dim.getDataType() != ml::train::TensorDim::DataType::FP32) { + if (w->isMixedPrecision()) { for (auto &dim : dims) dim.setDataType(ml::train::TensorDim::DataType::FP32); w->setOptimizerVariables32( tensor_manager->requestWeightOptimizerVariables( - dims, w->getName(), TensorLifespan::MAX_LIFESPAN, + dims, w->getName(), ":opt32:", TensorLifespan::MAX_LIFESPAN, w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), Tensor::Initializer::ZEROS)); } diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp index fff2eb15ec..f0856c1dbb 100644 --- a/nntrainer/layers/layer_context.cpp +++ b/nntrainer/layers/layer_context.cpp @@ -169,6 +169,19 @@ Tensor &RunLayerContext::getWeightGrad(unsigned int idx) const { return weights[idx]->getGradientRef(); } +/** + * @brief Get the Weight Gradient tensor object + * + * @param idx Identifier of the weight + * @return Tensor& Reference to the weight grad tensor + */ +Tensor &RunLayerContext::getWeightFP32(unsigned int idx) const { + if (!weights[idx]->hasGradient()) + throw std::invalid_argument( + "Requesting gradient for a non-trainable weight."); + return weights[idx]->getVariableFP32Ref(); +} + /** * @brief Get the Weight Optimizer Variable tensor object * diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h index e5c6759638..e2f428aa2c 100644 --- a/nntrainer/layers/layer_context.h +++ b/nntrainer/layers/layer_context.h @@ -463,6 +463,15 @@ class RunLayerContext { Tensor &getWeightGrad(unsigned int idx) const; /** + * @brief Get the Weight Gradient tensor object + * + * @param idx Identifier of the weight + * @return Tensor& Reference to the weight grad tensor + */ + Tensor &getWeightFP32(unsigned int idx) const; + + /** + * @brief Get the Weight Optimizer Variable tensor object * * @param idx Identifier of the weight diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h index f8e5b7c4e9..7dfb1bd1a0 100644 --- a/nntrainer/layers/layer_node.h +++ b/nntrainer/layers/layer_node.h @@ -496,11 +496,11 @@ class LayerNode final : public ml::train::Layer, public GraphNode { NNTR_THROW_IF(!run_context, std::runtime_error) << __func__ << " layer needs to be finalized first!"; if (run_context->weightHasGradient(idx)) { - return Weight(run_context->getWeight(idx), - run_context->getWeightGrad(idx), - run_context->getWeightName(idx)); + return Weight( + run_context->getWeight(idx), run_context->getWeightGrad(idx), + run_context->getWeightFP32(idx), run_context->getWeightName(idx)); } else { - return Weight(run_context->getWeight(idx), Tensor(), + return Weight(run_context->getWeight(idx), Tensor(), Tensor(), run_context->getWeightName(idx)); } } diff --git a/nntrainer/tensor/manager.cpp b/nntrainer/tensor/manager.cpp index 572bd217cf..b4ac106b12 100644 --- a/nntrainer/tensor/manager.cpp +++ b/nntrainer/tensor/manager.cpp @@ -689,8 +689,8 @@ bool Manager::isSecondLastAccess(const std::string &name, */ 
std::vector Manager::requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, - const TensorLifespan &lifespan, bool is_grad_clip, bool is_mixed_precision, - Tensor::Initializer initializer) { + const std::string &suffix, const TensorLifespan &lifespan, bool is_grad_clip, + bool is_mixed_precision, Tensor::Initializer initializer) { std::vector ret; ret.reserve(dims.size()); @@ -706,7 +706,7 @@ std::vector Manager::requestWeightOptimizerVariables( /// @note this is assuming weight optimizer variables is treated as weight, if /// not, there is room to optimize below behavior for (unsigned int idx = 0; idx < dims.size(); idx++) - ret.push_back(weight_pool.request(name + ":opt" + std::to_string(idx), + ret.push_back(weight_pool.request(name + suffix + std::to_string(idx), dims[idx], exec, lifespan, initializer)); return ret; diff --git a/nntrainer/tensor/manager.h b/nntrainer/tensor/manager.h index 1fa810a35c..80ffb9d21d 100644 --- a/nntrainer/tensor/manager.h +++ b/nntrainer/tensor/manager.h @@ -224,7 +224,8 @@ class Manager { */ std::vector requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, - const TensorLifespan &lifespan, bool is_grad_clip, bool is_mixed_type, + const std::string &suffix, const TensorLifespan &lifespan, + bool is_grad_clip, bool is_mixed_type, Tensor::Initializer initializer = Tensor::Initializer::NONE); /** diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index 4f1e8e0721..b14bbd7ae4 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -3065,6 +3065,18 @@ Tensor Tensor::clone() const { return t; } +Tensor Tensor::clone(ml::train::TensorDim::DataType type) const { + if (getDataType() == type) + return clone(); + + TensorDim dim = getDim(); + dim.setDataType(type); + Tensor t(dim, true); + t.copyData(*this); + t.name = name; + return t; +} + void Tensor::reshape(const TensorDim &d) { NNTR_THROW_IF(!contiguous, std::invalid_argument) diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h index 211334da40..2ea0393e66 100644 --- a/nntrainer/tensor/tensor.h +++ b/nntrainer/tensor/tensor.h @@ -1680,6 +1680,13 @@ class Tensor { */ Tensor clone() const; + /** + * @brief Convient wrapper for inplace copy of @a this. + * @param[in] type output tensor data type + * @retval Copied version of this + */ + Tensor clone(ml::train::TensorDim::DataType type) const; + /** * @brief Save the Tensor into file * @param[in] file output file stream diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp index f86c327842..904948491f 100644 --- a/nntrainer/tensor/weight.cpp +++ b/nntrainer/tensor/weight.cpp @@ -90,34 +90,24 @@ Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g, } } -Weight::Weight(const Tensor &v, const Tensor &g, const std::string &n, - bool is_dependent, unsigned int output_axis_) : +Weight::Weight(const Tensor &v, const Tensor &g, const Tensor &v32, + const std::string &n, bool is_dependent, + unsigned int output_axis_) : Var_Grad(v, g, n, is_dependent), regularizer(WeightRegularizer::NONE), regularizer_constant(1.0f), decay(0.0f), clip_by_global_norm(0.0f), output_axis(output_axis_), - loss_scale(0.0) { + loss_scale(0.0), + var32(std::make_shared(n + ":fp32")) { - std::string var32_suffix = ":fp32"; - std::string var32_name = n + var32_suffix; - - /** - * @note We assume here that Weight is created with variable and gradient - * tensor. It is not copy or clone and, therefore, we do need create var32 if - * it is trainable. 
For now, We haven't seen the case create wieght with var, - * grad and var32. But we will add weight constructor if there is the cases. - */ - - if (!g.empty() && v.getDataType() != ml::train::TensorDim::DataType::FP32) { + if (!g.empty() && isMixedPrecision()) { TensorDim var32_dim(v.getDim()); var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); - - var32 = std::make_shared(var32_dim, true, Tensor::Initializer::NONE, - var32_name); - } else { - var32 = std::make_shared(var32_name); + if (!v32.empty()) + var32 = std::make_shared( + v32.getSharedDataTensor(var32_dim, 0, false, n + ":fp32")); } } diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h index 3a81d1f58e..1d69d7c72d 100644 --- a/nntrainer/tensor/weight.h +++ b/nntrainer/tensor/weight.h @@ -114,6 +114,7 @@ class Weight : public Var_Grad { * * @param v Already created variable object * @param g Already created gradient object + * @param v32 Already created gradient object * @param n Name for this Weight * * @note This is primarily used to created wrapper of variable extracted from @@ -123,8 +124,9 @@ class Weight : public Var_Grad { * uses only, as Weight does not own the tensors v and g, and can go invalid * if the owner of these tensors free the tensors. */ - explicit Weight(const Tensor &v, const Tensor &g, const std::string &n = "", - bool is_dependent = false, unsigned int output_axis_ = 3); + explicit Weight(const Tensor &v, const Tensor &g, const Tensor &v32, + const std::string &n = "", bool is_dependent = false, + unsigned int output_axis_ = 3); /** * @brief Construct a new Weight object @@ -324,7 +326,7 @@ class Weight : public Var_Grad { * @return false otherwise */ bool isMixedPrecision() const { - return var->getDataType() == ml::train::TensorDim::DataType::FP32; + return var->getDataType() != ml::train::TensorDim::DataType::FP32; } /** @@ -337,6 +339,13 @@ class Weight : public Var_Grad { grad->multiply_i(clip_by_global_norm / (global_norm + epsilon)); } + /** + * @brief Get the variable FP32 tensor (by reference) + * + * @return Tensor Variable FP32 tensor + */ + Tensor &getVariableFP32Ref() { return *var32.get(); } + private: static constexpr float epsilon = 1e-6; /**< epsilon for zero comparison */ static constexpr float epsilon_decay = From 6b71b42ca1d42de4f0825729a8953da51cdcc0f0 Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Tue, 7 May 2024 13:24:49 +0900 Subject: [PATCH 03/14] [ Layers ] Update Layers to support FP16 This PR enables the FP16 support for the layers below: . input layer . mse loss layer Resolves: **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. 
Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/layers/input_layer.cpp | 19 ++++++++++++++++--- nntrainer/layers/input_layer.h | 3 ++- nntrainer/layers/loss/mse_loss_layer.cpp | 11 ++++++++++- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/nntrainer/layers/input_layer.cpp b/nntrainer/layers/input_layer.cpp index eabd40b297..d9f058d8ce 100644 --- a/nntrainer/layers/input_layer.cpp +++ b/nntrainer/layers/input_layer.cpp @@ -33,8 +33,7 @@ namespace nntrainer { static constexpr size_t SINGLE_INOUT_IDX = 0; InputLayer::InputLayer() : - Layer(), - input_props(props::Normalization(), props::Standardization()) {} + Layer(), input_props(props::Normalization(), props::Standardization()) {} void InputLayer::setProperty(const std::vector &values) { auto remain_props = loadProperties(values, input_props); @@ -47,7 +46,7 @@ void InputLayer::forwarding(RunLayerContext &context, bool training) { Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX); if (!context.executeInPlace()) { Tensor &input_ = context.getInput(SINGLE_INOUT_IDX); - hidden_.copy(input_); + hidden_.copyData(input_); } if (std::get(input_props)) @@ -70,7 +69,21 @@ void InputLayer::finalize(InitLayerContext &context) { std::vector output_dims = context.getInputDimensions(); + for (auto &d : output_dims) { + d.setDataType(context.getActivationDataType()); + } + context.setOutputDimensions(output_dims); + + is_inplace = true; + + /** + * @note Input Layer assuems that the FP32 IN Tensor always. Therefore, if the + * activation data type is not fp32, then it does not support in-place + * operation. + */ + if (context.getActivationDataType() != ml::train::TensorDim::DataType::FP32) + is_inplace = false; } } /* namespace nntrainer */ diff --git a/nntrainer/layers/input_layer.h b/nntrainer/layers/input_layer.h index f6728d676b..e9183e23d1 100644 --- a/nntrainer/layers/input_layer.h +++ b/nntrainer/layers/input_layer.h @@ -82,7 +82,7 @@ class InputLayer : public Layer { /** * @copydoc Layer::supportInPlace() */ - bool supportInPlace() const override { return true; } + bool supportInPlace() const override { return is_inplace; } /** * @copydoc Layer::exportTo(Exporter &exporter, ml::train::ExportMethods @@ -105,6 +105,7 @@ class InputLayer : public Layer { private: std::tuple input_props; + bool is_inplace; }; } // namespace nntrainer diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp index 7f7bd1626f..ec9bc9b844 100644 --- a/nntrainer/layers/loss/mse_loss_layer.cpp +++ b/nntrainer/layers/loss/mse_loss_layer.cpp @@ -20,7 +20,16 @@ static constexpr size_t SINGLE_INOUT_IDX = 0; void MSELossLayer::forwarding(RunLayerContext &context, bool training) { Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX); - Tensor &y = context.getInput(SINGLE_INOUT_IDX); + + Tensor empty_tensor; + Tensor &y = context.getInput(SINGLE_INOUT_IDX).getDataType() == + ml::train::TensorDim::DataType::FP32 + ? context.getInput(SINGLE_INOUT_IDX) + : empty_tensor; + + if (y.empty()) + y = context.getInput(SINGLE_INOUT_IDX) + .clone(ml::train::TensorDim::DataType::FP32); // hidden_ <- y2 - y; if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { From 544de98e10ae11b0a667588d679d38694b5a9ebc Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Tue, 7 May 2024 13:26:42 +0900 Subject: [PATCH 04/14] [ Test ] Mixed Precision Test Case This PR includes the mixed precision test case. . Input - FC - MSE : "batch_size=2", "model_tensor_type=FP16-FP16", "loss_scale=128" **Self evaluation:** 1. 
Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- Applications/KNN/jni/meson.build | 2 +- nntrainer/graph/network_graph.cpp | 2 +- test/unittest/models/meson.build | 4 ++ .../unittest_models_mixed_precision.cpp | 54 +++++++++++++++++++ 4 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 test/unittest/models/unittest_models_mixed_precision.cpp diff --git a/Applications/KNN/jni/meson.build b/Applications/KNN/jni/meson.build index bc50dc0214..58ca099d75 100644 --- a/Applications/KNN/jni/meson.build +++ b/Applications/KNN/jni/meson.build @@ -15,4 +15,4 @@ e = executable('knn_sample', install_dir: application_install_dir ) -test('app_knn', e, args: [nntr_app_resdir / 'KNN']) +test('app_knn', e, args: [nntr_app_resdir / 'KNN/']) diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index c0ee126c93..97e20f6a24 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -771,7 +771,7 @@ NetworkGraph::finalizeContext(const std::shared_ptr &lnode, /// @note try move inplace control to finalize bool shared_var = false, shared_grad = false; - if (lnode->executeInPlace() != InPlace::NONE) { + if (lnode->executeInPlace() != InPlace::NONE && lnode->supportInPlace()) { setInplaceSharedMemoryConfigByLayer(lnode, shared_var, shared_grad); for (unsigned int i = 0; i < out_specs.size(); ++i) { auto &s = out_specs.at(i); diff --git a/test/unittest/models/meson.build b/test/unittest/models/meson.build index 7166fc41ff..4a6e81e65d 100644 --- a/test/unittest/models/meson.build +++ b/test/unittest/models/meson.build @@ -11,6 +11,10 @@ models_targets = [ # disable temperally ] +if get_option('enable-fp16') + models_targets += 'unittest_models_mixed_precision.cpp' +endif + test_target += models_targets exe = executable( test_name, diff --git a/test/unittest/models/unittest_models_mixed_precision.cpp b/test/unittest/models/unittest_models_mixed_precision.cpp new file mode 100644 index 0000000000..becf11ff44 --- /dev/null +++ b/test/unittest/models/unittest_models_mixed_precision.cpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Jijoong Moon + * + * @file unittest_models_mixed_precision.cpp + * @date 3 May 2024 + * @brief unittest models to cover mixed precision + * @see https://github.com/nnstreamer/nntrainer + * @author Jijoong Moon + * @bug No known bugs except for NYI items + */ + +#include + +#include + +#include +#include +#include + +#include + +using namespace nntrainer; + +static std::unique_ptr fc_mixed_training() { + std::unique_ptr nn(new NeuralNetwork()); + nn->setProperty( + {"batch_size=2", "model_tensor_type=FP16-FP16", "loss_scale=128"}); + + auto graph = makeGraph({ + {"input", {"name=in", "input_shape=1:1:3"}}, + {"Fully_connected", {"name=fc", "input_layers=in", "unit=10"}}, + {"mse", {"name=loss", "input_layers=fc"}}, + }); + for (auto &node : graph) { + nn->addLayer(node); + } + + nn->setOptimizer(ml::train::createOptimizer("adam", {"learning_rate = 0.1"})); + + return nn; +} + +GTEST_PARAMETER_TEST( + MixedPrecision, nntrainerModelTest, + ::testing::ValuesIn({ + mkModelTc_V2(fc_mixed_training, "fc_mixed_training", + ModelTestOption::NO_THROW_RUN_V2), + /** ModelTestOption::ALL_V2), + * Disabled for now to check + */ + }), + [](const testing::TestParamInfo &info) + -> const auto & { return std::get<1>(info.param); }); From 26ea6af4430f7bae48d37c9ce9490cbaa81b3ec6 Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: 
Thu, 9 May 2024 14:52:07 +0900 Subject: [PATCH 05/14] [ Optimizer ] Update Optimizer / Adam to support Mixed training This commit modify apply gradient in optimizer. We do not need to save optimizer variables in weight type. Only Optimizer needs the optimizer variables and we should update the weight with full precision to maintain the accuracy. Therefore, remove the var32 tensors for optimizer variables. Resolves: **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/graph/network_graph.cpp | 10 ------ nntrainer/optimizers/adam.cpp | 27 +++++++++++++--- nntrainer/optimizers/optimizer_context.cpp | 7 ++++ nntrainer/optimizers/optimizer_context.h | 14 ++++++-- nntrainer/tensor/manager.cpp | 6 ++-- nntrainer/tensor/weight.cpp | 37 ++++++++++++++++++++++ nntrainer/tensor/weight.h | 35 ++++++++++---------- 7 files changed, 99 insertions(+), 37 deletions(-) diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index 97e20f6a24..b7f4d1cffd 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -1560,16 +1560,6 @@ void NetworkGraph::requestOptimizerVariable( dims, w->getName(), ":opt", TensorLifespan::MAX_LIFESPAN, w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), Tensor::Initializer::ZEROS)); - - if (w->isMixedPrecision()) { - for (auto &dim : dims) - dim.setDataType(ml::train::TensorDim::DataType::FP32); - w->setOptimizerVariables32( - tensor_manager->requestWeightOptimizerVariables( - dims, w->getName(), ":opt32:", TensorLifespan::MAX_LIFESPAN, - w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), - Tensor::Initializer::ZEROS)); - } } } } diff --git a/nntrainer/optimizers/adam.cpp b/nntrainer/optimizers/adam.cpp index 18c0a0fcc1..530e7fdf31 100644 --- a/nntrainer/optimizers/adam.cpp +++ b/nntrainer/optimizers/adam.cpp @@ -36,7 +36,15 @@ Adam::~Adam() {} enum AdamParams { wm, wv }; std::vector Adam::getOptimizerVariableDim(const TensorDim &dim) { - return {dim, dim}; + /** + * @note We assume the optimizer parameters should be full precsion to + * maintain the accuracy even in mixed precision training. + */ + TensorDim wm_dim(dim); + TensorDim wv_dim(dim); + wm_dim.setDataType(ml::train::TensorDim::DataType::FP32); + wv_dim.setDataType(ml::train::TensorDim::DataType::FP32); + return {wm_dim, wv_dim}; } void Adam::exportTo(Exporter &exporter, @@ -64,7 +72,15 @@ double Adam::getUpdatedLearningRate(unsigned int iteration, double ll) const { } void Adam::applyGradient(RunOptimizerContext &context) { - Tensor &x_grad = context.getGradient(); + Tensor empty_tensor; + + Tensor &x_grad = + context.getGradient().getDataType() == ml::train::TensorDim::DataType::FP32 + ? 
context.getGradient() + : empty_tensor; + + if (x_grad.empty()) + x_grad = context.getGradient().clone(ml::train::TensorDim::DataType::FP32); auto &beta1 = std::get(adam_props).get(); auto &beta2 = std::get(adam_props).get(); @@ -91,7 +107,7 @@ void Adam::applyGradient(RunOptimizerContext &context) { denom.add_i(epsilon); wm.divide(denom, x_grad); - context.applyGradient(context.getLearningRate() / biasCorrection1); + context.applyGradient(context.getLearningRate() / biasCorrection1, x_grad); } else { std::function sqrtEps = [epsilon](double f) { @@ -100,8 +116,9 @@ void Adam::applyGradient(RunOptimizerContext &context) { x_grad = wv.apply(sqrtEps, x_grad); x_grad.multiply_i(wm); - context.applyGradient(getUpdatedLearningRate(context.getIteration(), - context.getLearningRate())); + context.applyGradient( + getUpdatedLearningRate(context.getIteration(), context.getLearningRate()), + x_grad); } } diff --git a/nntrainer/optimizers/optimizer_context.cpp b/nntrainer/optimizers/optimizer_context.cpp index da4cd1f7e9..f70ab773a9 100644 --- a/nntrainer/optimizers/optimizer_context.cpp +++ b/nntrainer/optimizers/optimizer_context.cpp @@ -42,4 +42,11 @@ Tensor &RunOptimizerContext::getOptimizerVariable(unsigned int idx) const { void RunOptimizerContext::applyGradient(double lr) const { weight->applyGradient(lr); } + +/** + * @brief Apply the gradient with the given learning rate and gradient + */ +void RunOptimizerContext::applyGradient(double lr, Tensor &updated_grad) const { + weight->applyGradient(lr, updated_grad); +} } // namespace nntrainer diff --git a/nntrainer/optimizers/optimizer_context.h b/nntrainer/optimizers/optimizer_context.h index 62f9e0945d..6b4b983e35 100644 --- a/nntrainer/optimizers/optimizer_context.h +++ b/nntrainer/optimizers/optimizer_context.h @@ -35,9 +35,7 @@ class RunOptimizerContext { * */ RunOptimizerContext(Weight *w = nullptr, size_t iter = 0, double lr = 0.0) : - weight(w), - iteration(iter), - learning_rate(lr) {} + weight(w), iteration(iter), learning_rate(lr) {} /** * @brief Get the Weight tensor object @@ -75,6 +73,16 @@ class RunOptimizerContext { */ void applyGradient(double lr) const; + /** + * @brief Apply the gradient with the given learning rate and updated + * gradient + * + * @param lr learning rate + * @param updated_grad gradient tensor which is updated. 
(usually it could be + * fp32) + */ + void applyGradient(double lr, Tensor &updated_grad) const; + /** * @brief Get the current iteration value * diff --git a/nntrainer/tensor/manager.cpp b/nntrainer/tensor/manager.cpp index b4ac106b12..14d710b3c0 100644 --- a/nntrainer/tensor/manager.cpp +++ b/nntrainer/tensor/manager.cpp @@ -471,9 +471,9 @@ std::vector Manager::requestWeights( } } - weights_v2.emplace_back( - std::make_unique(var, grad, var32, w_reg, w_reg_const, decay, - is_dependent, clip_by_global_norm)); + weights_v2.emplace_back(std::make_unique( + var, grad, var32, w_reg, w_reg_const, decay, is_dependent, + clip_by_global_norm, axis, loss_scale)); } std::transform(weights_v2.begin() + current_size, weights_v2.end(), diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp index 904948491f..d8db5ba094 100644 --- a/nntrainer/tensor/weight.cpp +++ b/nntrainer/tensor/weight.cpp @@ -127,4 +127,41 @@ Weight::Weight(Tensor *v, Tensor *g, Tensor *v32, const WeightRegularizer reg, var32 = std::make_shared(); } +void Weight::applyGradient(double lr, Tensor &updated_grad) { + if (isMixedPrecision() && + updated_grad.getDataType() == ml::train::TensorDim::DataType::FP32) { + updated_grad.divide(loss_scale); + var32->add_i(updated_grad, -lr); + quantizeWeight(); + return; + } + + return applyGradient(lr); +} + +void Weight::quantizeWeight() { + if (!isMixedPrecision()) + return; + + Tensor &var = getVariableRef(); + ml::train::TensorDim::DataType type = var.getDataType(); + switch (type) { + case ml::train::TensorDim::DataType::QINT4: + // NYI + break; + case ml::train::TensorDim::DataType::QINT8: + // NYI + break; + case ml::train::TensorDim::DataType::FP16: + getVariableRef().copy(getVariableFP32Ref()); + break; + case ml::train::TensorDim::DataType::FP32: + break; + default: + break; + } + + return; +} + } // namespace nntrainer diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h index 1d69d7c72d..5382c686e1 100644 --- a/nntrainer/tensor/weight.h +++ b/nntrainer/tensor/weight.h @@ -46,7 +46,7 @@ class Weight : public Var_Grad { decay(0.0f), clip_by_global_norm(0.0f), output_axis(3), - loss_scale(0.0) {} + loss_scale(1.0) {} /** * @brief Construct a new Weight object @@ -66,7 +66,7 @@ class Weight : public Var_Grad { const float reg_const = 1.0f, const float decay = 0.0f, const float clip_by_global_norm = 0.0f, bool ng = true, bool alloc_now = false, std::string name = "", unsigned int axis = 3, - float loss_scale_ = 0.0); + float loss_scale_ = 1.0); /** * @brief Construct a new Weight object @@ -87,7 +87,7 @@ class Weight : public Var_Grad { const float reg_const = 1.0f, const float decay = 0.0f, const float clip_by_global_norm = 0.0f, bool ng = true, bool alloc_now = false, std::string name = "", unsigned int axis = 3, - float loss_scale_ = 0.0); + float loss_scale_ = 1.0); /** * @brief Construct a new Weight object @@ -141,7 +141,7 @@ class Weight : public Var_Grad { const WeightRegularizer reg, const float reg_const, const float decay, bool is_dependent = false, const float max_norm = 0.0f, unsigned int output_axis_ = 3, - float loss_scale_ = 0.0f); + float loss_scale_ = 1.0f); /** * @brief Swap for weight @@ -223,16 +223,6 @@ class Weight : public Var_Grad { opt_vars = tensors; } - /** - * @brief Add optimizer variables32 - * We assume if the datatype of weight is not FP32, then it needs to set - * OptmizerVarialbe32 to maintain acccuracy. 
- * @param tensors OptimizerVariable32 Tensor list - */ - void setOptimizerVariables32(std::vector tensors) { - opt_vars32 = tensors; - } - /** * @brief Get optimizer variable reference * @param idx Index of the optimizer variable to get @@ -297,6 +287,13 @@ class Weight : public Var_Grad { */ void applyGradient(double lr) { var->add_i(*grad.get(), -lr); } + /** + * @brief Apply the gradient to the weight with updated gradient + * @param[in] updated_grad gradient tensor which is updated in optimizer + * it might be different data type with gradient in weight. .eg : FP32 + */ + void applyGradient(double lr, Tensor &updated_grad); + /** * @brief Check if the gradient is supposed to be clipped by global norm with * the given max_norm value @@ -346,6 +343,12 @@ class Weight : public Var_Grad { */ Tensor &getVariableFP32Ref() { return *var32.get(); } + /** + * @brief Quantize var32 to var + * + */ + void quantizeWeight(); + private: static constexpr float epsilon = 1e-6; /**< epsilon for zero comparison */ static constexpr float epsilon_decay = @@ -357,8 +360,8 @@ class Weight : public Var_Grad { float clip_by_global_norm; /**< constant factor to clip gradient by L2 norm */ unsigned int output_axis; float loss_scale; - std::vector opt_vars; /**< optimizer variables */ - std::vector opt_vars32; + std::vector + opt_vars; /**< optimizer variables : We assume it is always full-precsion*/ std::shared_ptr var32; /** From 738ee93089f762ce49836ef83786c5d979f1c057 Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Wed, 8 May 2024 19:04:18 +0900 Subject: [PATCH 06/14] [ Tensor ] add is_NaN check in Tensor This PR add is_NaN function to check if the tensor has NaN value. This is for the check NaN during mixed precision training. **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- meson.build | 17 +++-- nntrainer/tensor/blas_avx.cpp | 96 +++++++++++++++++++++++++++++ nntrainer/tensor/blas_avx.h | 20 ++++++ nntrainer/tensor/blas_interface.cpp | 36 +++++++++++ nntrainer/tensor/blas_interface.h | 10 +++ nntrainer/tensor/blas_neon.cpp | 39 ++++++++++++ nntrainer/tensor/blas_neon.h | 18 ++++++ nntrainer/tensor/meson.build | 9 ++- nntrainer/tensor/tensor.cpp | 12 ++++ nntrainer/tensor/tensor.h | 6 ++ packaging/nntrainer.spec | 13 +++- test/unittest/models/meson.build | 23 ++++++- 12 files changed, 288 insertions(+), 11 deletions(-) diff --git a/meson.build b/meson.build index d4aea330a4..7ae692e6d9 100644 --- a/meson.build +++ b/meson.build @@ -64,9 +64,19 @@ warning_c_flags = [ '-Wno-error=varargs' ] +arch = host_machine.cpu_family() + +if get_option('enable-avx') + extra_defines += '-DUSE_AVX=1' + if get_option('platform') == 'tizen' + add_project_arguments(['-mavx2'], language: ['c','cpp']) + else + add_project_arguments(['-march=native'], language: ['c','cpp']) + endif + message('-march=native added for AVX hardware acceleration.') +endif if get_option('enable-fp16') - arch = host_machine.cpu_family() if get_option('platform') == 'android' add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp']) extra_defines += '-DENABLE_FP16=1' @@ -105,11 +115,6 @@ if get_option('enable-fp16') if cc.version().version_compare('>=12.1.0') message ('Float16 for x86_64 enabled. 
Modern gcc-x64 generally supports float16 with _Float16.') extra_defines += '-DENABLE_FP16=1' - if get_option('enable-avx') - extra_defines += '-DUSE_AVX=1' - add_project_arguments(['-march=native'], language: ['c','cpp']) - message('-march=native added for AVX hardware acceleration.') - endif else warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.') endif diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp index ce59583d6f..2fd4908463 100644 --- a/nntrainer/tensor/blas_avx.cpp +++ b/nntrainer/tensor/blas_avx.cpp @@ -20,6 +20,7 @@ namespace nntrainer::avx { +#ifdef ENABLE_FP16 void vcvt_f16_f32(size_t N, const void *input, float *output) { assert(N != 0); assert(input != NULL); @@ -114,4 +115,99 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) { } } +bool hasNaN(const size_t N, const _Float16 *input) { + assert(N != 0); + assert(input != NULL); + + int temp = 0; + size_t idx = 0; + + // 16 single-precision check : ( X != X ) + for (; N - idx >= 16; idx += 16) { + const __m256 vec0 = + _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + const __m256 vec1 = + _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input + 8)); + + input += 16; + + __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + + if (temp) + return true; + + __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res1); + + if (temp) + return true; + } + + // 8 single-precision check : ( X != X ) + for (; N - idx >= 8; idx += 8) { + const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + input += 8; + __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + + if (temp) + return true; + } + + // remain check : ( X != X ) + while (idx < N) { + if (*input != *input) { + return true; + } + ++input; + } + + return false; +} +#endif + +bool hasNaN(const size_t N, const float *input) { + assert(N != 0); + assert(input != NULL); + + int temp = 0; + size_t idx = 0; + + // 16 single-precision check : ( X != X ) + for (; N - idx >= 16; idx += 16) { + const __m256 vec0 = _mm256_loadu_ps(input); + const __m256 vec1 = _mm256_loadu_ps(input + 8); + input += 16; + __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res1); + + if (temp) + return true; + } + + // 8 single-precision check : ( X != X ) + for (; N - idx >= 8; idx += 8) { + const __m256 vec = _mm256_loadu_ps(input); + input += 8; + __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + + if (temp) + return true; + } + + // remain check : ( X != X ) + while (idx < N) { + if (*input != *input) { + return true; + } + ++input; + } + + return false; +} + } // namespace nntrainer::avx diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/blas_avx.h index ab1270a208..d25ded103f 100644 --- a/nntrainer/tensor/blas_avx.h +++ b/nntrainer/tensor/blas_avx.h @@ -20,6 +20,7 @@ namespace nntrainer::avx { +#ifdef ENABLE_FP16 /** * @brief Converts half-precision floating point values to single-precision * floating point values. 
@@ -40,6 +41,25 @@ void vcvt_f16_f32(size_t N, const void *input, float *output); */ void vcvt_f32_f16(size_t N, const float *input, void *output); +/** + * @brief check if the X has NaN value + * @note it compare !(x==x) + * @param[in] N length of the vector + * @param[in] X half-precision * for Vector X + * @param[out] true if it has NaN + */ +bool hasNaN(const size_t N, const _Float16 *X); +#endif + +/** + * @brief check if the X has NaN value + * @note it compare !(x==x) + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[out] true if it has NaN + */ +bool hasNaN(const size_t N, const float *X); + } // namespace nntrainer::avx #endif /* __cplusplus */ diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index 9be6fb9911..78cb708e53 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -1038,6 +1038,16 @@ static void ele_div_fallback(const unsigned int N, const float *X, } } +static bool has_nan_fallback(const size_t N, const float *X) { + for (size_t i = 0; i < N; ++i) { + if (*X != *X) + return true; + ++X; + } + + return false; +} + void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, float alpha, float beta, unsigned int i_stride, unsigned int o_stride) { @@ -1090,4 +1100,30 @@ void ele_div(const unsigned int N, const float *X, const float *Y, float *Z, ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); } +bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type, + const void *X) { + if (d_type == ml::train::TensorDim::DataType::FP16) { +#ifdef ENABLE_FP16 + const _FP16 *vec = (const _FP16 *)X; +#ifdef USE_NEON + return nntrainer::neon::hasNaN(N, vec); +#elif defined(USE_AVX) + return nntrainer::avx::hasNaN(N, vec); +#else + throw std::invalid_argument("Error: enable-fp16 is not enabled"); +#endif +#endif + } else if (d_type == ml::train::TensorDim::DataType::FP32) { + const float *vec = (const float *)X; +#ifdef USE_NEON + return nntrainer::neon::hasNaN(N, vec); +#elif defined(USE_AVX) + return nntrainer::avx::hasNaN(N, vec); +#endif + + return has_nan_fallback(N, vec); + } + return false; +} + } // namespace nntrainer diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h index 04a8a23018..bcd557111e 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/blas_interface.h @@ -478,6 +478,16 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, void ele_div(const unsigned N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, unsigned int o_stride = 1); + +/** + * @brief check if X array has NaN + * @param[in] N length of the vector + * @param[in] X float/fp16 * for Vector X + * @param[out] bool true if NaN else false + */ +bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type, + const void *X); + } /* namespace nntrainer */ #endif /* __cplusplus */ #endif /* __BLAS_INTERFACE_H__ */ diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index 6f02978e1f..5062bd7c35 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -546,6 +546,25 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, } } +bool hasNaN(const size_t N, const float *X) { + bool temp = false; + size_t i = 0; + for (; N - i >= 4; i += 4) { + float32x4_t vec = vld1q_f32(&X[i]); + uint32x4_t vcmp = vceqq_f32(vec, vec); + if (vaddvq_u32(vcmp)) + return true; + } + + 
while (i < N) { + if (X[i] != X[i]) + return true; + ++i; + } + + return temp; +} + #ifdef ENABLE_FP16 void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t M, uint32_t N, @@ -1972,5 +1991,25 @@ void inv_sqrt_inplace(const unsigned int N, __fp16 *X) { } } +bool hasNaN(const size_t N, const __fp16 *input) { + bool temp = 0; + size_t i = 0; + for (; N - i >= 8; i += 8) { + float16x8_t vec = vld1q_f16(&input[i]); + uint16x8_t vcmp = vceqq_f16(vec, vec); + + if (vaddvq_u16(vcmp)) + return true; + } + + while (i < N) { + if (input[i] != input[i]) + return true; + ++i; + } + + return temp; +} + #endif } // namespace nntrainer::neon diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h index db1b6a5ccc..6da5e952e1 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/blas_neon.h @@ -148,6 +148,15 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, void ele_div(const unsigned N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f); +/** + * @brief check if the X has NaN value + * @note it compare !(x==x) + * @param[in] N length of the vector + * @param[in] input float * for Vector X + * @param[out] true if it has NaN + */ +bool hasNaN(const size_t N, const float *input); + #ifdef ENABLE_FP16 /** * @brief hgemv computation with neon : Y = alpha*A*X + beta*Y @@ -380,6 +389,15 @@ void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, uint32_t M, * @param X __fp16 * for Vector X */ void inv_sqrt_inplace(const unsigned int N, __fp16 *X); + +/** + * @brief check if the X has NaN value + * @note it compare !(x==x) + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[out] true if it has NaN + */ +bool hasNaN(const size_t N, const __fp16 *X); #endif } // namespace nntrainer::neon diff --git a/nntrainer/tensor/meson.build b/nntrainer/tensor/meson.build index 0884dbd3b4..b14fa0ee85 100644 --- a/nntrainer/tensor/meson.build +++ b/nntrainer/tensor/meson.build @@ -44,6 +44,12 @@ cl_headers = [ arch = host_machine.cpu_family() + +if get_option('enable-avx') + tensor_sources += 'blas_avx.cpp' + tensor_headers += 'blas_avx.h' +endif + if get_option('enable-fp16') if arch == 'arm' error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. 
armv7 is not supported.') @@ -55,9 +61,6 @@ if get_option('enable-fp16') nntrainer_inc += include_directories('hgemm') nntrainer_inc_abs += meson.current_source_dir() / 'hgemm' endif - elif get_option('enable-avx') - tensor_sources += 'blas_avx.cpp' - tensor_headers += 'blas_avx.h' endif endif diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index b14bbd7ae4..f9db2e2ab0 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -3820,6 +3820,18 @@ void Tensor::dequantize(Tensor &output, unsigned int axis) const { return; } +bool Tensor::hasNaN() const { + if (getDataType() == Tdatatype::FP16) { +#ifdef ENABLE_FP16 + return has_nan(dim.getDataLen(), Tdatatype::FP16, getData<_FP16>()); +#else + throw std::invalid_argument("enble-fp16 is not set"); +#endif + } else { + return has_nan(dim.getDataLen(), Tdatatype::FP32, getData()); + } +} + // namespace nntrainer } /* namespace nntrainer */ diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h index 2ea0393e66..968ec4d502 100644 --- a/nntrainer/tensor/tensor.h +++ b/nntrainer/tensor/tensor.h @@ -2038,6 +2038,12 @@ class Tensor { static constexpr float epsilon = 1e-5; + /** + * @brief check if there is NaN element + * @param[out] bool true if there is NaN else false + */ + bool hasNaN() const; + private: /**< handle the data as a std::shared_ptr type */ TensorDim dim; diff --git a/packaging/nntrainer.spec b/packaging/nntrainer.spec index 36ba371d22..2f1dc57f68 100644 --- a/packaging/nntrainer.spec +++ b/packaging/nntrainer.spec @@ -65,6 +65,13 @@ %define neon_support -Denable-neon=false %endif # arch aarch64 +%ifarch x86_64 +%define enable_avx 1 +%define avx_support -Denable-avx=true +%else +%define avx_support -Denable-avx=false +%endif # arch aarch64 + Name: nntrainer Summary: Software framework for training neural networks @@ -410,7 +417,7 @@ meson --buildtype=plain --prefix=%{_prefix} --sysconfdir=%{_sysconfdir} \ %{enable_reduce_tolerance} %{configure_subplugin_install_path} %{enable_debug} \ -Dml-api-support=enabled -Denable-nnstreamer-tensor-filter=enabled \ -Denable-nnstreamer-tensor-trainer=enabled -Denable-capi=enabled \ - %{fp16_support} %{neon_support} build + %{fp16_support} %{neon_support} %{avx_support} build ninja -C build %{?_smp_mflags} @@ -563,6 +570,10 @@ cp -r result %{buildroot}%{_datadir}/nntrainer/unittest/ %{_includedir}/nntrainer/util_simd_neon.h %endif +%if 0%{?enable_avx} +%{_includedir}/nntrainer/blas_avx.h +%endif + %files devel-static %{_libdir}/libnntrainer*.a %exclude %{_libdir}/libcapi*.a diff --git a/test/unittest/models/meson.build b/test/unittest/models/meson.build index 4a6e81e65d..3f17369f94 100644 --- a/test/unittest/models/meson.build +++ b/test/unittest/models/meson.build @@ -1,4 +1,5 @@ test_name = 'unittest_models' +mixed_test_name = 'unittest_mixed_models' test_target = [] @@ -11,8 +12,28 @@ models_targets = [ # disable temperally ] +mixed_test_targets = [ + 'models_test_utils.cpp', + 'models_golden_test.cpp', + 'unittest_models_mixed_precision.cpp', +] + if get_option('enable-fp16') - models_targets += 'unittest_models_mixed_precision.cpp' + mixed_exe = executable( + mixed_test_name, + mixed_test_targets, + include_directories: include_directories('.'), + dependencies: [ + nntrainer_test_main_deps, nntrainer_ccapi_dep + ], + install: get_option('enable-test'), + install_dir: application_install_dir + ) + + test(mixed_test_name, mixed_exe, + args: '--gtest_output=xml:@0@/@1@.xml'.format(meson.build_root(), mixed_test_name), + timeout: test_timeout 
+ ) endif test_target += models_targets From 364d80affafca442ba72582aa0d12b0202c79783 Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Sat, 11 May 2024 14:00:04 +0900 Subject: [PATCH 07/14] [ Context ] Add loss scale in Context & using mse loss This PR add loss scale parameter in runcontext and use it to update mse loss. . Add Loss Scale Parameter in RunLayerContext Constructor . Add applyLossScale func to update return derivitive in Loss Layer . Change MSE Loss Layer to apply the loss scale to return derivitive **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/graph/network_graph.cpp | 6 ++-- nntrainer/layers/layer_context.cpp | 3 +- nntrainer/layers/layer_context.h | 36 +++++++++++++++++--- nntrainer/layers/layer_node.cpp | 9 ++--- nntrainer/layers/layer_node.h | 3 +- nntrainer/layers/loss/loss_layer.cpp | 9 ++++- nntrainer/layers/loss/loss_layer.h | 7 ++++ nntrainer/layers/loss/mse_loss_layer.cpp | 13 ++++++- nntrainer/layers/time_dist.cpp | 16 ++++----- nntrainer/models/model_common_properties.h | 2 +- nntrainer/tensor/weight.cpp | 2 +- test/unittest/layers/layers_golden_tests.cpp | 2 +- test/unittest/layers/unittest_layer_node.cpp | 4 +-- 13 files changed, 85 insertions(+), 27 deletions(-) diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index b7f4d1cffd..297cd3e881 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -880,7 +880,8 @@ NetworkGraph::finalizeContext(const std::shared_ptr &lnode, lnode->getTrainable(), shared_weight_names), inputs, outputs, tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(), - lnode->getTrainable(), shared_tensor_names)); + lnode->getTrainable(), shared_tensor_names), + init_context.getLossScale()); return outputs; } @@ -1028,7 +1029,8 @@ NetworkGraph::refinalizeContext(const std::shared_ptr &lnode, // TODO: update weights spec for trainable based on layer trainable prop weights, inputs, outputs, tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(), - lnode->getTrainable(), shared_tensor_names)); + lnode->getTrainable(), shared_tensor_names), + init_context.getLossScale()); return outputs; } diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp index f0856c1dbb..fbbc9ecaff 100644 --- a/nntrainer/layers/layer_context.cpp +++ b/nntrainer/layers/layer_context.cpp @@ -126,13 +126,14 @@ const std::vector &InitLayerContext::getOutSpecs() const { } RunLayerContext::RunLayerContext(const std::string &name, bool trainable, - float l, bool in_place_, + float l, bool in_place_, float loss_scale_, const std::vector &w, const std::vector &in, const std::vector &out, const std::vector &t) : loss(l), in_place(in_place_), + loss_scale(loss_scale_), weights(w), inputs(in), outputs(out), diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h index e2f428aa2c..09bccc2c73 100644 --- a/nntrainer/layers/layer_context.h +++ b/nntrainer/layers/layer_context.h @@ -63,7 +63,7 @@ class InitLayerContext { const float max_norm = 0.0, std::array tensor_type_ = {"NCHW", "FP32", "FP32"}, - const float loss_scale = 0.0); + const float loss_scale = 1.0); /** * @brief get Tensor Format of Layer * @@ -348,6 +348,14 @@ class InitLayerContext { */ bool executeInPlace() const { return in_place; } + /** + * @brief get Initial value of Loss_Scale. 
This is set to RunLayerContext + * and updated + * + * @return loss_scale + */ + float getLossScale() const { return loss_scale; } + private: std::vector input_dim; /**< Input dimensions for the layer */ bool in_place; /**< if the layer is expected to run in-place */ @@ -385,7 +393,7 @@ class RunLayerContext { * @brief Construct a new Run Layer Context object * */ - RunLayerContext() : loss(0.0), in_place(false) {} + RunLayerContext() : loss(0.0), in_place(false), loss_scale(1.0) {} /** * @brief Construct a new Run Layer Context object @@ -396,6 +404,17 @@ class RunLayerContext { std::get(props).set(name); } + /** + * @brief Construct a new Run Layer Context object + * + */ + RunLayerContext(const std::string &name, bool in_place_, float loss_scale_) : + RunLayerContext() { + in_place = in_place_; + std::get(props).set(name); + loss_scale = loss_scale_; + } + /** * @brief Construct a new Run Layer Context object * @@ -403,13 +422,15 @@ class RunLayerContext { * @param trainable if the layer is trainable * @param l loss of the layer * @param in_place_ execution in-place of the layer + * @param loss_scale loss_scale of the layer * @param w weights of the layer * @param in inputs of the layer * @param out outputs of the layer * @param t extra tensors of the layer */ RunLayerContext(const std::string &name, bool trainable, float l, - bool in_place_, const std::vector &w, + bool in_place_, float loss_scale_, + const std::vector &w, const std::vector &in, const std::vector &out, const std::vector &t); @@ -883,10 +904,17 @@ class RunLayerContext { */ ml::train::LayerComputeEngine getComputeEngine() { return compute_engine; } + /** + * @brief get loss scale + * @return loss scale + */ + float getLossScale() { return loss_scale; } + private: std::tuple props; /**< props of the layer */ float loss; /**< loss of the layer */ - bool in_place; /**< if the layer is expected to run in-place */ + bool in_place; /**< if the layer is expected to run in-place */ + float loss_scale; /**< loss_scale of the layer */ std::vector weights; /**< weights of the layer */ std::vector inputs; /**< inputs of the layer */ diff --git a/nntrainer/layers/layer_node.cpp b/nntrainer/layers/layer_node.cpp index 8b18d80762..f41752a4d8 100644 --- a/nntrainer/layers/layer_node.cpp +++ b/nntrainer/layers/layer_node.cpp @@ -599,7 +599,7 @@ InitLayerContext LayerNode::finalize(const std::vector &input_dims, const auto &scope = getSharedFrom().empty() ? 
getName() : getSharedFrom(); float max_norm = 0.0; - float loss_scale = 0.0; + float loss_scale = 1.0; if (!std::get(*layer_node_props).empty()) max_norm = std::get(*layer_node_props).get(); @@ -864,10 +864,11 @@ float LayerNode::getLoss() const { return *loss; } void LayerNode::configureRunContext(const std::vector &weights, const std::vector &inputs, const std::vector &outputs, - const std::vector &tensors) { + const std::vector &tensors, + float loss_scale) { run_context = std::make_unique( - getName(), getTrainable(), 0.0f, executeInPlace() != InPlace::NONE, weights, - inputs, outputs, tensors); + getName(), getTrainable(), 0.0f, executeInPlace() != InPlace::NONE, + loss_scale, weights, inputs, outputs, tensors); } /** diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h index 7dfb1bd1a0..3fd2d55b97 100644 --- a/nntrainer/layers/layer_node.h +++ b/nntrainer/layers/layer_node.h @@ -820,7 +820,8 @@ class LayerNode final : public ml::train::Layer, public GraphNode { void configureRunContext(const std::vector &weights, const std::vector &inputs, const std::vector &outputs, - const std::vector &tensors); + const std::vector &tensors, + float loss_scale); /** * @brief Preset modes for printing summary for the layer diff --git a/nntrainer/layers/loss/loss_layer.cpp b/nntrainer/layers/loss/loss_layer.cpp index 40f74717f8..ab2ccf8be2 100644 --- a/nntrainer/layers/loss/loss_layer.cpp +++ b/nntrainer/layers/loss/loss_layer.cpp @@ -22,7 +22,7 @@ void LossLayer::finalize(InitLayerContext &context) { d.setDataType( str_converter::from_string("FP32")); - + context.setOutputDimensions(output_dim); } @@ -36,6 +36,13 @@ void LossLayer::updateLoss(RunLayerContext &context, const Tensor &l) { context.setLoss(loss_sum / (float)l.batch()); } +void LossLayer::applyLossScale(RunLayerContext &context, Tensor &ret_deriv) { + + float loss_scale = context.getLossScale(); + if (loss_scale != 1.0) + ret_deriv.multiply_i(loss_scale); +} + /** * @copydoc Layer::setProperty(const std::vector &values) */ diff --git a/nntrainer/layers/loss/loss_layer.h b/nntrainer/layers/loss/loss_layer.h index 00b520f6e6..581e9477a8 100644 --- a/nntrainer/layers/loss/loss_layer.h +++ b/nntrainer/layers/loss/loss_layer.h @@ -60,6 +60,13 @@ class LossLayer : public Layer { */ void updateLoss(RunLayerContext &context, const Tensor &l); + /** + * @brief update return derivative with loss scale + * @param context Run context to update + * @param return_dev Tensor data to calculate + */ + void applyLossScale(RunLayerContext &context, Tensor &l); + Tensor l; /**< loss tensor to store intermediate value to calculate loss value */ }; diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp index ec9bc9b844..3aed8125e0 100644 --- a/nntrainer/layers/loss/mse_loss_layer.cpp +++ b/nntrainer/layers/loss/mse_loss_layer.cpp @@ -50,8 +50,17 @@ void MSELossLayer::forwarding(RunLayerContext &context, bool training) { } void MSELossLayer::calcDerivative(RunLayerContext &context) { + Tensor empty_tensor; Tensor &ret_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX); - const Tensor &y2 = context.getIncomingDerivative(SINGLE_INOUT_IDX); + const Tensor &y2_ = context.getIncomingDerivative(SINGLE_INOUT_IDX); + Tensor &y2 = empty_tensor; + + if (ret_derivative.getDataType() == ml::train::TensorDim::DataType::FP32) + y2 = y2_; + + if (y2.empty()) + y2 = y2_.clone(ret_derivative.getDataType()); + Tensor &y = context.getInput(SINGLE_INOUT_IDX); y.subtract(y2, ret_derivative); @@ -60,6 +69,8 
@@ void MSELossLayer::calcDerivative(RunLayerContext &context) { throw std::runtime_error( "[MSELossLayer::calcDerivative] Error when calculating loss"); } + + LossLayer::applyLossScale(context, ret_derivative); } } // namespace nntrainer diff --git a/nntrainer/layers/time_dist.cpp b/nntrainer/layers/time_dist.cpp index 80451416df..779010065a 100644 --- a/nntrainer/layers/time_dist.cpp +++ b/nntrainer/layers/time_dist.cpp @@ -256,8 +256,8 @@ void TimeDistLayer::forwarding(RunLayerContext &context, bool training) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->forwarding(dist_context, training); } @@ -303,8 +303,8 @@ void TimeDistLayer::calcDerivative(RunLayerContext &context) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->calcDerivative(dist_context); } @@ -354,8 +354,8 @@ void TimeDistLayer::calcGradient(RunLayerContext &context) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->calcGradient(dist_context); } @@ -396,8 +396,8 @@ void TimeDistLayer::setBatch(RunLayerContext &context, unsigned int batch) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->setBatch(dist_context, batch); diff --git a/nntrainer/models/model_common_properties.h b/nntrainer/models/model_common_properties.h index 3776afefca..3435d18e96 100644 --- a/nntrainer/models/model_common_properties.h +++ b/nntrainer/models/model_common_properties.h @@ -217,7 +217,7 @@ class ModelTensorDataType final : public EnumProperty { */ class LossScale : public Property { public: - LossScale(float value = 0.0f); + LossScale(float value = 1.0f); static constexpr const char *key = "loss_scale"; /**< unique key to access */ using prop_tag = float_prop_tag; /**< property type */ }; diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp index d8db5ba094..df262f50d9 100644 --- a/nntrainer/tensor/weight.cpp +++ b/nntrainer/tensor/weight.cpp @@ -99,7 +99,7 @@ Weight::Weight(const Tensor &v, const Tensor &g, const Tensor &v32, decay(0.0f), clip_by_global_norm(0.0f), output_axis(output_axis_), - loss_scale(0.0), + loss_scale(1.0), var32(std::make_shared(n + ":fp32")) { if (!g.empty() && isMixedPrecision()) { diff --git a/test/unittest/layers/layers_golden_tests.cpp b/test/unittest/layers/layers_golden_tests.cpp index 64400e6ecd..c71d653c05 100644 --- a/test/unittest/layers/layers_golden_tests.cpp +++ b/test/unittest/layers/layers_golden_tests.cpp @@ -156,7 +156,7 @@ static RunLayerContext prepareRunContext(const TensorPacks &packs) { }; auto rc = - RunLayerContext("golden", true, 0.0f, false, create_view(weights), + 
RunLayerContext("golden", true, 0.0f, false, 1.0, create_view(weights), create_view(ins), create_view(outs), create_view(tensors)); auto num_outputs = rc.getNumOutputs(); diff --git a/test/unittest/layers/unittest_layer_node.cpp b/test/unittest/layers/unittest_layer_node.cpp index 3b41f02f30..37287f7ce5 100644 --- a/test/unittest/layers/unittest_layer_node.cpp +++ b/test/unittest/layers/unittest_layer_node.cpp @@ -131,7 +131,7 @@ TEST(nntrainer_LayerNode, finalize_05_n) { nntrainer::createLayerNode(nntrainer::IdentityLayer::type)); EXPECT_NO_THROW(lnode->setProperty({"input_shape=1:1:1", "name=abc"})); EXPECT_NO_THROW(lnode->finalize()); - EXPECT_NO_THROW(lnode->configureRunContext({}, {&input}, {}, {})); + EXPECT_NO_THROW(lnode->configureRunContext({}, {&input}, {}, {}, 1.0)); EXPECT_THROW(lnode->finalize(), std::runtime_error); } @@ -298,7 +298,7 @@ TEST(nntrainer_LayerNode, setWeights_02_n) { EXPECT_NO_THROW(lnode = nntrainer::createLayerNode(nntrainer::IdentityLayer::type)); EXPECT_NO_THROW(lnode->setProperty({"input_shape=1:1:1", "name=abc"})); - EXPECT_NO_THROW(lnode->configureRunContext({&weight}, {&input}, {}, {})); + EXPECT_NO_THROW(lnode->configureRunContext({&weight}, {&input}, {}, {}, 1.0)); EXPECT_THROW(lnode->setWeights(new_weights), std::runtime_error); } From 77741a2341e0b7e0dbfcd8c8fd9e6d8db09c7df7 Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Mon, 13 May 2024 17:09:00 +0900 Subject: [PATCH 08/14] [ Mixed Precision ] Enable Mixed Precision This PR enables the Mixed Precision Training. For now only FP16-FP32 is considered. Additional Test cases will be added. . add getSortedLayerIdx to set the graph order for fowarding. . change clip_weights to lazy_apply_weights to use both cases. . add fowarding_op to run forwarding from that layer which has a gradient with nan. . add while loop for re-run backwarding after reset the loss scale. . add setLossScale in RunLayerContext . add check the gradient if mixed precsion enable. **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. 
Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/graph/graph_core.cpp | 9 +++ nntrainer/graph/graph_core.h | 8 ++ nntrainer/graph/network_graph.cpp | 113 ++++++++++++++++++++++------- nntrainer/graph/network_graph.h | 28 +++++-- nntrainer/layers/layer_context.cpp | 11 +++ nntrainer/layers/layer_context.h | 26 +++++++ nntrainer/models/neuralnet.cpp | 43 +++++++++-- nntrainer/tensor/blas_avx.cpp | 4 +- nntrainer/tensor/weight.cpp | 2 +- nntrainer/tensor/weight.h | 7 ++ 10 files changed, 207 insertions(+), 44 deletions(-) diff --git a/nntrainer/graph/graph_core.cpp b/nntrainer/graph/graph_core.cpp index b624e066e4..3eafbb9261 100644 --- a/nntrainer/graph/graph_core.cpp +++ b/nntrainer/graph/graph_core.cpp @@ -35,6 +35,10 @@ GraphCore::getSortedNode(unsigned int ith) const { return Sorted.at(ith); } +const unsigned int GraphCore::getSortedNodeIdx(const std::string &name) const { + return sorted_node_map.at(name); +} + void GraphCore::makeAdjacencyList( std::vector>> &adj) { /** initialize the adj list */ @@ -93,6 +97,11 @@ void GraphCore::topologicalSort() { if (Sorted.size() != node_list.size()) throw std::runtime_error("Internal error in topologicalSort"); + unsigned int idx = 0; + for (auto n : Sorted) { + sorted_node_map[n->getName()] = idx; + idx++; + } } const std::shared_ptr & diff --git a/nntrainer/graph/graph_core.h b/nntrainer/graph/graph_core.h index 83d3ce7c39..77aa63666a 100644 --- a/nntrainer/graph/graph_core.h +++ b/nntrainer/graph/graph_core.h @@ -91,6 +91,13 @@ class GraphCore { */ const std::shared_ptr &getSortedNode(unsigned int ith) const; + /** + * @brief getter of Sorted GraphNode index with name + * @param[in] layer name + * @ret index + */ + const unsigned int getSortedNodeIdx(const std::string &name) const; + /** * @brief getter of GraphNode with node name * @param[in] node name @@ -252,6 +259,7 @@ class GraphCore { std::vector> node_list; /**< Unordered Node List */ std::unordered_map node_map; /**< Unordered Node map */ + std::unordered_map sorted_node_map; /**< Unordered Node map */ std::vector> Sorted; /**< Ordered Node List */ bool sorted; /** if the node_list is sorted */ diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index 297cd3e881..ac703e490b 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -337,7 +337,7 @@ void NetworkGraph::applyGradients( continue; } - if (rc.isGradientClipByGlobalNorm(i)) { + if (rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) { /** * @note the weights whose gradient are to be clipped by global norm will * be clipped at once at the end of iteration and applied then. 
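The backwarding() rework below implements dynamic loss scaling: when any gradient comes back NaN, the scale is reduced, forwarding is re-run from the failing layer, and the whole backward pass is retried (the while loop added in neuralnet.cpp); after a run of clean iterations the scale is nudged back up. A rough, self-contained sketch of that policy (the type name, starting value, and retry threshold here are illustrative stand-ins, not the actual NetworkGraph members):

struct LossScaleState {
  float scale = 65536.0f;       // illustrative starting scale
  unsigned int clean_steps = 0; // iterations since the last overflow

  // a gradient overflowed: shrink the scale; the caller redoes the step
  void onOverflow() {
    scale = scale > 1.5f ? scale - 0.5f : 1.0f;
    clean_steps = 0;
  }

  // the step applied cleanly: after enough clean steps, grow the scale again
  void onCleanStep() {
    if (++clean_steps > 10) {
      scale += 2.0f;
      clean_steps = 0;
    }
  }
};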
@@ -393,56 +393,100 @@ sharedConstTensors NetworkGraph::incremental_forwarding( return out; } -void NetworkGraph::backwarding( +bool NetworkGraph::backwarding( int iteration, - std::function, int)> &backwarding_op, - std::function &apply_grad_clip_op, - std::function stop_cb, void *userdata) const { + std::function, bool)> &forwarding_op, + std::function, int)> &backwarding_op, + std::function &lazy_apply_grad_op, + std::function stop_cb, void *userdata) { /** * last layer backwarding is run out of this loop */ auto iter_begin = getBackwardingBeginIter(); auto iter_end = getBackwardingEndIter(); + bool has_nan = false; /// there is no layer to train, so backwarding is essentially noop if (iter_begin == iter_end) { - return; + return true; } auto const &lptr_begin = (*iter_begin); + // graph_const_reverse_iterator + auto iter_ = iter_begin; if (lptr_begin->requireLabel() == false) throw std::runtime_error( "Error: last layer does not accept label, we can't train"); - for (auto iter = iter_begin; iter != iter_end && !stop_cb(userdata); iter++) { - auto &ln = *iter; + for (iter_ = iter_begin; iter_ != iter_end && !stop_cb(userdata); iter_++) { + auto &ln = *iter_; PROFILE_TIME_START(profile_keys.at(ln->getType())); - backwarding_op(ln, iteration); + has_nan = backwarding_op(ln, iteration); PROFILE_TIME_END(profile_keys.at(ln->getType())); + + if (has_nan) { + std::cout << "Gradient has NaN" << std::endl; + break; + } } - /** perform clipping of the gradients by global norm if any */ - if (clip_weights.empty()) - return; + if (has_nan) { + /** if has NaN + * 1. reset the loss scale. + * 2. run forwarding from cur_iter to cend() && !stop_cb(userdata); + * 3. return false --> run backwarding again; + */ + float scale = (*iter_)->getRunContext().getLossScale(); + float s = scale > 1.5f ? 
scale - 0.5f : 1.0f; + + resetLossScale(s); - /** calculate the global norm */ - Tensor global_norm_t( - TensorDim({1u, 1u, 1u, (unsigned int)clip_weights.size()})); - float *global_norm_data = global_norm_t.getData(); - for (unsigned int idx = 0; idx < clip_weights.size(); idx++) { - auto const &w = clip_weights[idx]; - global_norm_data[idx] = w->getGradientNorm(); + auto f_iter = cbegin() + graph.getSortedNodeIdx((*iter_)->getName()); + + for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) { + auto &ln = *iter; + PROFILE_TIME_START(profile_keys.at(ln->getType())); + forwarding_op(*iter, true); + PROFILE_TIME_END(profile_keys.at(ln->getType())); + } + + return false; } - float global_norm = global_norm_t.l2norm(); - /** apply the gradient with the above global norm */ - for (auto w : clip_weights) { - w->clipGradientByGlobalNorm(global_norm); + + /** perform clipping of the gradients by global norm if any */ + if (lazy_weights.empty()) + return true; + + if (is_clip_grad) { + /** calculate the global norm */ + Tensor global_norm_t( + TensorDim({1u, 1u, 1u, (unsigned int)lazy_weights.size()})); + float *global_norm_data = global_norm_t.getData(); + for (unsigned int idx = 0; idx < lazy_weights.size(); idx++) { + auto const &w = lazy_weights[idx]; + global_norm_data[idx] = w->getGradientNorm(); + } + float global_norm = global_norm_t.l2norm(); + /** apply the gradient with the above global norm */ + for (auto w : lazy_weights) { + w->clipGradientByGlobalNorm(global_norm); + } } /** apply the gradient with the above global norm */ - for (auto w : clip_weights) { - apply_grad_clip_op(*w, iteration); + for (auto w : lazy_weights) { + lazy_apply_grad_op(*w, iteration); + } + nan_count++; + + if (nan_count > 10) { + float scale = (*iter_)->getRunContext().getLossScale(); + float s = scale + 2.0f; + resetLossScale(s); + nan_count = 0; } + + return true; } LayerNode *NetworkGraph::computeBackwardEnd() { @@ -1290,11 +1334,19 @@ int NetworkGraph::initialize(ExecutionMode mode, /** select weights which would require clipping of the gradients by global * norm if any */ - clip_weights = tensor_manager->getWeights([](const Weight *w) { + lazy_weights = tensor_manager->getWeights([](const Weight *w) { return w->hasGradient() && w->isGradientLastAccess() && - w->isGradientClipByGlobalNorm(); + (w->isGradientClipByGlobalNorm() || w->isMixedPrecision()); }); + is_clip_grad = false; + for (auto w : lazy_weights) { + if (w->isGradientClipByGlobalNorm()) { + is_clip_grad = true; + break; + } + } + return ML_ERROR_NONE; } @@ -1566,4 +1618,11 @@ void NetworkGraph::requestOptimizerVariable( } } +void NetworkGraph::resetLossScale(float scale) { + for (auto iter = cbegin(); iter != cend(); iter++) { + auto &ln = *iter; + ln->getRunContext().setLossScale(scale); + } +} + } /* namespace nntrainer */ diff --git a/nntrainer/graph/network_graph.h b/nntrainer/graph/network_graph.h index 5c9adf0363..22f14e1b73 100644 --- a/nntrainer/graph/network_graph.h +++ b/nntrainer/graph/network_graph.h @@ -51,7 +51,9 @@ class NetworkGraph { optimize_memory(true), exec_mode(ExecutionMode::TRAIN), tensor_format("NCHW"), - tensor_dtype(split("FP32-FP32", getRegex("\\-"))) {} + tensor_dtype(split("FP32-FP32", getRegex("\\-"))) { + nan_count = 0; + } /** * @brief Constructor of NeuralNetwork Graph Class @@ -73,7 +75,9 @@ class NetworkGraph { optimize_memory(true), exec_mode(ExecutionMode::TRAIN), tensor_format(tensor_format_), - tensor_dtype(split(tensor_dtype_, getRegex("\\-"))) {} + tensor_dtype(split(tensor_dtype_, 
getRegex("\\-"))) { + nan_count = 0; + } /** * @brief Destructor of the NeuralNetwork Graph class @@ -206,13 +210,14 @@ class NetworkGraph { * @param[in] backwarding_op operation for the backwarding * @param[in] apply_grad_clip_op operation for applying the clip gradients */ - void backwarding( + bool backwarding( int iteration, - std::function, int)> &backwarding_op, - std::function &apply_grad_clip_op, + std::function, bool)> &forwarding_op, + std::function, int)> &backwarding_op, + std::function &lazy_apply_grad_op, std::function stop_cb = [](void *user_data) { return false; }, - void *user_data = nullptr) const; + void *user_data = nullptr); /** * @brief get begin iterator for the graph @@ -444,6 +449,12 @@ class NetworkGraph { getLayerExecutionOrders(const std::shared_ptr &lnode); #endif // ENABLE_TEST + /** + * @brief reset the loss scale + * @param[in] scale + */ + void resetLossScale(float scale); + private: std::map sub_in_out; /** This is map to identify input and output layer name of subgraph */ @@ -480,7 +491,10 @@ class NetworkGraph { std::unordered_map profile_keys; /**< profile keys based on the layer type */ std::vector - clip_weights; /**< weights with global norm based clipping enabled */ + lazy_weights; /**< weights with global norm based clipping enabled */ + bool is_clip_grad; + + unsigned int nan_count; /** * @brief topological sort diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp index fbbc9ecaff..5862e6af14 100644 --- a/nntrainer/layers/layer_context.cpp +++ b/nntrainer/layers/layer_context.cpp @@ -416,6 +416,17 @@ bool RunLayerContext::isGradientClipByGlobalNorm(unsigned int idx) const { return weights[idx]->isGradientClipByGlobalNorm(); } +bool RunLayerContext::isMixedPrecision(unsigned int idx) const { + return weights[idx]->isMixedPrecision(); +} + +bool RunLayerContext::isMixedPrecision() const { + for (auto w : weights) + if (w->isMixedPrecision()) + return true; + return false; +} + /** * @brief Get the tensor name * diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h index 09bccc2c73..c68c42f11d 100644 --- a/nntrainer/layers/layer_context.h +++ b/nntrainer/layers/layer_context.h @@ -689,6 +689,20 @@ class RunLayerContext { */ bool isGradientClipByGlobalNorm(unsigned int idx) const; + /** + * @brief check if the weight is mixed precsion + * + * @param idx index + * @return bool true if it is mixed precision + */ + bool isMixedPrecision(unsigned int idx) const; + + /** + * @brief check if the weight is mixed precsion + * @return bool true if it is mixed precision + */ + bool isMixedPrecision() const; + /** * @brief Get the tensor name * @@ -910,6 +924,18 @@ class RunLayerContext { */ float getLossScale() { return loss_scale; } + /** + * @brief set Loss_Scale. 
+ * + * @return loss_scale + */ + void setLossScale(float scale) { + loss_scale = scale; + for (auto w : weights) { + w->setLossScale(scale); + } + } + private: std::tuple props; /**< props of the layer */ float loss; /**< loss of the layer */ diff --git a/nntrainer/models/neuralnet.cpp b/nntrainer/models/neuralnet.cpp index d0e542825f..f7c0914d32 100644 --- a/nntrainer/models/neuralnet.cpp +++ b/nntrainer/models/neuralnet.cpp @@ -412,9 +412,21 @@ void NeuralNetwork::backwarding(int iteration, NNTR_THROW_IF(!opt, std::invalid_argument) << "optimizer is null!"; #endif - std::function, int)> backwarding_op = + std::function, bool)> forwarding_op = + [this, stop_cb, userdata](std::shared_ptr node, + bool training) -> void { + (void)this; + PROFILE_MEM_ANNOTATE("Forwarding for layer: " + node->getName()); + + auto f = std::get<0>(node->getExecutionOrder()); + model_graph.flushCacheExcept(f); + + node->forwarding(training); + }; + + std::function, int)> backwarding_op = [this, stop_cb, userdata](std::shared_ptr node, - int iteration) -> void { + int iteration) -> bool { /** * Do not change this order: * 1. calcGradient @@ -448,19 +460,29 @@ void NeuralNetwork::backwarding(int iteration, /** If gradient must be applied and its not gradient mode, calculate * gradient */ - if (!dynamic_training_opt.isGradientMode() && apply_gradient) + if (!dynamic_training_opt.isGradientMode() && apply_gradient) { node->calcGradient(); + + RunLayerContext &rc = node->getRunContext(); + if (rc.isMixedPrecision()) { + for (auto w : rc.getWeights()) { + if (w->getGradientRef().hasNaN()) + return true; + } + } + } } model_graph.flushCacheExcept(std::get<2>(node->getExecutionOrder())); PROFILE_MEM_ANNOTATE("CalcDerivative: " + node->getName()); if (stop_cb(userdata)) { - return; + return false; } - if (node->needsCalcDerivative()) + if (node->needsCalcDerivative()) { node->calcDerivative(); + } model_graph.flushCacheExcept(std::get<3>(node->getExecutionOrder())); PROFILE_MEM_ANNOTATE("ApplyGradient: " + node->getName()); @@ -476,9 +498,10 @@ void NeuralNetwork::backwarding(int iteration, opt_->applyGradient(opt_context); }); } + return false; }; - std::function apply_grad_clip_op = + std::function lazy_apply_grad_op = [opt_ = opt.get()](Weight &w, int iteration) -> void { w.calcRegularizationGradient(); w.calcWeightDecayGradient(); @@ -487,8 +510,12 @@ void NeuralNetwork::backwarding(int iteration, opt_->applyGradient(opt_context); }; - model_graph.backwarding(iteration, backwarding_op, apply_grad_clip_op, - stop_cb, userdata); + bool ret = false; + + while (!ret) { + ret = model_graph.backwarding(iteration, forwarding_op, backwarding_op, + lazy_apply_grad_op, stop_cb, userdata); + } } void NeuralNetwork::save(const std::string &file_path, diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp index 2fd4908463..ce4d8de47f 100644 --- a/nntrainer/tensor/blas_avx.cpp +++ b/nntrainer/tensor/blas_avx.cpp @@ -127,7 +127,7 @@ bool hasNaN(const size_t N, const _Float16 *input) { const __m256 vec0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); const __m256 vec1 = - _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input + 8)); + _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(input + 8))); input += 16; @@ -161,6 +161,7 @@ bool hasNaN(const size_t N, const _Float16 *input) { return true; } ++input; + ++idx; } return false; @@ -205,6 +206,7 @@ bool hasNaN(const size_t N, const float *input) { return true; } ++input; + ++idx; } return false; diff --git a/nntrainer/tensor/weight.cpp 
b/nntrainer/tensor/weight.cpp index df262f50d9..0e9879540a 100644 --- a/nntrainer/tensor/weight.cpp +++ b/nntrainer/tensor/weight.cpp @@ -153,7 +153,7 @@ void Weight::quantizeWeight() { // NYI break; case ml::train::TensorDim::DataType::FP16: - getVariableRef().copy(getVariableFP32Ref()); + getVariableRef().copyData(getVariableFP32Ref()); break; case ml::train::TensorDim::DataType::FP32: break; diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h index 5382c686e1..8ac3aa0190 100644 --- a/nntrainer/tensor/weight.h +++ b/nntrainer/tensor/weight.h @@ -349,6 +349,13 @@ class Weight : public Var_Grad { */ void quantizeWeight(); + /** + * @brief set loss scale + * param[in] scale + * + */ + void setLossScale(float scale) { loss_scale = scale; }; + private: static constexpr float epsilon = 1e-6; /**< epsilon for zero comparison */ static constexpr float epsilon_decay = From a19b7df205b4b8845ca48aacd7c9541b9c41f2aa Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Tue, 14 May 2024 09:57:38 +0900 Subject: [PATCH 09/14] [ Tensor ] Add inifinity check in Tensor This PR add inifinity value check in Tensor data. . rename the hasNaN to isValid . add infinity check in isValid Function and now it check NaN and Inf . modify to check the blas_avx and blas_neon . modify graph and model check is_valid rather than has_nan . add unittest of isValid Function **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/graph/network_graph.cpp | 8 +- nntrainer/models/neuralnet.cpp | 9 +- nntrainer/tensor/blas_avx.cpp | 104 ++++++++++++++---- nntrainer/tensor/blas_avx.h | 12 +- nntrainer/tensor/blas_interface.cpp | 22 ++-- nntrainer/tensor/blas_interface.h | 8 +- nntrainer/tensor/blas_neon.cpp | 52 ++++++--- nntrainer/tensor/blas_neon.h | 16 +-- nntrainer/tensor/tensor.cpp | 6 +- nntrainer/tensor/tensor.h | 6 +- test/unittest/unittest_nntrainer_tensor.cpp | 24 ++++ .../unittest_nntrainer_tensor_fp16.cpp | 28 +++++ .../unittest_nntrainer_tensor_neon_fp16.cpp | 32 ++++++ 13 files changed, 250 insertions(+), 77 deletions(-) diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index ac703e490b..821731e949 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -404,7 +404,7 @@ bool NetworkGraph::backwarding( */ auto iter_begin = getBackwardingBeginIter(); auto iter_end = getBackwardingEndIter(); - bool has_nan = false; + bool is_valid = true; /// there is no layer to train, so backwarding is essentially noop if (iter_begin == iter_end) { @@ -422,16 +422,16 @@ bool NetworkGraph::backwarding( for (iter_ = iter_begin; iter_ != iter_end && !stop_cb(userdata); iter_++) { auto &ln = *iter_; PROFILE_TIME_START(profile_keys.at(ln->getType())); - has_nan = backwarding_op(ln, iteration); + is_valid = backwarding_op(ln, iteration); PROFILE_TIME_END(profile_keys.at(ln->getType())); - if (has_nan) { + if (!is_valid) { std::cout << "Gradient has NaN" << std::endl; break; } } - if (has_nan) { + if (!is_valid) { /** if has NaN * 1. reset the loss scale. * 2. 
run forwarding from cur_iter to cend() && !stop_cb(userdata); diff --git a/nntrainer/models/neuralnet.cpp b/nntrainer/models/neuralnet.cpp index f7c0914d32..afc560603e 100644 --- a/nntrainer/models/neuralnet.cpp +++ b/nntrainer/models/neuralnet.cpp @@ -466,8 +466,8 @@ void NeuralNetwork::backwarding(int iteration, RunLayerContext &rc = node->getRunContext(); if (rc.isMixedPrecision()) { for (auto w : rc.getWeights()) { - if (w->getGradientRef().hasNaN()) - return true; + if (!w->getGradientRef().isValid()) + return false; } } } @@ -477,7 +477,7 @@ void NeuralNetwork::backwarding(int iteration, PROFILE_MEM_ANNOTATE("CalcDerivative: " + node->getName()); if (stop_cb(userdata)) { - return false; + return true; } if (node->needsCalcDerivative()) { @@ -498,7 +498,7 @@ void NeuralNetwork::backwarding(int iteration, opt_->applyGradient(opt_context); }); } - return false; + return true; }; std::function lazy_apply_grad_op = @@ -510,6 +510,7 @@ void NeuralNetwork::backwarding(int iteration, opt_->applyGradient(opt_context); }; + // return false if the gradient is not valid bool ret = false; while (!ret) { diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp index ce4d8de47f..411dbcbb5d 100644 --- a/nntrainer/tensor/blas_avx.cpp +++ b/nntrainer/tensor/blas_avx.cpp @@ -115,101 +115,163 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) { } } -bool hasNaN(const size_t N, const _Float16 *input) { +bool isValid(const size_t N, const _Float16 *input) { assert(N != 0); assert(input != NULL); int temp = 0; size_t idx = 0; + const __m256 SIGN_MASK = _mm256_set1_ps(-0.0); + const __m256 INF = _mm256_set1_ps(std::numeric_limits::infinity()); + // 16 single-precision check : ( X != X ) for (; N - idx >= 16; idx += 16) { - const __m256 vec0 = - _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); - const __m256 vec1 = + __m256 vec0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + __m256 vec1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(input + 8))); input += 16; + // check NaN in vec0 __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res); + if (temp) + return false; + + // check infinity in vec0 + vec0 = _mm256_andnot_ps(SIGN_MASK, vec0); + vec0 = _mm256_cmp_ps(vec0, INF, _CMP_EQ_OQ); + temp = temp | _mm256_movemask_ps(vec0); if (temp) - return true; + return false; + // check NaN in vec1 __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res1); if (temp) - return true; + return false; + + // check infinity in vec1 + vec1 = _mm256_andnot_ps(SIGN_MASK, vec1); + vec1 = _mm256_cmp_ps(vec1, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec1); + + if (temp) + return false; } // 8 single-precision check : ( X != X ) for (; N - idx >= 8; idx += 8) { - const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); input += 8; __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res); if (temp) - return true; + return false; + + // check infinity in vec1 + vec = _mm256_andnot_ps(SIGN_MASK, vec); + vec = _mm256_cmp_ps(vec, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec); + + if (temp) + return false; } - // remain check : ( X != X ) + // remain check : ( X != X || X == Inf ) while (idx < N) { - if (*input != *input) { - return true; + if (*input != *input || *input == std::numeric_limits::infinity()) { + return false; } ++input; ++idx; } - return false; 
+ return true; } #endif -bool hasNaN(const size_t N, const float *input) { +bool isValid(const size_t N, const float *input) { assert(N != 0); assert(input != NULL); int temp = 0; size_t idx = 0; + const __m256 SIGN_MASK = _mm256_set1_ps(-0.0); + const __m256 INF = _mm256_set1_ps(std::numeric_limits::infinity()); + // 16 single-precision check : ( X != X ) for (; N - idx >= 16; idx += 16) { - const __m256 vec0 = _mm256_loadu_ps(input); - const __m256 vec1 = _mm256_loadu_ps(input + 8); + __m256 vec0 = _mm256_loadu_ps(input); + __m256 vec1 = _mm256_loadu_ps(input + 8); input += 16; __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res); + + if (temp) + return false; + + // check infinity in vec0 + vec0 = _mm256_andnot_ps(SIGN_MASK, vec0); + vec0 = _mm256_cmp_ps(vec0, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec0); + if (temp) + return false; + __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res1); if (temp) - return true; + return false; + + // check infinity in vec1 + vec1 = _mm256_andnot_ps(SIGN_MASK, vec1); + vec1 = _mm256_cmp_ps(vec1, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec1); + + if (temp) + return false; } // 8 single-precision check : ( X != X ) for (; N - idx >= 8; idx += 8) { - const __m256 vec = _mm256_loadu_ps(input); + __m256 vec = _mm256_loadu_ps(input); input += 8; __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res); if (temp) - return true; + return false; + + // check infinity in vec + vec = _mm256_andnot_ps(SIGN_MASK, vec); + vec = _mm256_cmp_ps(vec, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec); + + if (temp) + return false; } // remain check : ( X != X ) while (idx < N) { - if (*input != *input) { - return true; + if (*input != *input || *input == std::numeric_limits::infinity()) { + return false; } ++input; ++idx; } - return false; + return true; } } // namespace nntrainer::avx diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/blas_avx.h index d25ded103f..5eabcbdb2c 100644 --- a/nntrainer/tensor/blas_avx.h +++ b/nntrainer/tensor/blas_avx.h @@ -43,22 +43,22 @@ void vcvt_f32_f16(size_t N, const float *input, void *output); /** * @brief check if the X has NaN value - * @note it compare !(x==x) + * @note it compare (x!=x || x == inf) * @param[in] N length of the vector * @param[in] X half-precision * for Vector X - * @param[out] true if it has NaN + * @param[out] false if it has NaN or inf */ -bool hasNaN(const size_t N, const _Float16 *X); +bool isValid(const size_t N, const _Float16 *X); #endif /** * @brief check if the X has NaN value - * @note it compare !(x==x) + * @note it compare (x!=x || x == inf) * @param[in] N length of the vector * @param[in] X float * for Vector X - * @param[out] true if it has NaN + * @param[out] false if it has NaN or inf */ -bool hasNaN(const size_t N, const float *X); +bool isValid(const size_t N, const float *X); } // namespace nntrainer::avx diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index 78cb708e53..6219919fd8 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -1038,14 +1038,14 @@ static void ele_div_fallback(const unsigned int N, const float *X, } } -static bool has_nan_fallback(const size_t N, const float *X) { +static bool is_valid_fallback(const size_t N, const float *X) { for (size_t i = 0; i < N; ++i) { - if (*X != *X) - return true; + if (*X != *X || *X == 
std::numeric_limits::infinity()) + return false; ++X; } - return false; + return true; } void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, @@ -1100,15 +1100,15 @@ void ele_div(const unsigned int N, const float *X, const float *Y, float *Z, ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); } -bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type, - const void *X) { +bool is_valid(const size_t N, ml::train::TensorDim::DataType d_type, + const void *X) { if (d_type == ml::train::TensorDim::DataType::FP16) { #ifdef ENABLE_FP16 const _FP16 *vec = (const _FP16 *)X; #ifdef USE_NEON - return nntrainer::neon::hasNaN(N, vec); + return nntrainer::neon::isValid(N, vec); #elif defined(USE_AVX) - return nntrainer::avx::hasNaN(N, vec); + return nntrainer::avx::isValid(N, vec); #else throw std::invalid_argument("Error: enable-fp16 is not enabled"); #endif @@ -1116,12 +1116,12 @@ bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type, } else if (d_type == ml::train::TensorDim::DataType::FP32) { const float *vec = (const float *)X; #ifdef USE_NEON - return nntrainer::neon::hasNaN(N, vec); + return nntrainer::neon::isValid(N, vec); #elif defined(USE_AVX) - return nntrainer::avx::hasNaN(N, vec); + return nntrainer::avx::isValid(N, vec); #endif - return has_nan_fallback(N, vec); + return is_valid_fallback(N, vec); } return false; } diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h index bcd557111e..2b5ef72922 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/blas_interface.h @@ -480,13 +480,13 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, unsigned int o_stride = 1); /** - * @brief check if X array has NaN + * @brief check if X array has NaN or inf * @param[in] N length of the vector * @param[in] X float/fp16 * for Vector X - * @param[out] bool true if NaN else false + * @param[out] bool false if not valide else true */ -bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type, - const void *X); +bool is_valid(const size_t N, ml::train::TensorDim::DataType d_type, + const void *X); } /* namespace nntrainer */ #endif /* __cplusplus */ diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index 5062bd7c35..20f4d102ec 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -546,23 +546,34 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, } } -bool hasNaN(const size_t N, const float *X) { - bool temp = false; +bool isValid(const size_t N, const float *X) { size_t i = 0; + float inf_s = std::numeric_limits::infinity(); + float32x4_t inf = vdupq_n_f32(inf_s); + uint16x8_t zero = vdupq_n_f32(0); + for (; N - i >= 4; i += 4) { float32x4_t vec = vld1q_f32(&X[i]); uint32x4_t vcmp = vceqq_f32(vec, vec); + + vcmp = vceqq_f32(vcmp, zero); + if (vaddvq_u32(vcmp)) - return true; + return false; + + vcmp = vceqq_f32(vec, inf); + + if (vaddvq_u16(vcmp)) + return false; } while (i < N) { - if (X[i] != X[i]) - return true; + if (X[i] != X[i] || X[i] == std::numeric_limits::infinity()) + return false; ++i; } - return temp; + return true; } #ifdef ENABLE_FP16 @@ -1991,24 +2002,39 @@ void inv_sqrt_inplace(const unsigned int N, __fp16 *X) { } } -bool hasNaN(const size_t N, const __fp16 *input) { +bool isValid(const size_t N, const __fp16 *input) { bool temp = 0; size_t i = 0; + __fp16 inf_s = std::numeric_limits::infinity(); + float16x8_t inf = vdupq_n_f16(inf_s); + uint16x8_t zero = vdupq_n_f16(0); + 
for (; N - i >= 8; i += 8) { float16x8_t vec = vld1q_f16(&input[i]); + uint16x8_t vcmp = vceqq_f16(vec, vec); - if (vaddvq_u16(vcmp)) - return true; + vcmp = vceqq_f16(vcmp, zero); + + if (vaddvq_u16(vcmp)) { + return false; + } + + vcmp = vceqq_f16(vec, inf); + + if (vaddvq_u16(vcmp)) { + return false; + } } while (i < N) { - if (input[i] != input[i]) - return true; + if (input[i] != input[i] || + input[i] == std::numeric_limits::infinity()) { + return false; + } ++i; } - - return temp; + return true; } #endif diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h index 6da5e952e1..978d3428f7 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/blas_neon.h @@ -149,13 +149,13 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f); /** - * @brief check if the X has NaN value - * @note it compare !(x==x) + * @brief check if the X has NaN value or Inf + * @note it compare (x!=x || x == inf) * @param[in] N length of the vector * @param[in] input float * for Vector X - * @param[out] true if it has NaN + * @param[out] false if it has NaN or Inf */ -bool hasNaN(const size_t N, const float *input); +bool isValid(const size_t N, const float *input); #ifdef ENABLE_FP16 /** @@ -391,13 +391,13 @@ void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, uint32_t M, void inv_sqrt_inplace(const unsigned int N, __fp16 *X); /** - * @brief check if the X has NaN value - * @note it compare !(x==x) + * @brief check if the X is valid: Check NaN or Inf + * @note it compare (x!=x || x == inf) * @param[in] N length of the vector * @param[in] X float * for Vector X - * @param[out] true if it has NaN + * @param[out] false if it has NaN or Inf */ -bool hasNaN(const size_t N, const __fp16 *X); +bool isValid(const size_t N, const __fp16 *X); #endif } // namespace nntrainer::neon diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index f9db2e2ab0..827ba7e979 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -3820,15 +3820,15 @@ void Tensor::dequantize(Tensor &output, unsigned int axis) const { return; } -bool Tensor::hasNaN() const { +bool Tensor::isValid() const { if (getDataType() == Tdatatype::FP16) { #ifdef ENABLE_FP16 - return has_nan(dim.getDataLen(), Tdatatype::FP16, getData<_FP16>()); + return is_valid(dim.getDataLen(), Tdatatype::FP16, getData<_FP16>()); #else throw std::invalid_argument("enble-fp16 is not set"); #endif } else { - return has_nan(dim.getDataLen(), Tdatatype::FP32, getData()); + return is_valid(dim.getDataLen(), Tdatatype::FP32, getData()); } } diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h index 968ec4d502..ad3781526f 100644 --- a/nntrainer/tensor/tensor.h +++ b/nntrainer/tensor/tensor.h @@ -2039,10 +2039,10 @@ class Tensor { static constexpr float epsilon = 1e-5; /** - * @brief check if there is NaN element - * @param[out] bool true if there is NaN else false + * @brief check if there is NaN or Inf element + * @param[out] bool false if there is NaN or Inf else false */ - bool hasNaN() const; + bool isValid() const; private: /**< handle the data as a std::shared_ptr type */ diff --git a/test/unittest/unittest_nntrainer_tensor.cpp b/test/unittest/unittest_nntrainer_tensor.cpp index 94aa01836d..d5b6a028f9 100644 --- a/test/unittest/unittest_nntrainer_tensor.cpp +++ b/test/unittest/unittest_nntrainer_tensor.cpp @@ -4704,6 +4704,30 @@ TEST(nntrainer_Tensor, inv_sqrt_i_uncontiguous_p) { } } +/** + * @brief fp16 tensor has NaN + */ 
+TEST(nntrainer_Tensor, is_valid_01) { + size_t batch = 1; + size_t channel = 3; + size_t height = 4; + size_t width = 5; + + nntrainer::Tensor input( + {batch, + channel, + height, + width, + {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}}, + true, nntrainer::Tensor::Initializer::ZEROS); + + EXPECT_EQ(input.isValid(), true); + + input.setValue(0, 0, 0, 0, std::nan("1")); + + EXPECT_EQ(input.isValid(), false); +} + int main(int argc, char **argv) { int result = -1; diff --git a/test/unittest/unittest_nntrainer_tensor_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_fp16.cpp index 2b0d9c040d..58455757c5 100644 --- a/test/unittest/unittest_nntrainer_tensor_fp16.cpp +++ b/test/unittest/unittest_nntrainer_tensor_fp16.cpp @@ -6196,6 +6196,34 @@ TEST(nntrainer_Tensor, dequantize_06_p) { EXPECT_EQ(output, answer3); } +/** + * @brief fp16 tensor has NaN + */ +TEST(nntrainer_Tensor, is_valid_01) { + size_t batch = 1; + size_t channel = 3; + size_t height = 4; + size_t width = 5; + + nntrainer::Tensor input( + {batch, + channel, + height, + width, + {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, + true, nntrainer::Tensor::Initializer::ZEROS); + + EXPECT_EQ(input.isValid(), true); + + input.setValue(0, 0, 0, 0, std::nan("1")); + + EXPECT_EQ(input.isValid(), false); + + input.setValue(0, 0, 0, 0, std::numeric_limits::infinity()); + + EXPECT_EQ(input.isValid(), false); +} + GTEST_API_ int main(int argc, char **argv) { int result = -1; diff --git a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp index d62be87aaf..799a910273 100644 --- a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp +++ b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp @@ -1180,6 +1180,38 @@ TEST(nntrainer_Tensor, inv_sqrt_i_p) { EXPECT_EQ(flag, true); } +/** + * @brief fp16 tensor has NaN + */ +TEST(nntrainer_Tensor, is_valid_01) { + size_t batch = 1; + size_t channel = 3; + size_t height = 4; + size_t width = 5; + + nntrainer::Tensor input( + {batch, + channel, + height, + width, + {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, + true, nntrainer::Tensor::Initializer::ZEROS); + + EXPECT_EQ(input.isValid(), true); + + input.setValue(0, 0, 0, 0, std::nan("1")); + + EXPECT_EQ(input.isValid(), false); + + input.setValue(0, 0, 0, 0, std::numeric_limits::infinity()); + + EXPECT_EQ(input.isValid(), false); + + input.setValue(0, 0, 0, 0, 1); + + EXPECT_EQ(input.isValid(), true); +} + GTEST_API_ int main(int argc, char **argv) { int result = -1; From 139ee5a6eb43ddf707ec990e821f9d17192bbb1a Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Fri, 17 May 2024 13:31:55 +0900 Subject: [PATCH 10/14] [ MSE ] Fix for better MSE loss precision This PR chage the loss computation using full precsion rather than half precsion to maintain accuracy. **Changes proposed in this PR:** - Added TOC generator for README.md Resolves: **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. 
Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/layers/loss/mse_loss_layer.cpp | 34 ++++++++++++++++++------ 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp index 3aed8125e0..ed4390655d 100644 --- a/nntrainer/layers/loss/mse_loss_layer.cpp +++ b/nntrainer/layers/loss/mse_loss_layer.cpp @@ -51,17 +51,27 @@ void MSELossLayer::forwarding(RunLayerContext &context, bool training) { void MSELossLayer::calcDerivative(RunLayerContext &context) { Tensor empty_tensor; - Tensor &ret_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX); - const Tensor &y2_ = context.getIncomingDerivative(SINGLE_INOUT_IDX); - Tensor &y2 = empty_tensor; - if (ret_derivative.getDataType() == ml::train::TensorDim::DataType::FP32) - y2 = y2_; + Tensor &ret_derivative = + context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType() == + ml::train::TensorDim::DataType::FP32 + ? context.getOutgoingDerivative(SINGLE_INOUT_IDX) + : empty_tensor; - if (y2.empty()) - y2 = y2_.clone(ret_derivative.getDataType()); + if (ret_derivative.empty()) + ret_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX) + .clone(ml::train::TensorDim::DataType::FP32); - Tensor &y = context.getInput(SINGLE_INOUT_IDX); + Tensor &y = context.getInput(SINGLE_INOUT_IDX).getDataType() == + ml::train::TensorDim::DataType::FP32 + ? context.getInput(SINGLE_INOUT_IDX) + : empty_tensor; + + if (y.empty()) + y = context.getInput(SINGLE_INOUT_IDX) + .clone(ml::train::TensorDim::DataType::FP32); + + const Tensor &y2 = context.getIncomingDerivative(SINGLE_INOUT_IDX); y.subtract(y2, ret_derivative); float divider = ((float)y.size()) / 2; @@ -70,7 +80,15 @@ void MSELossLayer::calcDerivative(RunLayerContext &context) { "[MSELossLayer::calcDerivative] Error when calculating loss"); } + // Loss Scale needs Full precsiion of ret_derivative. Therefore, + // ret_derivateive should be FP32 when applying scale, and after applying it + // need to convert original type for backpropagating. + LossLayer::applyLossScale(context, ret_derivative); + + if (context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType() != + ml::train::TensorDim::DataType::FP32) + context.getOutgoingDerivative(SINGLE_INOUT_IDX).copyData(ret_derivative); } } // namespace nntrainer From 0d52e0de8bac71996f8bc0f73df7546369b76dee Mon Sep 17 00:00:00 2001 From: Jiho Chu Date: Wed, 6 Mar 2024 11:12:02 +0900 Subject: [PATCH 11/14] [Test] Add conv2d test for fp16 It adds tests for conv2d fp16 test. 
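Alongside the conv2d fp16 golden tests, the adam.cpp hunk below computes the Adam moments on FP32 master copies when they exist and copies the results back into the low-precision tensors. A minimal stand-alone sketch of that master-copy pattern (plain arrays instead of nntrainer::Tensor, bias correction dropped, all names illustrative; in the patch the master weight itself is handled by the Weight class, not by the optimizer):

#include <cmath>
#include <cstddef>
#include <vector>

// half_t stands in for the FP16 storage type; kept as float so the sketch
// compiles everywhere.
using half_t = float;

// One Adam step: moments and the weight live in FP32 masters, and the
// low-precision copies are refreshed at the end of the step.
void adam_step_with_master(std::vector<half_t> &w, const std::vector<half_t> &g,
                           std::vector<float> &m, std::vector<float> &v,
                           std::vector<float> &w_master, float lr, float beta1,
                           float beta2, float eps) {
  for (std::size_t i = 0; i < w.size(); ++i) {
    const float grad = static_cast<float>(g[i]); // widen the gradient first
    m[i] = beta1 * m[i] + (1.0f - beta1) * grad;
    v[i] = beta2 * v[i] + (1.0f - beta2) * grad * grad;
    w_master[i] -= lr * m[i] / (std::sqrt(v[i]) + eps);
    w[i] = static_cast<half_t>(w_master[i]); // quantize back for the next forward
  }
}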
Signed-off-by: Jiho Chu --- nntrainer/optimizers/adam.cpp | 56 ++++-- nntrainer/optimizers/optimizer_context.cpp | 15 ++ nntrainer/optimizers/optimizer_context.h | 15 ++ .../layers/unittest_layers_convolution2d.cpp | 182 ++++++++++++++++++ 4 files changed, 257 insertions(+), 11 deletions(-) diff --git a/nntrainer/optimizers/adam.cpp b/nntrainer/optimizers/adam.cpp index 530e7fdf31..e2d611b7fd 100644 --- a/nntrainer/optimizers/adam.cpp +++ b/nntrainer/optimizers/adam.cpp @@ -95,26 +95,60 @@ void Adam::applyGradient(RunOptimizerContext &context) { Tensor &wm = context.getOptimizerVariable(AdamParams::wm); Tensor &wv = context.getOptimizerVariable(AdamParams::wv); - wm.multiply_i(beta1); - wm.add_i(x_grad, 1.0f - beta1); + if (context.getNumOptMasterVariable() != 0) { + Tensor &wm_m = context.getOptimizerMasterVariable(AdamParams::wm); + Tensor &wv_m = context.getOptimizerMasterVariable(AdamParams::wv); + Tensor x_grad_ = x_grad.clone(wm_m.getDataType()); - wv.multiply_i(beta2); - wv.add_i(x_grad.multiply(x_grad), 1.0f - beta2); + wm_m.multiply_i(beta1); + wm_m.add_i(x_grad_, 1.0f - beta1); + + wv_m.multiply_i(beta2); + wv_m.add_i(x_grad_.multiply(x_grad_), 1.0f - beta2); + + wm.copyData(wm_m); + wv.copyData(wv_m); + } else { + wm.multiply_i(beta1); + wm.add_i(x_grad, 1.0f - beta1); + + wv.multiply_i(beta2); + wv.add_i(x_grad.multiply(x_grad), 1.0f - beta2); + } if (torch_ref) { - Tensor denom = wv.apply(sqrtFloat); - denom.divide_i(sqrtFloat(biasCorrection2)); - denom.add_i(epsilon); - wm.divide(denom, x_grad); + if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP32) { + Tensor denom = wv.apply(sqrtFloat); + denom.divide_i(sqrtFloat(biasCorrection2)); + denom.add_i(epsilon); + wm.divide(denom, x_grad); +#ifdef ENABLE_FP16 + } else if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP16) { + Tensor denom = wv.apply<_FP16>(sqrtFloat<_FP16>); + denom.divide_i(sqrtFloat(biasCorrection2)); + denom.add_i(epsilon); + wm.divide(denom, x_grad); +#endif + } else { + throw std::runtime_error("Not supported datatype"); + } context.applyGradient(context.getLearningRate() / biasCorrection1, x_grad); } else { - std::function sqrtEps = [epsilon](double f) { - return 1 / (sqrtDouble(f) + epsilon); + auto sqrtEps = [epsilon](T f) -> T { + return 1 / (static_cast(sqrtDouble(f)) + static_cast(epsilon)); }; - x_grad = wv.apply(sqrtEps, x_grad); + if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP32) + x_grad = wv.apply(sqrtEps, x_grad); +#ifdef ENABLE_FP16 + else if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP16) + x_grad = wv.apply<_FP16>(sqrtEps, x_grad); +#endif + else + throw std::runtime_error("Not supported datatype"); + x_grad.multiply_i(wm); context.applyGradient( getUpdatedLearningRate(context.getIteration(), context.getLearningRate()), diff --git a/nntrainer/optimizers/optimizer_context.cpp b/nntrainer/optimizers/optimizer_context.cpp index f70ab773a9..f0817c2a75 100644 --- a/nntrainer/optimizers/optimizer_context.cpp +++ b/nntrainer/optimizers/optimizer_context.cpp @@ -36,6 +36,21 @@ Tensor &RunOptimizerContext::getOptimizerVariable(unsigned int idx) const { return weight->getOptimizerVariableRef(idx); } +/** + * @brief Get the optimizer variable associated to this weight + */ +Tensor & +RunOptimizerContext::getOptimizerMasterVariable(unsigned int idx) const { + return weight->getOptimizerMasterVariableRef(idx); +} + +/** + * @brief Get number of optimizer master variable + */ +int RunOptimizerContext::getNumOptMasterVariable() { + return 
weight->getNumOptMasterVariable(); +} + /** * @brief Apply the gradient with the given learning rate */ diff --git a/nntrainer/optimizers/optimizer_context.h b/nntrainer/optimizers/optimizer_context.h index 6b4b983e35..ca30c36b94 100644 --- a/nntrainer/optimizers/optimizer_context.h +++ b/nntrainer/optimizers/optimizer_context.h @@ -59,6 +59,21 @@ class RunOptimizerContext { */ Tensor &getOptimizerVariable(unsigned int idx) const; + /** + * @brief Get the optimizer Master variable associated to this weight + * + * @param idx Identifier of the associated weight + * @return Tensor& Reference to the optimizer variable + */ + Tensor &getOptimizerMasterVariable(unsigned int idx) const; + + /** + * @brief Get number of the optimizer Master variable + * + * @return number of optimizer master variable + */ + int getNumOptMasterVariable(); + /** * @brief Check if run context is set and is ready to use * diff --git a/test/unittest/layers/unittest_layers_convolution2d.cpp b/test/unittest/layers/unittest_layers_convolution2d.cpp index 724c79079b..92d9c593e7 100644 --- a/test/unittest/layers/unittest_layers_convolution2d.cpp +++ b/test/unittest/layers/unittest_layers_convolution2d.cpp @@ -198,3 +198,185 @@ GTEST_PARAMETER_TEST( conv2d_mb_valid_drop_last, conv2d_sb_no_overlap, conv2d_mb_no_overlap, conv2d_sb_1x1_kernel, conv2d_mb_1x1_kernel, conv2d_sb_dilation, conv2d_mb_dilation, conv2d_sb_same_dilation, conv2d_mb_same_dilation)); + +#ifdef ENABLE_FP16 +auto conv2d_sb_minimum_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=2,2"}, "1:1:4:4", + "conv2d_sb_minimum_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_minimum_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=2,2"}, "3:1:4:4", + "conv2d_mb_minimum_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_remain_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=2", "kernel_size=3,3", "padding=same"}, "1:1:4:4", + "conv2d_sb_same_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_remain_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=2", "kernel_size=3,3", "padding=same"}, "3:1:4:4", + "conv2d_mb_same_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_uneven_remain_1_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=same", + }, + "1:3:4:4", "conv2d_sb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_uneven_remain_2_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=0,1,0,1", + }, + "1:3:4:4", "conv2d_sb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_uneven_remain_1_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=same", + }, + "3:3:4:4", "conv2d_mb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_uneven_remain_2_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", 
+ "stride=2,2", + "padding=0,1,0,1", + }, + "3:3:4:4", "conv2d_mb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_valid_drop_last_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=valid", + }, + "1:3:7:7", "conv2d_sb_valid_drop_last_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_valid_drop_last_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=valid", + }, + "3:3:7:7", "conv2d_mb_valid_drop_last_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_no_overlap_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=2,2", "stride=3,3"}, "1:2:5:5", + "conv2d_sb_no_overlap_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_no_overlap_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=3", + "kernel_size=2,2", + "stride=3,3", + }, + "3:2:5:5", "conv2d_mb_no_overlap_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_1x1_kernel_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=1,1", "stride=2,2"}, "1:2:5:5", + "conv2d_sb_1x1_kernel_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_1x1_kernel_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=3", + "kernel_size=1,1", + "stride=2,2", + }, + "3:2:5:5", "conv2d_mb_1x1_kernel_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "dilation=2,2", + }, + "1:3:11:11", "conv2d_sb_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "dilation=2,2", + }, + "3:3:11:11", "conv2d_mb_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "padding=same", + "dilation=2,2", + }, + "1:3:11:11", "conv2d_sb_same_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "padding=same", + "dilation=2,2", + }, + "3:3:11:11", "conv2d_mb_same_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +GTEST_PARAMETER_TEST( + Convolution2D16, LayerGoldenTest, + ::testing::Values(conv2d_sb_minimum_w16a16, conv2d_mb_minimum_w16a16, + conv2d_sb_same_remain_w16a16, conv2d_mb_same_remain_w16a16, + conv2d_sb_same_uneven_remain_1_w16a16, + conv2d_sb_same_uneven_remain_2_w16a16, + conv2d_mb_same_uneven_remain_1_w16a16, + conv2d_mb_same_uneven_remain_2_w16a16, + conv2d_sb_valid_drop_last_w16a16, + conv2d_mb_valid_drop_last_w16a16, + conv2d_sb_no_overlap_w16a16, conv2d_mb_no_overlap_w16a16, + conv2d_sb_1x1_kernel_w16a16, 
conv2d_mb_1x1_kernel_w16a16, + conv2d_sb_dilation_w16a16, conv2d_mb_dilation_w16a16, + conv2d_sb_same_dilation_w16a16, + conv2d_mb_same_dilation_w16a16)); +#endif

From 72917b84bdcf1860f944e43a909c1dbf93f849ba Mon Sep 17 00:00:00 2001 From: Jiho Chu Date: Thu, 14 Mar 2024 19:31:50 +0900 Subject: [PATCH 12/14] [Layers] Modify layers for data type

It is assumed that activations and weights are fully compatible, so no conversion between them is needed. Input and loss layers are handled differently, because input data and label data are assumed to always be float32 for now.

Signed-off-by: Jiho Chu --- nntrainer/layers/bn_layer.cpp | 12 +- nntrainer/layers/conv2d_layer.cpp | 160 +++++---- nntrainer/layers/layer_context.cpp | 51 +++ nntrainer/layers/layer_context.h | 38 +++ nntrainer/layers/layer_devel.h | 5 + nntrainer/layers/layer_node.cpp | 27 +- nntrainer/layers/layer_node.h | 5 + .../loss/cross_entropy_sigmoid_loss_layer.cpp | 3 + .../loss/cross_entropy_softmax_loss_layer.cpp | 36 +- nntrainer/layers/loss/loss_layer.cpp | 3 + nntrainer/layers/loss/loss_layer.h | 13 + nntrainer/layers/loss/meson.build | 4 +- nntrainer/layers/loss/mse_loss_layer.cpp | 75 ++++- nntrainer/layers/lstm.cpp | 42 ++- nntrainer/layers/lstm.h | 1 - nntrainer/layers/pooling2d_layer.cpp | 310 +++++++++++------- nntrainer/layers/reshape_layer.cpp | 1 + 17 files changed, 570 insertions(+), 216 deletions(-)

diff --git a/nntrainer/layers/bn_layer.cpp b/nntrainer/layers/bn_layer.cpp index 1723ac677f..e978b1ef59 100644 --- a/nntrainer/layers/bn_layer.cpp +++ b/nntrainer/layers/bn_layer.cpp @@ -111,6 +111,12 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { context.requestWeight(dim, bnparams_beta, WeightRegularizer::NONE, 1.0f, bias_decay, "beta", true); + /** + * @note declare weight dimension with activation datatype + */ + TensorDim w_dim = dim; + w_dim.setDataType(in_dim.getDataType()); + /** * caches the deviation -> input - avg(input) * @todo check if avoiding this storage and adding dependency on input (no @@ -121,7 +127,7 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { TensorLifespan::ITERATION_LIFESPAN); /** caches the inverse standard deviation */ wt_idx[BNParams::invstd] = - context.requestTensor(dim, "invstd", Tensor::Initializer::NONE, false, + context.requestTensor(w_dim, "invstd", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); /** * Temporary tensor to store the full sized tensors in order to allow batch @@ -136,13 +142,13 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { * caches variance + epsilon as well. */ wt_idx[BNParams::cvar] = - context.requestTensor(dim, "cvar", Tensor::Initializer::NONE, false, + context.requestTensor(w_dim, "cvar", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); /** * Temporary tensor to store the reduced tensors along the axes_to_reduce.
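To make the data-type assumption in the commit message concrete, here is a minimal, hypothetical sketch of the guard that the loss-layer hunks in this patch repeat: the label (always FP32) is cloned to the activation data type once before it is used. The RunLayerContext and Tensor calls mirror those used elsewhere in this diff; the helper name, the fixed index 0, and the include paths are illustrative only and are not part of the patch.

    #include <layer_context.h>
    #include <tensor.h>

    /* Sketch only: subtract an FP32 label from a possibly-FP16 prediction. */
    static void subtractLabelSketch(nntrainer::RunLayerContext &context) {
      nntrainer::Tensor &ret = context.getOutgoingDerivative(0);        /* derivative buffer   */
      nntrainer::Tensor &y = context.getInput(0);                       /* prediction tensor   */
      const nntrainer::Tensor &label = context.getIncomingDerivative(0);/* assumed FP32 label  */

      if (label.getDataType() != y.getDataType())
        y.subtract(label.clone(y.getDataType()), ret); /* convert the label once */
      else
        y.subtract(label, ret);

      /* applyLossScale(ret); -- the protected LossLayer hook this patch adds;
       * every loss layer applies the loss scale here for mixed precision. */
    }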
*/ wt_idx[BNParams::t_reduced] = - context.requestTensor(dim, "tensor_reduced", Tensor::Initializer::NONE, + context.requestTensor(w_dim, "tensor_reduced", Tensor::Initializer::NONE, false, TensorLifespan::FORWARD_DERIV_LIFESPAN); } diff --git a/nntrainer/layers/conv2d_layer.cpp b/nntrainer/layers/conv2d_layer.cpp index c059ae9caf..5d9dbc1e19 100644 --- a/nntrainer/layers/conv2d_layer.cpp +++ b/nntrainer/layers/conv2d_layer.cpp @@ -38,7 +38,8 @@ namespace { static TensorDim calcCol2ImOutputDim(const TensorDim &out, const TensorDim &kdim) { - return TensorDim({kdim.getFeatureLen(), out.width() * out.height()}); + return TensorDim({kdim.getFeatureLen(), out.width() * out.height()}, + out.getTensorType()); } /** @@ -56,7 +57,10 @@ static void col2im(const Tensor &col_matrix, const TensorDim &kdim, const std::array &mstride, const std::array &dilation, Tensor &image) { - auto [pt, pb, pl, pr] = padding; + auto pt = padding[0]; + auto pb = padding[1]; + auto pl = padding[2]; + auto pr = padding[3]; unsigned k_height = kdim.height(); unsigned k_width = kdim.width(); @@ -84,32 +88,48 @@ static void col2im(const Tensor &col_matrix, const TensorDim &kdim, int h_stride_end = im_eff_height - eff_k_height - pt; int w_stride_end = im_eff_width - eff_k_width - pl; - unsigned col_w = 0; - for (int hs = -pt; hs <= h_stride_end; hs += hstride) { - for (int ws = -pl; ws <= w_stride_end; ws += wstride) { - unsigned col_h = 0; - int patch_height_end = hs + eff_k_height; - int patch_width_end = ws + eff_k_width; - for (unsigned c = 0; c < im_channel; c++) { - for (int h = hs; h < patch_height_end; h += hdilation) { - if (h < 0 || im_height <= h) { - col_h += k_width; - continue; - } - for (int w = ws; w < patch_width_end; w += wdilation) { - if (w < 0 || im_width <= w) { - col_h++; + auto apply_data = [&](T *val) { + unsigned col_w = 0; + for (int hs = -pt; hs <= h_stride_end; hs += hstride) { + for (int ws = -pl; ws <= w_stride_end; ws += wstride) { + unsigned col_h = 0; + int patch_height_end = hs + eff_k_height; + int patch_width_end = ws + eff_k_width; + for (unsigned c = 0; c < im_channel; c++) { + for (int h = hs; h < patch_height_end; h += hdilation) { + if (h < 0 || im_height <= h) { + col_h += k_width; continue; } - - float *val = image.getAddress(0, c, h, w); - *val += col_matrix.getValue(0, 0, col_h, col_w); - col_h++; + for (int w = ws; w < patch_width_end; w += wdilation) { + if (w < 0 || im_width <= w) { + col_h++; + continue; + } + + val = image.getAddress(0, c, h, w); + *val += col_matrix.getValue(0, 0, col_h, col_w); + col_h++; + } } } + col_w++; } - col_w++; } + }; + + if (image.getDataType() == nntrainer::Tdatatype::FP32) { + float val; + apply_data(&val); + } +#ifdef ENABLE_FP16 + else if (image.getDataType() == nntrainer::Tdatatype::FP16) { + _FP16 val; + apply_data(&val); + } +#endif + else { + throw std::runtime_error("Not supported datatype"); } } @@ -179,7 +199,10 @@ static void im2col(const Tensor &in, const TensorDim &kdim, // } */ - auto [pt, pb, pl, pr] = padding; + auto pt = padding[0]; + auto pb = padding[1]; + auto pl = padding[2]; + auto pr = padding[3]; unsigned int channel = in.channel(); int in_height = in.height(); @@ -198,46 +221,62 @@ static void im2col(const Tensor &in, const TensorDim &kdim, unsigned int out_width = (width - eff_k_width) / mstride[1] + 1; out.reshape( - TensorDim({out_height * out_width, in.channel() * k_height * k_width})); - float *out_data = out.getData(); - - int h_stride_end = height - eff_k_height - pt; - int w_stride_end = width - eff_k_width - 
pl; - - /// get a patch, size of kernel - /// hs is height_strided, ws is width_strided - unsigned int owidth = out.width(); - unsigned int base_im_w = 0; - for (int hs = -pt; hs <= h_stride_end; hs += mstride[0]) { - unsigned int base_im_h = 0; - int patch_height_end = eff_k_height + hs; - /// map the patch to a single line looping through channel - for (unsigned int c = 0; c < channel; ++c) { - for (int h = hs; h < patch_height_end; h += dilation[0]) { - if (h < 0 || in_height <= h) { - base_im_h += k_width; - continue; - } - - unsigned int im_w = base_im_w; - for (int ws = -pl; ws <= w_stride_end; ws += mstride[1]) { - unsigned int im_h = base_im_h; - int patch_width_end = eff_k_width + ws; + TensorDim({out_height * out_width, in.channel() * k_height * k_width}, + in.getTensorType())); + + auto apply_data = [&](T *out_data) { + int h_stride_end = height - eff_k_height - pt; + int w_stride_end = width - eff_k_width - pl; + + /// get a patch, size of kernel + /// hs is height_strided, ws is width_strided + unsigned int owidth = out.width(); + unsigned int base_im_w = 0; + for (int hs = -pt; hs <= h_stride_end; hs += mstride[0]) { + unsigned int base_im_h = 0; + int patch_height_end = eff_k_height + hs; + /// map the patch to a single line looping through channel + for (unsigned int c = 0; c < channel; ++c) { + for (int h = hs; h < patch_height_end; h += dilation[0]) { + if (h < 0 || in_height <= h) { + base_im_h += k_width; + continue; + } - for (int w = ws; w < patch_width_end; w += dilation[1]) { - if (w < 0 || in_width <= w) { + unsigned int im_w = base_im_w; + for (int ws = -pl; ws <= w_stride_end; ws += mstride[1]) { + unsigned int im_h = base_im_h; + int patch_width_end = eff_k_width + ws; + + for (int w = ws; w < patch_width_end; w += dilation[1]) { + if (w < 0 || in_width <= w) { + im_h++; + continue; + } + out_data[im_w * owidth + im_h] = in.getValue(0, c, h, w); im_h++; - continue; } - out_data[im_w * owidth + im_h] = in.getValue(0, c, h, w); - im_h++; + im_w++; } - im_w++; + base_im_h += k_width; } - base_im_h += k_width; } + base_im_w += out_width; } - base_im_w += out_width; + }; + + if (out.getDataType() == nntrainer::Tdatatype::FP32) { + float *out_data = out.getData(); + apply_data(out_data); + } +#ifdef ENABLE_FP16 + else if (out.getDataType() == nntrainer::Tdatatype::FP16) { + _FP16 *out_data = out.getData<_FP16>(); + apply_data(out_data); + } +#endif + else { + throw std::runtime_error("Not supported datatype"); } } @@ -279,9 +318,11 @@ void Conv2DLayer::finalize(InitLayerContext &context) { auto &dilation = std::get>(conv_props); - TensorDim kernel_dim = - TensorDim(filter_size, in_dim.channel(), kernel_size[0], kernel_size[1]); - TensorDim bias_dim = TensorDim(1, filter_size, 1, 1); + auto in_t_type = in_dim.getTensorType(); + in_t_type.data_type = context.getWeightDataType(); + TensorDim kernel_dim = TensorDim(filter_size, in_dim.channel(), + kernel_size[0], kernel_size[1], in_t_type); + TensorDim bias_dim = TensorDim(1, filter_size, 1, 1, in_t_type); padding = std::get(conv_props) .compute(in_dim, kernel_dim, {stride[0], stride[1]}, @@ -309,6 +350,7 @@ void Conv2DLayer::finalize(InitLayerContext &context) { out_dim.channel(filter_size); out_dim.height((eff_in_height - eff_k_height) / stride[0] + 1); out_dim.width((eff_in_width - eff_k_width) / stride[1] + 1); + out_dim.setTensorType(in_dim.getTensorType()); context.setOutputDimensions({out_dim}); NNTR_THROW_IF(eff_in_height < kernel_size[0] || eff_in_width < kernel_size[1], diff --git 
a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp index 5862e6af14..648eae30ee 100644 --- a/nntrainer/layers/layer_context.cpp +++ b/nntrainer/layers/layer_context.cpp @@ -157,6 +157,16 @@ Tensor &RunLayerContext::getWeight(unsigned int idx) const { return weights[idx]->getVariableRef(); } +/** + * @brief Get the Weight tensor object + * + * @param idx Identifier of the weight + * @return Tensor& Reference to the weight tensor + */ +Tensor *RunLayerContext::getWeightMaster(unsigned int idx) const { + return weights[idx]->getVariableMasterRef(); +} + /** * @brief Get the Weight Gradient tensor object * @@ -195,6 +205,18 @@ Tensor &RunLayerContext::getWeightOptVar(unsigned int idx, return weights[idx]->getOptimizerVariableRef(jdx); } +/** + * @brief Get the Weight Optimizer Variable tensor object + * + * @param idx Identifier of the weight + * @param jdx Identifier of the optimizer variables + * @return Tensor& Reference to the weight optimizer variable tensor + */ +Tensor &RunLayerContext::getWeightOptMasterVar(unsigned int idx, + unsigned int jdx) const { + return weights[idx]->getOptimizerMasterVariableRef(jdx); +} + /** * @brief Get the Number of Weight Optimizer Variable tensor object * @@ -205,6 +227,16 @@ unsigned int RunLayerContext::getNumWeightOptVar(unsigned int idx) const { return weights[idx]->getNumOptVariable(); } +/** + * @brief Get the Number of Weight Optimizer Variable tensor object + * + * @param idx Identifier of the weight + * @return int Number of the weight optimizer variable + */ +unsigned int RunLayerContext::getNumWeightOptMasterVar(unsigned int idx) const { + return weights[idx]->getNumOptMasterVariable(); +} + /** * @brief Get regularization loss for the weight * @@ -344,6 +376,25 @@ Tensor &RunLayerContext::getOutgoingDerivative(unsigned int idx) { return getInputGrad(idx); } +bool RunLayerContext::validateDerivatives() { + auto num_in = getNumInputs(); + auto num_out = getNumOutputs(); + + for (unsigned int i = 0; i < num_in; ++i) { + auto deriv = getIncomingDerivative(i); + if (deriv.checkDataValidation(false) == false) + return false; + } + + for (unsigned int i = 0; i < num_out; ++i) { + auto deriv = getOutgoingDerivative(i); + if (deriv.checkDataValidation(false) == false) + return false; + } + + return true; +} + /** * @brief Get the Tensor object * diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h index c68c42f11d..3e2e3d0339 100644 --- a/nntrainer/layers/layer_context.h +++ b/nntrainer/layers/layer_context.h @@ -474,6 +474,14 @@ class RunLayerContext { */ Tensor &getWeight(unsigned int idx) const; + /** + * @brief Get the Weight master tensor object + * + * @param idx Identifier of the weight + * @return Tensor& Reference to the weight tensor + */ + Tensor *getWeightMaster(unsigned int idx) const; + /** * @brief Get the Weight Gradient tensor object * @@ -501,6 +509,15 @@ class RunLayerContext { */ Tensor &getWeightOptVar(unsigned int idx, unsigned int jdx) const; + /** + * @brief Get the Weight Optimizer Master Variable tensor object + * + * @param idx Identifier of the weight + * @param jdx Identifier of the weight optimizer master variable + * @return Tensor& Reference to the weight optimizer tensor + */ + Tensor &getWeightOptMasterVar(unsigned int idx, unsigned int jdx) const; + /** * @brief Get the Weight name * @@ -611,6 +628,11 @@ class RunLayerContext { */ Tensor &getOutgoingDerivative(unsigned int idx); + /** + * @brief validate input/output derivatives of the layer + */ + bool 
validateDerivatives(); + /** * @brief Get the Tensor object * @@ -740,6 +762,14 @@ class RunLayerContext { */ unsigned int getNumWeightOptVar(unsigned int idx) const; + /** + * @brief Get the Number of Weight Optimizer Variable tensor object + * + * @param idx Identifier of the weight + * @return unsigned int Number of the weight optimizer variable + */ + unsigned int getNumWeightOptMasterVar(unsigned int idx) const; + /** * @brief Get the number of requested tensors objects * @@ -747,6 +777,14 @@ class RunLayerContext { */ unsigned int getNumTensors() const { return tensors.size(); } + /** + * @brief Set the Weight Optimizer Variable tensor object + * + * @param idx Identifier of the weight + * @param jdx Identifier of the weight optimizer variable + */ + void setWeightOptVars(unsigned int idx, std::vector opts); + /** * @brief Set the batch for the run context * diff --git a/nntrainer/layers/layer_devel.h b/nntrainer/layers/layer_devel.h index 54ce1a0ee9..44a87cc7e9 100644 --- a/nntrainer/layers/layer_devel.h +++ b/nntrainer/layers/layer_devel.h @@ -259,6 +259,11 @@ class Layer { * @return true if supports backwarding, else false */ virtual bool supportBackwarding() const = 0; + + /** + * @brief Set loss scale factor + */ + virtual void setLossScale(float scale) {} }; /// @todo Decide where to put and how to implement(#986) diff --git a/nntrainer/layers/layer_node.cpp b/nntrainer/layers/layer_node.cpp index f41752a4d8..6eb4b279de 100644 --- a/nntrainer/layers/layer_node.cpp +++ b/nntrainer/layers/layer_node.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -465,8 +466,12 @@ void LayerNode::read(std::ifstream &file, bool opt_var) { for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { if (run_context->isGradientLastAccess(i) && getTrainable()) { /// @note read optimizer variables + auto num_w_opt_m = run_context->getNumWeightOptMasterVar(i); for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i); ++j) { - run_context->getWeightOptVar(i, j).read(file); + if (num_w_opt_m > 0) + run_context->getWeightOptMasterVar(i, j).read(file); + else + run_context->getWeightOptVar(i, j).read(file); } } } @@ -474,7 +479,11 @@ void LayerNode::read(std::ifstream &file, bool opt_var) { for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { /// @note shared weights are only be read at the first acecss if (run_context->isGradientLastAccess(i)) { - run_context->getWeight(i).read(file); + auto w = run_context->getWeightMaster(i); + if (w) + w->read(file); + else + run_context->getWeight(i).read(file); } } } @@ -489,9 +498,13 @@ void LayerNode::save(std::ofstream &file, bool opt_var) const { if (run_context->isGradientLastAccess(i) && getTrainable()) { // @note save optimizer variables if (run_context->weightHasGradient(i)) { + auto num_w_opt_m = run_context->getNumWeightOptMasterVar(i); for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i); ++j) { - run_context->getWeightOptVar(i, j).save(file); + if (num_w_opt_m > 0) + run_context->getWeightOptMasterVar(i, j).save(file); + else + run_context->getWeightOptVar(i, j).save(file); } } } @@ -500,7 +513,13 @@ void LayerNode::save(std::ofstream &file, bool opt_var) const { // @note shared weights are only be saved at the first access for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { if (run_context->isGradientLastAccess(i)) { - run_context->getWeight(i).save(file); + if (run_context->getNumWeights()) { + auto w = run_context->getWeightMaster(i); + if (w) + w->save(file); + else + 
run_context->getWeight(i).save(file); + } } } } diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h index 3fd2d55b97..6be31f536e 100644 --- a/nntrainer/layers/layer_node.h +++ b/nntrainer/layers/layer_node.h @@ -900,6 +900,11 @@ class LayerNode final : public ml::train::Layer, public GraphNode { */ bool needsCalcGradient() { return needs_calc_gradient; } + /** + * @brief Set loss scale factor + */ + void setLossScale(float scale) { layer->setLossScale(scale); } + private: /** * @brief Get the Input Layers object diff --git a/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp b/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp index 60ea113418..feeff2b3d8 100644 --- a/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp +++ b/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp @@ -61,6 +61,9 @@ void CrossEntropySigmoidLossLayer::calcDerivative(RunLayerContext &context) { Tensor &y = context.getInput(SINGLE_INOUT_IDX); y.apply(ActiFunc::sigmoid, ret_derivative); + + applyLossScale(ret_derivative); + ret_derivative.subtract_i(y2); if (ret_derivative.divide_i(ret_derivative.size()) != ML_ERROR_NONE) { throw std::runtime_error("[CrossEntropySigmoidLossLayer::calcDerivative] " diff --git a/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp b/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp index 53854662ae..c181c60b9a 100644 --- a/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp +++ b/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp @@ -30,9 +30,14 @@ void CrossEntropySoftmaxLossLayer::forwarding(RunLayerContext &context, Tensor &y = context.getInput(SINGLE_INOUT_IDX); // fill the output - auto dataType = y.getDataType(); - if (dataType == ml::train::TensorDim::DataType::FP32) { - hidden_ = y.apply(ActiFunc::softmax, hidden_); + auto out_type = hidden_.getDataType(); + if (out_type == ml::train::TensorDim::DataType::FP32) { + if (y.getDataType() != out_type) { + Tensor y_ = y.clone(out_type); + hidden_ = y_.apply(ActiFunc::softmax, hidden_); + } else { + hidden_ = y.apply(ActiFunc::softmax, hidden_); + } if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); @@ -43,9 +48,14 @@ void CrossEntropySoftmaxLossLayer::forwarding(RunLayerContext &context, // update the loss value LossLayer::updateLoss(context, l); } - } else if (dataType == ml::train::TensorDim::DataType::FP16) { + } else if (out_type == ml::train::TensorDim::DataType::FP16) { #ifdef ENABLE_FP16 - hidden_ = y.apply(ActiFunc::softmax<_FP16>, hidden_); + if (y.getDataType() != out_type) { + Tensor y_ = y.clone(out_type); + hidden_ = y_.apply(ActiFunc::softmax<_FP16>, hidden_); + } else { + hidden_ = y.apply(ActiFunc::softmax<_FP16>, hidden_); + } if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); @@ -68,7 +78,8 @@ void CrossEntropySoftmaxLossLayer::calcDerivative(RunLayerContext &context) { Tensor &y = context.getInput(SINGLE_INOUT_IDX); auto dataType = y.getDataType(); - Tensor ret = Tensor("ret", y.getFormat(), y.getDataType()); + + Tensor ret(y.getDim()); if (dataType == ml::train::TensorDim::DataType::FP32) { y.apply(ActiFunc::softmax, ret); } else if (dataType == ml::train::TensorDim::DataType::FP16) { @@ -83,7 +94,18 @@ void CrossEntropySoftmaxLossLayer::calcDerivative(RunLayerContext &context) { /// operation // TODO: verify y and ret_derivative must not be same as loss layer is not // working in-place - ret.subtract(y2, ret_derivative); + if 
(ret.getDataType() != y2.getDataType()) { + ret.subtract(y2.clone(ret.getDataType()), ret_derivative); + } else { + ret.subtract(y2, ret_derivative); + } + + /** + * loss scale is applied for mixed precision + * every loss layers need to specify this applying code. + */ + applyLossScale(ret_derivative); + if (ret_derivative.divide_i(ret.batch()) != ML_ERROR_NONE) { throw std::runtime_error("[CrossEntropySoftmaxLossLayer::calcDerivative] " "Error when calculating loss"); diff --git a/nntrainer/layers/loss/loss_layer.cpp b/nntrainer/layers/loss/loss_layer.cpp index ab2ccf8be2..422037b9e9 100644 --- a/nntrainer/layers/loss/loss_layer.cpp +++ b/nntrainer/layers/loss/loss_layer.cpp @@ -15,6 +15,9 @@ #include namespace nntrainer { + +LossLayer::LossLayer() : Layer(), loss_scale(0.0f) {} + void LossLayer::finalize(InitLayerContext &context) { std::vector input_dim = context.getInputDimensions(); std::vector output_dim = input_dim; diff --git a/nntrainer/layers/loss/loss_layer.h b/nntrainer/layers/loss/loss_layer.h index 581e9477a8..84a1112864 100644 --- a/nntrainer/layers/loss/loss_layer.h +++ b/nntrainer/layers/loss/loss_layer.h @@ -27,6 +27,11 @@ namespace nntrainer { */ class LossLayer : public Layer { public: + /** + * @brief Constructor of Loss Layer + */ + LossLayer(); + /** * @brief Destructor of Loss Layer */ @@ -47,11 +52,19 @@ class LossLayer : public Layer { */ virtual bool supportBackwarding() const override { return true; } + /** + * @brief Set loss scale factor + */ + virtual void setLossScale(float scale) override { loss_scale = scale; } + +private: /** * @copydoc Layer::requireLabel() */ bool requireLabel() const override { return true; } + float loss_scale; /**< loss scale factor */ + protected: /** * @brief update loss diff --git a/nntrainer/layers/loss/meson.build b/nntrainer/layers/loss/meson.build index 9fccd0290d..8ec9928101 100644 --- a/nntrainer/layers/loss/meson.build +++ b/nntrainer/layers/loss/meson.build @@ -7,7 +7,9 @@ loss_layer_sources = [ 'constant_derivative_loss_layer.cpp' ] -loss_layer_headers = [] +loss_layer_headers = [ + 'loss_layer.h' +] loss_layer_deps = [] diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp index ed4390655d..e6b6f068f6 100644 --- a/nntrainer/layers/loss/mse_loss_layer.cpp +++ b/nntrainer/layers/loss/mse_loss_layer.cpp @@ -11,6 +11,7 @@ * */ +#include "tensor.h" #include #include @@ -32,21 +33,39 @@ void MSELossLayer::forwarding(RunLayerContext &context, bool training) { .clone(ml::train::TensorDim::DataType::FP32); // hidden_ <- y2 - y; - if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { - Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); - y2.subtract(y, hidden_); - - /** calculate sum of squares normalized by size */ - float l2norm = hidden_.l2norm(); - l2norm *= l2norm / hidden_.size(); - - /** wrap in tensor for update loss */ - Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm); - LossLayer::updateLoss(context, l); + auto out_type = hidden_.getDataType(); + if (out_type != y_.getDataType()) { + Tensor y = y_.clone(out_type); + if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { + Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); + y2.subtract(y, hidden_); + + /** calculate sum of squares normalized by size */ + float l2norm = hidden_.l2norm(); + l2norm *= l2norm / hidden_.size(); + + /** wrap in tensor for update loss */ + Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm); + LossLayer::updateLoss(context, l); + } + // fill the output + hidden_.fill(y); + } else { + if 
(context.isLabelAvailable(SINGLE_INOUT_IDX)) { + Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); + y2.subtract(y_, hidden_); + + /** calculate sum of squares normalized by size */ + float l2norm = hidden_.l2norm(); + l2norm *= l2norm / hidden_.size(); + + /** wrap in tensor for update loss */ + Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm); + LossLayer::updateLoss(context, l); + } + // fill the output + hidden_.fill(y_); } - - // fill the output - hidden_.fill(y); } void MSELossLayer::calcDerivative(RunLayerContext &context) { @@ -73,9 +92,33 @@ void MSELossLayer::calcDerivative(RunLayerContext &context) { const Tensor &y2 = context.getIncomingDerivative(SINGLE_INOUT_IDX); - y.subtract(y2, ret_derivative); + const auto &in_type = y.getDataType(); + if (in_type != y2.getDataType()) { + Tensor y2_ = y2.clone(in_type); + y.subtract(y2_, ret_derivative); + } else { + y.subtract(y2, ret_derivative); + } + + applyLossScale(ret_derivative); + float divider = ((float)y.size()) / 2; - if (ret_derivative.divide_i(divider) != ML_ERROR_NONE) { + + /** + * ret_derivative may be eliminated by big divider with fp16 calculation. + * So, it calcuated with larger precision. + */ + int ret; + if (ret_derivative.getDataType() != ml::train::TensorDim::DataType::FP32) { + Tensor ret_derivative_ = + ret_derivative.clone(ml::train::TensorDim::DataType::FP32); + ret = ret_derivative_.divide_i(divider); + ret_derivative.copyData(ret_derivative_); + } else { + ret = ret_derivative.divide_i(divider); + } + + if (ret != ML_ERROR_NONE) { throw std::runtime_error( "[MSELossLayer::calcDerivative] Error when calculating loss"); } diff --git a/nntrainer/layers/lstm.cpp b/nntrainer/layers/lstm.cpp index d5f13a1fc5..be313a0aca 100644 --- a/nntrainer/layers/lstm.cpp +++ b/nntrainer/layers/lstm.cpp @@ -509,21 +509,27 @@ void LSTMLayer::finalize(InitLayerContext &context) { } // hidden_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + hidden_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::hidden_state] = context.requestTensor( hidden_state_dim, "hidden_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // cell_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim cell_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim cell_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + cell_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::cell_state] = context.requestTensor( cell_state_dim, "cell_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // ifgo_dim : [ batch_size, 1, max_timestep, NUM_GATE * unit ] - const TensorDim ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, - weight_tensor_type); + TensorDim ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, + weight_tensor_type); + ifgo_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::ifgo] = context.requestTensor(ifgo_dim, "ifgo", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); @@ -576,21 +582,27 @@ void LSTMLayer::finalize(InitLayerContext &context) { } // reverse_hidden_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim reverse_hidden_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim reverse_hidden_state_dim(batch_size, 1, 
max_timestep, unit, + weight_tensor_type); + reverse_hidden_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::reverse_hidden_state] = context.requestTensor( reverse_hidden_state_dim, "reverse_hidden_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // reverse_cell_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim reverse_cell_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim reverse_cell_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + reverse_cell_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::reverse_cell_state] = context.requestTensor( reverse_cell_state_dim, "reverse_cell_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // reverse_ifgo_dim : [ batch_size, 1, max_timestep, NUM_GATE * unit ] - const TensorDim reverse_ifgo_dim(batch_size, 1, max_timestep, - NUM_GATE * unit, weight_tensor_type); + TensorDim reverse_ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, + weight_tensor_type); + reverse_ifgo_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::reverse_ifgo] = context.requestTensor( reverse_ifgo_dim, "reverse_ifgo", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); @@ -598,8 +610,10 @@ void LSTMLayer::finalize(InitLayerContext &context) { if (dropout_rate > epsilon) { // dropout_mask_dim = [ batch, 1, time_iteration, unit ] - const TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + dropout_mask_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::dropout_mask] = context.requestTensor( dropout_mask_dim, "dropout_mask", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); diff --git a/nntrainer/layers/lstm.h b/nntrainer/layers/lstm.h index f35fdf8815..a9b2cac7d7 100644 --- a/nntrainer/layers/lstm.h +++ b/nntrainer/layers/lstm.h @@ -99,7 +99,6 @@ class LSTMLayer : public LSTMCore { inline static const std::string type = "lstm"; -private: static constexpr unsigned int NUM_GATE = 4; /** common properties like Unit, IntegrateBias, HiddenStateActivation and diff --git a/nntrainer/layers/pooling2d_layer.cpp b/nntrainer/layers/pooling2d_layer.cpp index a68e42e8d0..b53ca354f2 100644 --- a/nntrainer/layers/pooling2d_layer.cpp +++ b/nntrainer/layers/pooling2d_layer.cpp @@ -6,6 +6,7 @@ * @date 12 June 2020 * @see https://github.com/nnstreamer/nntrainer * @author Jijoong Moon + * @author Jiho Chu * @bug No known bugs except for NYI items * @brief This is 2 Dimensional Pooling Layer Class for Neural Network * @@ -26,6 +27,13 @@ namespace nntrainer { static constexpr size_t SINGLE_INOUT_IDX = 0; +/** + * @brief help function for Pooling handler + */ +template struct PoolFunc { + typedef std::function Type; +}; + Pooling2DLayer::Pooling2DLayer( const std::array &padding_) : Layer(), @@ -96,6 +104,7 @@ void Pooling2DLayer::finalize(InitLayerContext &context) { out_dim.channel(in_dim.channel()); out_dim.height((eff_in_height - pool_size[0]) / stride[0] + 1); out_dim.width((eff_in_width - pool_size[1]) / stride[1] + 1); + out_dim.setDataType(in_dim.getDataType()); context.setOutputDimensions({out_dim}); /** @@ -111,13 +120,17 @@ void Pooling2DLayer::finalize(InitLayerContext &context) { * // clang-format on */ if (pooling_type == props::PoolingTypeInfo::Enum::global_max) { + auto helper_dim = in_dim; + 
helper_dim.setDataType(ml::train::TensorDim::DataType::FP32); pool_helper_idx = - context.requestTensor(in_dim, "helper_idx", Tensor::Initializer::NONE, + context.requestTensor(helper_dim, "helper_idx", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); - pool_helper_size.resize(in_dim.batch() * in_dim.channel()); + pool_helper_size.resize(helper_dim.batch() * helper_dim.channel()); } else { + auto helper_dim = out_dim; + helper_dim.setDataType(ml::train::TensorDim::DataType::FP32); pool_helper_idx = - context.requestTensor(out_dim, "helper_idx", Tensor::Initializer::NONE, + context.requestTensor(helper_dim, "helper_idx", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); } } @@ -172,15 +185,13 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { unsigned int J, K; result.setZero(); - float *result_data = result.getData(); unsigned int out_map_size = deriv.height() * deriv.width(); unsigned int in_map_size = height * width; - switch (pooling_type) { - case props::PoolingTypeInfo::Enum::max: { + auto apply_max = [&](T *result_data) { const int *iter = pool_helper.getData(); - const float *deriv_data = deriv.getData(); + const T *deriv_data = deriv.getData(); for (unsigned int b = 0; b < batch; ++b) { for (unsigned int c = 0; c < channel; ++c) { for (unsigned int i = 0; i < out_map_size; ++i) { @@ -195,9 +206,9 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { result_data += in_map_size; } } - } break; - case props::PoolingTypeInfo::Enum::global_average: - case props::PoolingTypeInfo::Enum::average: { + }; + + auto apply_average = [&](T *result_data) { int height_stride_end = height - p_height + pt; int width_stride_end = width - p_width + pl; const int *iter = pool_helper.getData(); @@ -207,7 +218,7 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { for (int j = -pt; j <= height_stride_end; j += stride[0]) { K = 0; for (int k = -pl; k <= width_stride_end; k += stride[1]) { - float del = deriv.getValue(b, i, J, K) / *iter; + T del = deriv.getValue(b, i, J, K) / *iter; int patch_height_end = std::min(static_cast(j + p_height), height); int patch_width_end = @@ -217,7 +228,7 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { for (int h = start_h; h < patch_height_end; ++h) { for (int w = start_w; w < patch_width_end; ++w) { result.setValue(b, i, h, w, - result.getValue(b, i, h, w) + del); + result.getValue(b, i, h, w) + del); } } iter++; @@ -227,15 +238,16 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { } } } - } break; - case props::PoolingTypeInfo::Enum::global_max: { - const float *deriv_data = deriv.getData(); + }; + + auto apply_global_max = [&](T *result_data) { + const T *deriv_data = deriv.getData(); for (unsigned int b = 0; b < batch; b++) { for (unsigned int c = 0; c < channel; c++) { const int *iter = pool_helper.getData() + pool_helper.getIndex(b, c, 0, 0); unsigned int helper_size = pool_helper_size[b * channel + c]; - float der = *deriv_data / helper_size; + T der = *deriv_data / static_cast(helper_size); for (unsigned int idx = 0; idx < helper_size; idx++) result_data[iter[idx]] += der; @@ -244,7 +256,40 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { result_data += in_map_size; } } - } break; + }; + + switch (pooling_type) { + case props::PoolingTypeInfo::Enum::max: + if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP32) + apply_max(result.getData()); +#ifdef ENABLE_FP16 + else if (in_dim.getDataType() == 
ml::train::TensorDim::DataType::FP16) + apply_max(result.getData<_FP16>()); +#endif + else + throw std::runtime_error("Not supported datatype"); + break; + case props::PoolingTypeInfo::Enum::global_average: + case props::PoolingTypeInfo::Enum::average: + if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP32) + apply_average(result.getData()); +#ifdef ENABLE_FP16 + else if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP16) + apply_average(result.getData<_FP16>()); +#endif + else + throw std::runtime_error("Not supported datatype"); + break; + case props::PoolingTypeInfo::Enum::global_max: + if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP32) + apply_global_max(result.getData()); +#ifdef ENABLE_FP16 + else if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP16) + apply_global_max(result.getData<_FP16>()); +#endif + else + throw std::runtime_error("Not supported datatype"); + break; default: throw std::runtime_error("Error: Unknown Pooling Type"); } @@ -290,124 +335,167 @@ void Pooling2DLayer::pooling2d(Tensor &in, bool training, Tensor &output, * @param start_w (width index pointing the start of the patch) * @return result value of pooling */ - std::function pool_fn; + PoolFunc::Type pool_fn_fp32; +#ifdef ENABLE_FP16 + PoolFunc<_FP16>::Type pool_fn_fp16; +#endif unsigned int max_idx_count = 0; - switch (pooling_type) { - case props::PoolingTypeInfo::Enum::max: { - pool_fn = [&](const float *in_data, int channel_idx, int start_h, - int start_w) { - int end_h = start_h + patch_height; - int end_w = start_w + patch_width; - - float max_val = std::numeric_limits::lowest(); - - int cur_max_idx = -1; - int eff_end_h = std::min(end_h, in_height); - int eff_end_w = std::min(end_w, in_width); - start_w = std::max(0, start_w); - for (int h = std::max(0, start_h); h < eff_end_h; ++h) { - for (int w = start_w; w < eff_end_w; ++w) { - int cur_idx = h * in_width + w; - float val = in_data[cur_idx]; - if (max_val < val) { - max_val = val; - if (training) { - cur_max_idx = cur_idx; - } + + auto pool_fn_max = [&](const T *in_data, int channel_idx, + int start_h, int start_w) { + int end_h = start_h + patch_height; + int end_w = start_w + patch_width; + + T max_val = std::numeric_limits::lowest(); + + int cur_max_idx = -1; + int eff_end_h = std::min(end_h, in_height); + int eff_end_w = std::min(end_w, in_width); + start_w = std::max(0, start_w); + for (int h = std::max(0, start_h); h < eff_end_h; ++h) { + for (int w = start_w; w < eff_end_w; ++w) { + int cur_idx = h * in_width + w; + T val = in_data[cur_idx]; + if (max_val < val) { + max_val = val; + if (training) { + cur_max_idx = cur_idx; } } } + } - if (training) { - pool_helper.setValueInt(max_idx_count++, cur_max_idx); - } + if (training) { + pool_helper.setValueInt(max_idx_count++, cur_max_idx); + } - return max_val; - }; - break; - } - case props::PoolingTypeInfo::Enum::global_max: { - pool_fn = [&, this](const float *in_data, int channel_idx, int start_h, - int start_w) { - int end_h = start_h + patch_height; - int end_w = start_w + patch_width; - - float max_val = std::numeric_limits::lowest(); - int *helper_data = pool_helper.getData(); - helper_data += channel_idx * in_height * in_width; - - for (int h = start_h; h < end_h; ++h) { - for (int w = start_w; w < end_w; ++w) { - int cur_idx = h * in_width + w; - float val = in_data[cur_idx]; - if (max_val < val) { - max_val = val; - max_idx_count = 0; - } + return max_val; + }; - if (training && max_val == val) { - *(helper_data + max_idx_count++) = 
cur_idx; - } + auto pool_fn_global_max = [&, this](const T *in_data, + int channel_idx, int start_h, + int start_w) { + int end_h = start_h + patch_height; + int end_w = start_w + patch_width; + + T max_val = std::numeric_limits::lowest(); + int *helper_data = pool_helper.getData(); + helper_data += channel_idx * in_height * in_width; + + for (int h = start_h; h < end_h; ++h) { + for (int w = start_w; w < end_w; ++w) { + int cur_idx = h * in_width + w; + T val = in_data[cur_idx]; + if (max_val < val) { + max_val = val; + max_idx_count = 0; } - } - pool_helper_size[batch_idx * in.channel() + channel_idx] = max_idx_count; - return max_val; - }; - break; - } - case props::PoolingTypeInfo::Enum::global_average: - case props::PoolingTypeInfo::Enum::average: { - pool_fn = [&](const float *in_data, int channel_idx, int start_h, - int start_w) { - int end_h = start_h + patch_height; - int end_w = start_w + patch_width; - float total = 0.0f; - - int eff_end_h = std::min(end_h, in_height); - int eff_end_w = std::min(end_w, in_width); - int eff_start_h = std::max(0, start_h); - int eff_start_w = std::max(0, start_w); - - int cnt = (eff_end_h - eff_start_h) * (eff_end_w - eff_start_w); - for (int h = eff_start_h; h < eff_end_h; ++h) { - for (int w = eff_start_w; w < eff_end_w; ++w) { - float val = in_data[h * in_width + w]; - total += val; + if (training && max_val == val) { + *(helper_data + max_idx_count++) = cur_idx; } } + } - if (training) { - pool_helper.setValueInt(max_idx_count++, cnt); + pool_helper_size[batch_idx * in.channel() + channel_idx] = max_idx_count; + return max_val; + }; + + auto pool_fn_average = [&](const T *in_data, int channel_idx, + int start_h, int start_w) { + int end_h = start_h + patch_height; + int end_w = start_w + patch_width; + T total = static_cast(0.0f); + + int eff_end_h = std::min(end_h, in_height); + int eff_end_w = std::min(end_w, in_width); + int eff_start_h = std::max(0, start_h); + int eff_start_w = std::max(0, start_w); + + int cnt = (eff_end_h - eff_start_h) * (eff_end_w - eff_start_w); + for (int h = eff_start_h; h < eff_end_h; ++h) { + for (int w = eff_start_w; w < eff_end_w; ++w) { + T val = in_data[h * in_width + w]; + total += val; } - return total / cnt; - }; + } + + if (training) { + pool_helper.setValueInt(max_idx_count++, cnt); + } + return total / cnt; + }; + + switch (pooling_type) { + case props::PoolingTypeInfo::Enum::max: + pool_fn_fp32 = pool_fn_max; +#ifdef ENABLE_FP16 + pool_fn_fp16 = pool_fn_max; +#endif + break; + case props::PoolingTypeInfo::Enum::global_max: + pool_fn_fp32 = pool_fn_global_max; +#ifdef ENABLE_FP16 + pool_fn_fp16 = pool_fn_global_max; +#endif + break; + case props::PoolingTypeInfo::Enum::global_average: + case props::PoolingTypeInfo::Enum::average: + pool_fn_fp32 = pool_fn_average; +#ifdef ENABLE_FP16 + pool_fn_fp16 = pool_fn_average; +#endif break; - } case props::PoolingTypeInfo::Enum::unknown: default: throw std::invalid_argument("unknown pooling type given"); break; } - const float *in_data = in.getData(); - float *out_data = output.getData(); - - unsigned int map_size = in_height * in_width; - - int height_stride_end = height - patch_height - pt; - int width_stride_end = width - patch_width - pl; - for (unsigned int i = 0; i < channel; ++i) { - const float *in_data_channel_sliced = in_data + i * map_size; - for (int j = -pt; j <= height_stride_end; j += stride[0]) { - for (int k = -pl; k <= width_stride_end; k += stride[1]) { - float pool_value = pool_fn(in_data_channel_sliced, i, j, k); - *out_data = pool_value; - 
out_data++; + if (in.getDataType() == ml::train::TensorDim::DataType::FP32) { + const float *in_data = in.getData(); + float *out_data = output.getData(); + + unsigned int map_size = in_height * in_width; + + int height_stride_end = height - patch_height - pt; + int width_stride_end = width - patch_width - pl; + for (unsigned int i = 0; i < channel; ++i) { + const float *in_data_channel_sliced = in_data + i * map_size; + for (int j = -pt; j <= height_stride_end; j += stride[0]) { + for (int k = -pl; k <= width_stride_end; k += stride[1]) { + float pool_value = pool_fn_fp32(in_data_channel_sliced, i, j, k); + *out_data = pool_value; + out_data++; + } + } + } + } +#ifdef ENABLE_FP16 + else if (in.getDataType() == ml::train::TensorDim::DataType::FP16) { + const _FP16 *in_data = in.getData<_FP16>(); + _FP16 *out_data = output.getData<_FP16>(); + + unsigned int map_size = in_height * in_width; + + int height_stride_end = height - patch_height - pt; + int width_stride_end = width - patch_width - pl; + for (unsigned int i = 0; i < channel; ++i) { + const _FP16 *in_data_channel_sliced = in_data + i * map_size; + for (int j = -pt; j <= height_stride_end; j += stride[0]) { + for (int k = -pl; k <= width_stride_end; k += stride[1]) { + _FP16 pool_value = pool_fn_fp16(in_data_channel_sliced, i, j, k); + *out_data = pool_value; + out_data++; + } } } } +#endif + else { + throw std::runtime_error("Not supported datatype"); + } } void Pooling2DLayer::setBatch(RunLayerContext &context, unsigned int batch) { diff --git a/nntrainer/layers/reshape_layer.cpp b/nntrainer/layers/reshape_layer.cpp index 0f82d84f3a..07564b3970 100644 --- a/nntrainer/layers/reshape_layer.cpp +++ b/nntrainer/layers/reshape_layer.cpp @@ -42,6 +42,7 @@ void ReshapeLayer::finalize(InitLayerContext &context) { } out_dim.batch(in_dim.batch()); + out_dim.setDataType(in_dim.getDataType()); context.setOutputDimensions({out_dim}); } From 40a523378b7a83239ac90c337c0319a181558440 Mon Sep 17 00:00:00 2001 From: Donghak PARK Date: Tue, 7 May 2024 16:27:54 +0900 Subject: [PATCH 13/14] [Mixed] Mixed Precision Layer update This PR is to update the mixed precision layer. - integrate #2568 & #2455 - will update more test **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. 
Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: Donghak PARK --- nntrainer/layers/layer_context.cpp | 51 ------------- nntrainer/layers/layer_context.h | 38 ---------- nntrainer/layers/layer_node.cpp | 27 +------ nntrainer/layers/layer_node.h | 5 -- .../loss/cross_entropy_sigmoid_loss_layer.cpp | 3 - .../loss/cross_entropy_softmax_loss_layer.cpp | 36 ++------- nntrainer/layers/loss/loss_layer.cpp | 3 - nntrainer/layers/loss/loss_layer.h | 13 ---- nntrainer/layers/loss/meson.build | 4 +- nntrainer/layers/loss/mse_loss_layer.cpp | 75 ++++--------------- nntrainer/optimizers/adam.cpp | 56 +++----------- nntrainer/optimizers/optimizer_context.cpp | 15 ---- nntrainer/optimizers/optimizer_context.h | 15 ---- 13 files changed, 39 insertions(+), 302 deletions(-) diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp index 648eae30ee..5862e6af14 100644 --- a/nntrainer/layers/layer_context.cpp +++ b/nntrainer/layers/layer_context.cpp @@ -157,16 +157,6 @@ Tensor &RunLayerContext::getWeight(unsigned int idx) const { return weights[idx]->getVariableRef(); } -/** - * @brief Get the Weight tensor object - * - * @param idx Identifier of the weight - * @return Tensor& Reference to the weight tensor - */ -Tensor *RunLayerContext::getWeightMaster(unsigned int idx) const { - return weights[idx]->getVariableMasterRef(); -} - /** * @brief Get the Weight Gradient tensor object * @@ -205,18 +195,6 @@ Tensor &RunLayerContext::getWeightOptVar(unsigned int idx, return weights[idx]->getOptimizerVariableRef(jdx); } -/** - * @brief Get the Weight Optimizer Variable tensor object - * - * @param idx Identifier of the weight - * @param jdx Identifier of the optimizer variables - * @return Tensor& Reference to the weight optimizer variable tensor - */ -Tensor &RunLayerContext::getWeightOptMasterVar(unsigned int idx, - unsigned int jdx) const { - return weights[idx]->getOptimizerMasterVariableRef(jdx); -} - /** * @brief Get the Number of Weight Optimizer Variable tensor object * @@ -227,16 +205,6 @@ unsigned int RunLayerContext::getNumWeightOptVar(unsigned int idx) const { return weights[idx]->getNumOptVariable(); } -/** - * @brief Get the Number of Weight Optimizer Variable tensor object - * - * @param idx Identifier of the weight - * @return int Number of the weight optimizer variable - */ -unsigned int RunLayerContext::getNumWeightOptMasterVar(unsigned int idx) const { - return weights[idx]->getNumOptMasterVariable(); -} - /** * @brief Get regularization loss for the weight * @@ -376,25 +344,6 @@ Tensor &RunLayerContext::getOutgoingDerivative(unsigned int idx) { return getInputGrad(idx); } -bool RunLayerContext::validateDerivatives() { - auto num_in = getNumInputs(); - auto num_out = getNumOutputs(); - - for (unsigned int i = 0; i < num_in; ++i) { - auto deriv = getIncomingDerivative(i); - if (deriv.checkDataValidation(false) == false) - return false; - } - - for (unsigned int i = 0; i < num_out; ++i) { - auto deriv = getOutgoingDerivative(i); - if (deriv.checkDataValidation(false) == false) - return false; - } - - return true; -} - /** * @brief Get the Tensor object * diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h index 3e2e3d0339..c68c42f11d 100644 --- a/nntrainer/layers/layer_context.h +++ b/nntrainer/layers/layer_context.h @@ -474,14 +474,6 @@ class RunLayerContext { */ Tensor &getWeight(unsigned int idx) const; - /** - * @brief Get the Weight master tensor object - * - * @param idx Identifier of the weight - * @return Tensor& Reference to the 
weight tensor - */ - Tensor *getWeightMaster(unsigned int idx) const; - /** * @brief Get the Weight Gradient tensor object * @@ -509,15 +501,6 @@ class RunLayerContext { */ Tensor &getWeightOptVar(unsigned int idx, unsigned int jdx) const; - /** - * @brief Get the Weight Optimizer Master Variable tensor object - * - * @param idx Identifier of the weight - * @param jdx Identifier of the weight optimizer master variable - * @return Tensor& Reference to the weight optimizer tensor - */ - Tensor &getWeightOptMasterVar(unsigned int idx, unsigned int jdx) const; - /** * @brief Get the Weight name * @@ -628,11 +611,6 @@ class RunLayerContext { */ Tensor &getOutgoingDerivative(unsigned int idx); - /** - * @brief validate input/output derivatives of the layer - */ - bool validateDerivatives(); - /** * @brief Get the Tensor object * @@ -762,14 +740,6 @@ class RunLayerContext { */ unsigned int getNumWeightOptVar(unsigned int idx) const; - /** - * @brief Get the Number of Weight Optimizer Variable tensor object - * - * @param idx Identifier of the weight - * @return unsigned int Number of the weight optimizer variable - */ - unsigned int getNumWeightOptMasterVar(unsigned int idx) const; - /** * @brief Get the number of requested tensors objects * @@ -777,14 +747,6 @@ class RunLayerContext { */ unsigned int getNumTensors() const { return tensors.size(); } - /** - * @brief Set the Weight Optimizer Variable tensor object - * - * @param idx Identifier of the weight - * @param jdx Identifier of the weight optimizer variable - */ - void setWeightOptVars(unsigned int idx, std::vector opts); - /** * @brief Set the batch for the run context * diff --git a/nntrainer/layers/layer_node.cpp b/nntrainer/layers/layer_node.cpp index 6eb4b279de..f41752a4d8 100644 --- a/nntrainer/layers/layer_node.cpp +++ b/nntrainer/layers/layer_node.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -466,12 +465,8 @@ void LayerNode::read(std::ifstream &file, bool opt_var) { for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { if (run_context->isGradientLastAccess(i) && getTrainable()) { /// @note read optimizer variables - auto num_w_opt_m = run_context->getNumWeightOptMasterVar(i); for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i); ++j) { - if (num_w_opt_m > 0) - run_context->getWeightOptMasterVar(i, j).read(file); - else - run_context->getWeightOptVar(i, j).read(file); + run_context->getWeightOptVar(i, j).read(file); } } } @@ -479,11 +474,7 @@ void LayerNode::read(std::ifstream &file, bool opt_var) { for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { /// @note shared weights are only be read at the first acecss if (run_context->isGradientLastAccess(i)) { - auto w = run_context->getWeightMaster(i); - if (w) - w->read(file); - else - run_context->getWeight(i).read(file); + run_context->getWeight(i).read(file); } } } @@ -498,13 +489,9 @@ void LayerNode::save(std::ofstream &file, bool opt_var) const { if (run_context->isGradientLastAccess(i) && getTrainable()) { // @note save optimizer variables if (run_context->weightHasGradient(i)) { - auto num_w_opt_m = run_context->getNumWeightOptMasterVar(i); for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i); ++j) { - if (num_w_opt_m > 0) - run_context->getWeightOptMasterVar(i, j).save(file); - else - run_context->getWeightOptVar(i, j).save(file); + run_context->getWeightOptVar(i, j).save(file); } } } @@ -513,13 +500,7 @@ void LayerNode::save(std::ofstream &file, bool opt_var) const { // @note shared weights are 
only be saved at the first access for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { if (run_context->isGradientLastAccess(i)) { - if (run_context->getNumWeights()) { - auto w = run_context->getWeightMaster(i); - if (w) - w->save(file); - else - run_context->getWeight(i).save(file); - } + run_context->getWeight(i).save(file); } } } diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h index 6be31f536e..3fd2d55b97 100644 --- a/nntrainer/layers/layer_node.h +++ b/nntrainer/layers/layer_node.h @@ -900,11 +900,6 @@ class LayerNode final : public ml::train::Layer, public GraphNode { */ bool needsCalcGradient() { return needs_calc_gradient; } - /** - * @brief Set loss scale factor - */ - void setLossScale(float scale) { layer->setLossScale(scale); } - private: /** * @brief Get the Input Layers object diff --git a/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp b/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp index feeff2b3d8..60ea113418 100644 --- a/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp +++ b/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp @@ -61,9 +61,6 @@ void CrossEntropySigmoidLossLayer::calcDerivative(RunLayerContext &context) { Tensor &y = context.getInput(SINGLE_INOUT_IDX); y.apply(ActiFunc::sigmoid, ret_derivative); - - applyLossScale(ret_derivative); - ret_derivative.subtract_i(y2); if (ret_derivative.divide_i(ret_derivative.size()) != ML_ERROR_NONE) { throw std::runtime_error("[CrossEntropySigmoidLossLayer::calcDerivative] " diff --git a/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp b/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp index c181c60b9a..53854662ae 100644 --- a/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp +++ b/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp @@ -30,14 +30,9 @@ void CrossEntropySoftmaxLossLayer::forwarding(RunLayerContext &context, Tensor &y = context.getInput(SINGLE_INOUT_IDX); // fill the output - auto out_type = hidden_.getDataType(); - if (out_type == ml::train::TensorDim::DataType::FP32) { - if (y.getDataType() != out_type) { - Tensor y_ = y.clone(out_type); - hidden_ = y_.apply(ActiFunc::softmax, hidden_); - } else { - hidden_ = y.apply(ActiFunc::softmax, hidden_); - } + auto dataType = y.getDataType(); + if (dataType == ml::train::TensorDim::DataType::FP32) { + hidden_ = y.apply(ActiFunc::softmax, hidden_); if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); @@ -48,14 +43,9 @@ void CrossEntropySoftmaxLossLayer::forwarding(RunLayerContext &context, // update the loss value LossLayer::updateLoss(context, l); } - } else if (out_type == ml::train::TensorDim::DataType::FP16) { + } else if (dataType == ml::train::TensorDim::DataType::FP16) { #ifdef ENABLE_FP16 - if (y.getDataType() != out_type) { - Tensor y_ = y.clone(out_type); - hidden_ = y_.apply(ActiFunc::softmax<_FP16>, hidden_); - } else { - hidden_ = y.apply(ActiFunc::softmax<_FP16>, hidden_); - } + hidden_ = y.apply(ActiFunc::softmax<_FP16>, hidden_); if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); @@ -78,8 +68,7 @@ void CrossEntropySoftmaxLossLayer::calcDerivative(RunLayerContext &context) { Tensor &y = context.getInput(SINGLE_INOUT_IDX); auto dataType = y.getDataType(); - - Tensor ret(y.getDim()); + Tensor ret = Tensor("ret", y.getFormat(), y.getDataType()); if (dataType == ml::train::TensorDim::DataType::FP32) { y.apply(ActiFunc::softmax, ret); } else if 
(dataType == ml::train::TensorDim::DataType::FP16) {
@@ -94,18 +83,7 @@ void CrossEntropySoftmaxLossLayer::calcDerivative(RunLayerContext &context) {
   /// operation
   // TODO: verify y and ret_derivative must not be same as loss layer is not
   // working in-place
-  if (ret.getDataType() != y2.getDataType()) {
-    ret.subtract(y2.clone(ret.getDataType()), ret_derivative);
-  } else {
-    ret.subtract(y2, ret_derivative);
-  }
-
-  /**
-   * loss scale is applied for mixed precision
-   * every loss layers need to specify this applying code.
-   */
-  applyLossScale(ret_derivative);
-
+  ret.subtract(y2, ret_derivative);
   if (ret_derivative.divide_i(ret.batch()) != ML_ERROR_NONE) {
     throw std::runtime_error("[CrossEntropySoftmaxLossLayer::calcDerivative] "
                              "Error when calculating loss");
diff --git a/nntrainer/layers/loss/loss_layer.cpp b/nntrainer/layers/loss/loss_layer.cpp
index 422037b9e9..ab2ccf8be2 100644
--- a/nntrainer/layers/loss/loss_layer.cpp
+++ b/nntrainer/layers/loss/loss_layer.cpp
@@ -15,9 +15,6 @@
 #include
 
 namespace nntrainer {
-
-LossLayer::LossLayer() : Layer(), loss_scale(0.0f) {}
-
 void LossLayer::finalize(InitLayerContext &context) {
   std::vector input_dim = context.getInputDimensions();
   std::vector output_dim = input_dim;
diff --git a/nntrainer/layers/loss/loss_layer.h b/nntrainer/layers/loss/loss_layer.h
index 84a1112864..581e9477a8 100644
--- a/nntrainer/layers/loss/loss_layer.h
+++ b/nntrainer/layers/loss/loss_layer.h
@@ -27,11 +27,6 @@ namespace nntrainer {
  */
 class LossLayer : public Layer {
 public:
-  /**
-   * @brief Constructor of Loss Layer
-   */
-  LossLayer();
-
   /**
    * @brief Destructor of Loss Layer
   */
@@ -52,19 +47,11 @@ class LossLayer : public Layer {
    */
   virtual bool supportBackwarding() const override { return true; }
 
-  /**
-   * @brief Set loss scale factor
-   */
-  virtual void setLossScale(float scale) override { loss_scale = scale; }
-
-private:
   /**
    * @copydoc Layer::requireLabel()
    */
   bool requireLabel() const override { return true; }
 
-  float loss_scale; /**< loss scale factor */
-
 protected:
   /**
    * @brief update loss
diff --git a/nntrainer/layers/loss/meson.build b/nntrainer/layers/loss/meson.build
index 8ec9928101..9fccd0290d 100644
--- a/nntrainer/layers/loss/meson.build
+++ b/nntrainer/layers/loss/meson.build
@@ -7,9 +7,7 @@ loss_layer_sources = [
   'constant_derivative_loss_layer.cpp'
 ]
 
-loss_layer_headers = [
-  'loss_layer.h'
-]
+loss_layer_headers = []
 
 loss_layer_deps = []
 
diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp
index e6b6f068f6..ed4390655d 100644
--- a/nntrainer/layers/loss/mse_loss_layer.cpp
+++ b/nntrainer/layers/loss/mse_loss_layer.cpp
@@ -11,7 +11,6 @@
  *
  */
 
-#include "tensor.h"
 #include
 #include
 
@@ -33,39 +32,21 @@ void MSELossLayer::forwarding(RunLayerContext &context, bool training) {
               .clone(ml::train::TensorDim::DataType::FP32);
 
   // hidden_ <- y2 - y;
-  auto out_type = hidden_.getDataType();
-  if (out_type != y_.getDataType()) {
-    Tensor y = y_.clone(out_type);
-    if (context.isLabelAvailable(SINGLE_INOUT_IDX)) {
-      Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX);
-      y2.subtract(y, hidden_);
-
-      /** calculate sum of squares normalized by size */
-      float l2norm = hidden_.l2norm();
-      l2norm *= l2norm / hidden_.size();
-
-      /** wrap in tensor for update loss */
-      Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm);
-      LossLayer::updateLoss(context, l);
-    }
-    // fill the output
-    hidden_.fill(y);
-  } else {
-    if (context.isLabelAvailable(SINGLE_INOUT_IDX)) {
-      Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX);
-      y2.subtract(y_, hidden_);
-
-      /** calculate sum of squares normalized by size */
-      float l2norm = hidden_.l2norm();
-      l2norm *= l2norm / hidden_.size();
-
-      /** wrap in tensor for update loss */
-      Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm);
-      LossLayer::updateLoss(context, l);
-    }
-    // fill the output
-    hidden_.fill(y_);
+  if (context.isLabelAvailable(SINGLE_INOUT_IDX)) {
+    Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX);
+    y2.subtract(y, hidden_);
+
+    /** calculate sum of squares normalized by size */
+    float l2norm = hidden_.l2norm();
+    l2norm *= l2norm / hidden_.size();
+
+    /** wrap in tensor for update loss */
+    Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm);
+    LossLayer::updateLoss(context, l);
   }
+
+  // fill the output
+  hidden_.fill(y);
 }
 
 void MSELossLayer::calcDerivative(RunLayerContext &context) {
@@ -92,33 +73,9 @@ void MSELossLayer::calcDerivative(RunLayerContext &context) {
   const Tensor &y2 = context.getIncomingDerivative(SINGLE_INOUT_IDX);
 
-  const auto &in_type = y.getDataType();
-  if (in_type != y2.getDataType()) {
-    Tensor y2_ = y2.clone(in_type);
-    y.subtract(y2_, ret_derivative);
-  } else {
-    y.subtract(y2, ret_derivative);
-  }
-
-  applyLossScale(ret_derivative);
-
+  y.subtract(y2, ret_derivative);
   float divider = ((float)y.size()) / 2;
-
-  /**
-   * ret_derivative may be eliminated by big divider with fp16 calculation.
-   * So, it calcuated with larger precision.
-   */
-  int ret;
-  if (ret_derivative.getDataType() != ml::train::TensorDim::DataType::FP32) {
-    Tensor ret_derivative_ =
-      ret_derivative.clone(ml::train::TensorDim::DataType::FP32);
-    ret = ret_derivative_.divide_i(divider);
-    ret_derivative.copyData(ret_derivative_);
-  } else {
-    ret = ret_derivative.divide_i(divider);
-  }
-
-  if (ret != ML_ERROR_NONE) {
+  if (ret_derivative.divide_i(divider) != ML_ERROR_NONE) {
     throw std::runtime_error(
       "[MSELossLayer::calcDerivative] Error when calculating loss");
diff --git a/nntrainer/optimizers/adam.cpp b/nntrainer/optimizers/adam.cpp
index e2d611b7fd..530e7fdf31 100644
--- a/nntrainer/optimizers/adam.cpp
+++ b/nntrainer/optimizers/adam.cpp
@@ -95,60 +95,26 @@ void Adam::applyGradient(RunOptimizerContext &context) {
   Tensor &wm = context.getOptimizerVariable(AdamParams::wm);
   Tensor &wv = context.getOptimizerVariable(AdamParams::wv);
 
-  if (context.getNumOptMasterVariable() != 0) {
-    Tensor &wm_m = context.getOptimizerMasterVariable(AdamParams::wm);
-    Tensor &wv_m = context.getOptimizerMasterVariable(AdamParams::wv);
-    Tensor x_grad_ = x_grad.clone(wm_m.getDataType());
+  wm.multiply_i(beta1);
+  wm.add_i(x_grad, 1.0f - beta1);
 
-    wm_m.multiply_i(beta1);
-    wm_m.add_i(x_grad_, 1.0f - beta1);
-
-    wv_m.multiply_i(beta2);
-    wv_m.add_i(x_grad_.multiply(x_grad_), 1.0f - beta2);
-
-    wm.copyData(wm_m);
-    wv.copyData(wv_m);
-  } else {
-    wm.multiply_i(beta1);
-    wm.add_i(x_grad, 1.0f - beta1);
-
-    wv.multiply_i(beta2);
-    wv.add_i(x_grad.multiply(x_grad), 1.0f - beta2);
-  }
+  wv.multiply_i(beta2);
+  wv.add_i(x_grad.multiply(x_grad), 1.0f - beta2);
 
   if (torch_ref) {
-    if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP32) {
-      Tensor denom = wv.apply(sqrtFloat);
-      denom.divide_i(sqrtFloat(biasCorrection2));
-      denom.add_i(epsilon);
-      wm.divide(denom, x_grad);
-#ifdef ENABLE_FP16
-    } else if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP16) {
-      Tensor denom = wv.apply<_FP16>(sqrtFloat<_FP16>);
-      denom.divide_i(sqrtFloat(biasCorrection2));
-      denom.add_i(epsilon);
-      wm.divide(denom, x_grad);
-#endif
-    } else {
-      throw std::runtime_error("Not supported datatype");
-    }
+    Tensor denom = wv.apply(sqrtFloat);
+    denom.divide_i(sqrtFloat(biasCorrection2));
+    denom.add_i(epsilon);
+    wm.divide(denom, x_grad);
 
     context.applyGradient(context.getLearningRate() / biasCorrection1, x_grad);
   } else {
-    auto sqrtEps = [epsilon](T f) -> T {
-      return 1 / (static_cast(sqrtDouble(f)) + static_cast(epsilon));
+    std::function sqrtEps = [epsilon](double f) {
+      return 1 / (sqrtDouble(f) + epsilon);
     };
-    if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP32)
-      x_grad = wv.apply(sqrtEps, x_grad);
-#ifdef ENABLE_FP16
-    else if (x_grad.getDataType() == ml::train::TensorDim::DataType::FP16)
-      x_grad = wv.apply<_FP16>(sqrtEps, x_grad);
-#endif
-    else
-      throw std::runtime_error("Not supported datatype");
-
+    x_grad = wv.apply(sqrtEps, x_grad);
     x_grad.multiply_i(wm);
 
     context.applyGradient(
       getUpdatedLearningRate(context.getIteration(), context.getLearningRate()),
diff --git a/nntrainer/optimizers/optimizer_context.cpp b/nntrainer/optimizers/optimizer_context.cpp
index f0817c2a75..f70ab773a9 100644
--- a/nntrainer/optimizers/optimizer_context.cpp
+++ b/nntrainer/optimizers/optimizer_context.cpp
@@ -36,21 +36,6 @@ Tensor &RunOptimizerContext::getOptimizerVariable(unsigned int idx) const {
   return weight->getOptimizerVariableRef(idx);
 }
 
-/**
- * @brief Get the optimizer variable associated to this weight
- */
-Tensor &
-RunOptimizerContext::getOptimizerMasterVariable(unsigned int idx) const {
-  return weight->getOptimizerMasterVariableRef(idx);
-}
-
-/**
- * @brief Get number of optimizer master variable
- */
-int RunOptimizerContext::getNumOptMasterVariable() {
-  return weight->getNumOptMasterVariable();
-}
-
 /**
  * @brief Apply the gradient with the given learning rate
  */
diff --git a/nntrainer/optimizers/optimizer_context.h b/nntrainer/optimizers/optimizer_context.h
index ca30c36b94..6b4b983e35 100644
--- a/nntrainer/optimizers/optimizer_context.h
+++ b/nntrainer/optimizers/optimizer_context.h
@@ -59,21 +59,6 @@ class RunOptimizerContext {
    */
   Tensor &getOptimizerVariable(unsigned int idx) const;
 
-  /**
-   * @brief Get the optimizer Master variable associated to this weight
-   *
-   * @param idx Identifier of the associated weight
-   * @return Tensor& Reference to the optimizer variable
-   */
-  Tensor &getOptimizerMasterVariable(unsigned int idx) const;
-
-  /**
-   * @brief Get number of the optimizer Master variable
-   *
-   * @return number of optimizer master variable
-   */
-  int getNumOptMasterVariable();
-
   /**
    * @brief Check if run context is set and is ready to use
    *

From edd5778f9fc0e4188da3e5b4efd67ddced0fde3a Mon Sep 17 00:00:00 2001
From: Donghak PARK
Date: Fri, 17 May 2024 18:35:25 +0900
Subject: [PATCH 14/14] [unittest] Add mixed precision unit test

Add 2 cases of mixed precision unit tests:
- case1: FC-FC-FC
- case2: Flatten-FC

I will add more mixed precision cases (conv, lstm, etc.).

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Donghak PARK
---
 .../unittest_models_mixed_precision.cpp      | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/test/unittest/models/unittest_models_mixed_precision.cpp b/test/unittest/models/unittest_models_mixed_precision.cpp
index becf11ff44..afed06960a 100644
--- a/test/unittest/models/unittest_models_mixed_precision.cpp
+++ b/test/unittest/models/unittest_models_mixed_precision.cpp
@@ -41,11 +41,52 @@ static std::unique_ptr fc_mixed_training() {
   return nn;
 }
 
+static std::unique_ptr multi_fc_mixed_training() {
+  std::unique_ptr nn(new NeuralNetwork());
+  nn->setProperty(
+    {"batch_size=2", "model_tensor_type=FP16-FP16", "loss_scale=128"});
+
+  auto graph = makeGraph({
+    {"input", {"name=in", "input_shape=1:28:28"}},
+    {"Fully_connected", {"name=fc1", "input_layers=in", "unit=100"}},
+    {"Fully_connected", {"name=fc2", "input_layers=fc1", "unit=50"}},
+    {"Fully_connected", {"name=fc3", "input_layers=fc2", "unit=10"}},
+    {"mse", {"name=loss", "input_layers=fc3"}},
+  });
+  for (auto &node : graph) {
+    nn->addLayer(node);
+  }
+  nn->setOptimizer(ml::train::createOptimizer("adam", {"learning_rate=0.1"}));
+  return nn;
+}
+
+static std::unique_ptr flatten_fc_mixed_training() {
+  std::unique_ptr nn(new NeuralNetwork());
+  nn->setProperty(
+    {"batch_size=2", "model_tensor_type=FP16-FP16", "loss_scale=128"});
+
+  auto graph = makeGraph({
+    {"input", {"name=in", "input_shape=1:28:28"}},
+    {"flatten", {"name=flatten", "input_layers=in"}},
+    {"Fully_connected", {"name=fc", "input_layers=flatten", "unit=10"}},
+    {"mse", {"name=loss", "input_layers=fc"}},
+  });
+  for (auto &node : graph) {
+    nn->addLayer(node);
+  }
+  nn->setOptimizer(ml::train::createOptimizer("adam", {"learning_rate=0.1"}));
+  return nn;
+}
+
 GTEST_PARAMETER_TEST(
   MixedPrecision, nntrainerModelTest,
   ::testing::ValuesIn({
     mkModelTc_V2(fc_mixed_training, "fc_mixed_training",
                  ModelTestOption::NO_THROW_RUN_V2),
+    mkModelTc_V2(multi_fc_mixed_training, "multi_fc_mixed_training",
+                 ModelTestOption::NO_THROW_RUN_V2),
+    mkModelTc_V2(flatten_fc_mixed_training, "flatten_fc_mixed_training",
+                 ModelTestOption::NO_THROW_RUN_V2),
     /** ModelTestOption::ALL_V2),
      * Disabled for now to check */