[ Layer ] Mixed Precision support for BN Layer
This PR adds mixed precision support for the batch normalization (BN)
layer. During training, the BN layer must run in full precision even when
the weights are stored as FP16. Therefore, the FP16 weight data has to be
read and the current weights and activations converted to FP32.

For inference, we rely on compiler optimizations such as BN fusing, so
this PR also adds an execution mode parameter to the compile step.

Because of the data conversions involved in the BN layer, test case
generation also needs to be updated: it takes the FP16 input/output
tensors and weights and converts the weights to FP32 for computation.
For verification, the FP32 results are then converted back to FP16.
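
The gist of the conversion path is sketched below. This block is not part of the commit; it is a minimal illustration that only assumes the Tensor/TensorDim calls visible in the diff (clone, copyData, getDataType, getDim, setDataType).

```cpp
// Sketch only: the FP32-compute / FP16-storage pattern this PR applies to
// the BN layer. Header names assume nntrainer's usual layout.
#include <tensor.h>
#include <tensor_dim.h>

using namespace nntrainer;

void bn_forward_full_precision(const Tensor &input, Tensor &output) {
  // Promote the (possibly FP16) activation to a full-precision working copy.
  Tensor input32 = input.getDataType() != TensorDim::DataType::FP32
                     ? input.clone(TensorDim::DataType::FP32)
                     : input;

  // FP32 working buffer for the result (the real layer reuses requested tensors).
  TensorDim out_dim = output.getDim();
  out_dim.setDataType(TensorDim::DataType::FP32);
  Tensor hidden32(out_dim, true);

  // ... normalization math runs here entirely in FP32 ...

  // Cast the FP32 result back into the caller's (possibly FP16) output tensor.
  if (output.getDataType() != hidden32.getDataType())
    output.copyData(hidden32);
}
```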

**Self evaluation:**
1. Build test:	 [X]Passed [ ]Failed [ ]Skipped
2. Run test:	 [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <jijoong.moon@samsung.com>
jijoongmoon committed Jun 3, 2024
1 parent 9cb71dd commit a5b1545
Showing 10 changed files with 288 additions and 56 deletions.
6 changes: 4 additions & 2 deletions nntrainer/graph/network_graph.cpp
@@ -938,6 +938,9 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
}
}

lnode->setDataType(init_context.getWeightDataType(),
init_context.getActivationDataType());

lnode->configureRunContext(
// TODO: update weights spec for trainable based on layer trainable prop
tensor_manager->requestWeights(gnode, init_context.getWeightsSpec(),
@@ -1198,8 +1201,7 @@ int NetworkGraph::initialize(ExecutionMode mode,
* Initialize all the layers, allocate output tensors for each layer
* and add optimizer related weights for the layer
*/
const std::vector<Var_Grad *> &outputs =
finalizeContext(lnode, inputs);
const std::vector<Var_Grad *> &outputs = finalizeContext(lnode, inputs);

/** no need to update input_map for the last layer */
if (idx == graph.size() - 1)
112 changes: 93 additions & 19 deletions nntrainer/layers/bn_layer.cpp
@@ -73,6 +73,10 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {

TensorDim dim(context.getFormat(), context.getWeightDataType());

if (context.getExecutionMode() == ml::train::ExecutionMode::TRAIN) {
dim.setDataType(TensorDim::DataType::FP32);
}

/// @note this logic cannot tell channel is actually 1 or it is just not used.
auto &axis_prop = std::get<props::Axis>(bn_props);
unsigned int axis;
@@ -99,26 +103,32 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {
}

wt_idx[BNParams::mu] =
context.requestWeight(dim, bnparams_mu, WeightRegularizer::NONE, 1.0f, 0.0f,
"moving_mean", false);
context.requestWeight(dim, dim, bnparams_mu, WeightRegularizer::NONE, 1.0f,
0.0f, "moving_mean", false);
wt_idx[BNParams::var] =
context.requestWeight(dim, bnparams_var, WeightRegularizer::NONE, 1.0f,
context.requestWeight(dim, dim, bnparams_var, WeightRegularizer::NONE, 1.0f,
0.0f, "moving_variance", false);
wt_idx[BNParams::gamma] =
context.requestWeight(dim, bnparams_gamma, WeightRegularizer::NONE, 1.0f,
weight_decay, "gamma", true);
context.requestWeight(dim, dim, bnparams_gamma, WeightRegularizer::NONE,
1.0f, weight_decay, "gamma", true);
wt_idx[BNParams::beta] =
context.requestWeight(dim, bnparams_beta, WeightRegularizer::NONE, 1.0f,
bias_decay, "beta", true);
context.requestWeight(dim, dim, bnparams_beta, WeightRegularizer::NONE,
1.0f, bias_decay, "beta", true);

/**
* caches the deviation -> input - avg(input)
* @todo check if avoiding this storage and adding dependency on input (no
* more in-place calculation) can save memory during memory optimization.
*/
TensorDim in_dim_ = in_dim;

if (context.getExecutionMode() == ml::train::ExecutionMode::TRAIN) {
in_dim_.setDataType(TensorDim::DataType::FP32);
}

wt_idx[BNParams::deviation] =
context.requestTensor(in_dim, "deviation", Tensor::Initializer::NONE, false,
TensorLifespan::ITERATION_LIFESPAN);
context.requestTensor(in_dim_, "deviation", Tensor::Initializer::NONE,
false, TensorLifespan::ITERATION_LIFESPAN);
/** caches the inverse standard deviation */
wt_idx[BNParams::invstd] =
context.requestTensor(dim, "invstd", Tensor::Initializer::NONE, false,
@@ -130,7 +140,7 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {
* as the output of this layer need not be stored all the time.
*/
wt_idx[BNParams::t_full] =
context.requestTensor(in_dim, "tensor_full", Tensor::Initializer::NONE,
context.requestTensor(in_dim_, "tensor_full", Tensor::Initializer::NONE,
false, TensorLifespan::CALC_DERIV_LIFESPAN);
/**
* caches variance + epsilon as well.
@@ -164,8 +174,32 @@ void BatchNormalizationLayer::forwarding(RunLayerContext &context,
Tensor &gamma = context.getWeight(wt_idx[BNParams::gamma]);
Tensor &beta = context.getWeight(wt_idx[BNParams::beta]);

Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
Tensor em_input, em_hidden;

Tensor &input_ = em_input;
Tensor &hidden_ = em_hidden;

if (training) {
if (context.getInput(SINGLE_INOUT_IDX).getDataType() !=
TensorDim::DataType::FP32) {
input_ =
context.getInput(SINGLE_INOUT_IDX).clone(TensorDim::DataType::FP32);
} else {
input_ = context.getInput(SINGLE_INOUT_IDX);
}

if (context.getOutput(SINGLE_INOUT_IDX).getDataType() !=
TensorDim::DataType::FP32) {
hidden_ =
context.getOutput(SINGLE_INOUT_IDX).clone(TensorDim::DataType::FP32);
} else {
hidden_ = context.getOutput(SINGLE_INOUT_IDX);
}
} else {
input_ = context.getInput(SINGLE_INOUT_IDX);
hidden_ = context.getOutput(SINGLE_INOUT_IDX);
}

Tensor &deviation = context.getTensor(wt_idx[BNParams::deviation]);
Tensor &invstd = context.getTensor(wt_idx[BNParams::invstd]);

@@ -200,21 +234,46 @@ void BatchNormalizationLayer::forwarding(RunLayerContext &context,
deviation.multiply(invstd, hidden_);
hidden_.multiply_i(gamma);
hidden_.add_i(beta);

if (training && hidden_.getDataType() !=
context.getOutput(SINGLE_INOUT_IDX).getDataType())
context.getOutput(SINGLE_INOUT_IDX).copyData(hidden_);
}

void BatchNormalizationLayer::calcDerivative(RunLayerContext &context) {

Tensor &gamma = context.getWeight(wt_idx[BNParams::gamma]);
const Tensor &deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);
Tensor &dx = context.getOutgoingDerivative(SINGLE_INOUT_IDX);

Tensor em_dx, deriv32;
bool deriv_copyed = false;

const Tensor deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);

if (deriv.getDataType() != TensorDim::DataType::FP32) {
deriv_copyed = true;
TensorDim dim = deriv.getDim();
dim.setDataType(TensorDim::DataType::FP32);
deriv32 = Tensor(dim, true);
deriv32.copyData(deriv);
}

Tensor &dx = context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType() ==
TensorDim::DataType::FP32
? context.getOutgoingDerivative(SINGLE_INOUT_IDX)
: em_dx;

if (dx.empty())
dx = context.getOutgoingDerivative(SINGLE_INOUT_IDX)
.clone(TensorDim::DataType::FP32);

Tensor &deviation = context.getTensor(wt_idx[BNParams::deviation]);
Tensor &invstd = context.getTensor(wt_idx[BNParams::invstd]);
Tensor &cvar = context.getTensor(wt_idx[BNParams::cvar]);

Tensor &t_reduced = context.getTensor(wt_idx[BNParams::t_reduced]);
Tensor &t_full = context.getTensor(wt_idx[BNParams::t_full]);

deviation.multiply(deriv, t_full);
deviation.multiply((deriv_copyed ? deriv32 : deriv), t_full);
t_full.average(axes_to_reduce, t_reduced);
t_reduced.divide_i(cvar);
deviation.multiply_i(t_reduced);
@@ -233,22 +292,37 @@ void BatchNormalizationLayer::calcDerivative(RunLayerContext &context) {
Tensor &dbeta = context.getWeightGrad(wt_idx[BNParams::beta]);
dbeta.divide(divider, t_reduced);
} else {
deriv.average(axes_to_reduce, t_reduced);
(deriv_copyed ? deriv32 : deriv).average(axes_to_reduce, t_reduced);
}

deriv.subtract(t_reduced, dx);
(deriv_copyed ? deriv32 : deriv).subtract(t_reduced, dx);
dx.subtract_i(deviation);

invstd.multiply_i(gamma);
dx.multiply_i(invstd);

if (dx.getDataType() !=
context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType())
context.getOutgoingDerivative(SINGLE_INOUT_IDX).copyData(dx);
}

void BatchNormalizationLayer::calcGradient(RunLayerContext &context) {
/** dgamma is calculated in calcDerivative. dbeta is calculated here */
Tensor &dbeta = context.getWeightGrad(wt_idx[BNParams::beta]);
const Tensor &deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);

deriv.sum(axes_to_reduce, dbeta);
Tensor deriv32;
bool deriv_copyed = false;

const Tensor deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);
if (deriv.getDataType() != TensorDim::DataType::FP32) {
deriv_copyed = true;
TensorDim dim = deriv.getDim();
dim.setDataType(TensorDim::DataType::FP32);
deriv32 = Tensor(dim, true);
deriv32.copyData(deriv);
}

(deriv_copyed ? deriv32 : deriv).sum(axes_to_reduce, dbeta);
}

void BatchNormalizationLayer::exportTo(
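
The same copy-on-mismatch pattern appears in forwarding(), calcDerivative(), and calcGradient() above. A hypothetical helper, not part of this commit, could factor it out using only the Tensor calls already present in the diff:

```cpp
// Hypothetical helper (not in this PR): return an FP32 copy of a tensor,
// materializing a new buffer only when the source is not already FP32.
#include <tensor.h>
#include <tensor_dim.h>

using namespace nntrainer;

static Tensor toFP32(const Tensor &src) {
  if (src.getDataType() == TensorDim::DataType::FP32)
    return src; // already full precision, just copy the handle

  TensorDim dim = src.getDim();
  dim.setDataType(TensorDim::DataType::FP32);

  Tensor dst(dim, true);
  dst.copyData(src); // element-wise FP16 -> FP32 cast
  return dst;
}
```

With such a helper, the `(deriv_copyed ? deriv32 : deriv)` expressions above would collapse to `toFP32(deriv)`.
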
1 change: 1 addition & 0 deletions nntrainer/layers/layer_context.cpp
@@ -294,6 +294,7 @@ const Tensor RunLayerContext::getIncomingDerivative(unsigned int idx) const {
return getOutputGrad(idx);
}


/**
* @brief Get the Input tensor object
*
29 changes: 29 additions & 0 deletions nntrainer/layers/layer_context.h
@@ -210,6 +210,35 @@ class InitLayerContext {
return weights_spec.size() - 1;
}

/**
* @brief Request a new weight for the layer
*
* @param dim dimension of Variable of the weight
* @param dim_g dimension of Gradient of the weight
* @param init initializer for the weight
* @param reg regularizer for the weight
* @param reg_const regularization constant for the weight
* @param name name of the weight
* @param trainable if the weight is trainable (require gradient or not)
* @return unsigned int index of the weight for its getter
*
* @todo Consider providing a guarantee that the returned indices will always
* start from 0 and will always be incremental.
*/
unsigned int requestWeight(const TensorDim &dim, const TensorDim &dim_g,
const Tensor::Initializer init,
const WeightRegularizer reg, const float reg_const,
const float decay, const std::string &name,
bool trainable = true, unsigned int out_axis = 3) {

/** @note : We assume the gradient type is the same as the Activation data
 * type. */
weights_spec.emplace_back(dim, dim_g, init, reg, reg_const, decay,
clip_by_global_norm, trainable,
prefix + ":" + name, out_axis, loss_scale);
return weights_spec.size() - 1;
}

/**
* @brief Request a new weight for the layer
*
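
For quick reference, the call pattern the BN layer uses for this overload is condensed below. This is a sketch, not code from the commit; shape setup is omitted, and the ONES initializer stands in for the layer's configured gamma initializer.

```cpp
// Sketch: request a BN weight through the new requestWeight(dim, dim_g, ...)
// overload. During mixed precision training the variable and gradient dims
// are both forced to FP32, mirroring bn_layer.cpp above.
TensorDim dim(context.getFormat(), context.getWeightDataType());
if (context.getExecutionMode() == ml::train::ExecutionMode::TRAIN)
  dim.setDataType(TensorDim::DataType::FP32);

unsigned int gamma_idx =
  context.requestWeight(dim, dim, Tensor::Initializer::ONES,
                        WeightRegularizer::NONE, 1.0f, /*decay=*/0.0f,
                        "gamma", /*trainable=*/true);
```
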
53 changes: 49 additions & 4 deletions nntrainer/layers/layer_node.cpp
@@ -19,6 +19,7 @@
#include <utility>

#include <activation_layer.h>
#include <bn_layer.h>
#include <app_context.h>
#include <base_properties.h>
#include <common_properties.h>
@@ -460,9 +461,11 @@ void LayerNode::exportTo(Exporter &exporter,
layer->exportTo(exporter, method);
}

void LayerNode::read(std::ifstream &file, bool opt_var) {
void LayerNode::read(std::ifstream &file, bool opt_var,
ml::train::ExecutionMode mode) {
NNTR_THROW_IF(!run_context, std::runtime_error)
<< __func__ << " layer needs to be finalized first!";

if (opt_var) {
for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
if (run_context->isGradientLastAccess(i) && getTrainable()) {
@@ -473,10 +476,29 @@ void LayerNode::read(std::ifstream &file, bool opt_var) {
}
}
} else {

for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
/// @note shared weights are only read at the first access
if (run_context->isGradientLastAccess(i)) {
run_context->getWeight(i).read(file);
if (layer->getType() == BatchNormalizationLayer::type) {
if ((mode == ml::train::ExecutionMode::TRAIN) &&
(this->getWeightDataType() != TensorDim::DataType::FP32)) {

/** @note for the batch normalization layer, we need full precision
 * for training, but the weight can be saved in another type. For
 * training, the BN weight type is fixed to full precision. */

TensorDim dim = run_context->getWeight(i).getDim();
dim.setDataType(this->getWeightDataType());
Tensor T_read(dim, true);
T_read.read(file);
run_context->getWeight(i).copyData(T_read);
} else {
run_context->getWeight(i).read(file);
}
} else {
run_context->getWeight(i).read(file);
}
if (run_context->isMixedPrecision(i) && getTrainable()) {
run_context->getWeightFP32(i).copyData(run_context->getWeight(i));
}
@@ -485,7 +507,8 @@ void LayerNode::read(std::ifstream &file, bool opt_var) {
}
}

void LayerNode::save(std::ofstream &file, bool opt_var) const {
void LayerNode::save(std::ofstream &file, bool opt_var,
ml::train::ExecutionMode mode) const {
NNTR_THROW_IF(!run_context, std::runtime_error)
<< __func__ << " layer needs to be finalized first!";

@@ -505,7 +528,29 @@ void LayerNode::save(std::ofstream &file, bool opt_var) const {
// @note shared weights are only saved at the first access
for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
if (run_context->isGradientLastAccess(i)) {
run_context->getWeight(i).save(file);

/** @note For the batch normalization layer, we need full precision for
 * training and the weight data type is full precision. But for
 * inference, we have to save them in the activation data type. */

if (layer->getType() == BatchNormalizationLayer::type) {
if ((mode == ml::train::ExecutionMode::TRAIN) &&
(this->getWeightDataType() != TensorDim::DataType::FP32)) {
TensorDim dim = run_context->getWeight(i).getDim();

dim.setDataType(this->getWeightDataType());

Tensor T_save(dim, true);

T_save.copyData(run_context->getWeight(i));

T_save.save(file);
} else {
run_context->getWeight(i).save(file);
}
} else {
run_context->getWeight(i).save(file);
}
}
}
}
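
Taken together, the read() and save() changes above give BN weights an FP32-in-memory / weight-dtype-on-disk round trip during mixed precision training. The helpers below are hypothetical (not in this PR) and only condense the Tensor file I/O calls already used in layer_node.cpp:

```cpp
// Hypothetical helpers: keep the live BN weight in FP32 while storing it on
// disk in the model's weight data type (e.g. FP16).
#include <fstream>
#include <tensor.h>
#include <tensor_dim.h>

using namespace nntrainer;

void save_bn_weight(std::ofstream &file, const Tensor &weight_fp32,
                    TensorDim::DataType disk_dtype) {
  TensorDim dim = weight_fp32.getDim();
  dim.setDataType(disk_dtype);   // e.g. FP16 on disk

  Tensor on_disk(dim, true);
  on_disk.copyData(weight_fp32); // FP32 -> FP16 cast
  on_disk.save(file);
}

void read_bn_weight(std::ifstream &file, Tensor &weight_fp32,
                    TensorDim::DataType disk_dtype) {
  TensorDim dim = weight_fp32.getDim();
  dim.setDataType(disk_dtype);   // data type stored in the file

  Tensor on_disk(dim, true);
  on_disk.read(file);            // read the stored payload
  weight_fp32.copyData(on_disk); // FP16 -> FP32 cast into the live weight
}
```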