Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Wait for #2580] [ Mixed Precision ] Enable Mixed Precision #2581

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Applications/KNN/jni/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ e = executable('knn_sample',
install_dir: application_install_dir
)

test('app_knn', e, args: [nntr_app_resdir / 'KNN'])
test('app_knn', e, args: [nntr_app_resdir / 'KNN/'])
17 changes: 11 additions & 6 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,19 @@ warning_c_flags = [
'-Wno-error=varargs'
]

arch = host_machine.cpu_family()

if get_option('enable-avx')
extra_defines += '-DUSE_AVX=1'
if get_option('platform') == 'tizen'
add_project_arguments(['-mavx2'], language: ['c','cpp'])
else
add_project_arguments(['-march=native'], language: ['c','cpp'])
endif
message('-march=native added for AVX hardware acceleration.')
endif

if get_option('enable-fp16')
arch = host_machine.cpu_family()
if get_option('platform') == 'android'
add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp'])
extra_defines += '-DENABLE_FP16=1'
Expand Down Expand Up @@ -105,11 +115,6 @@ if get_option('enable-fp16')
if cc.version().version_compare('>=12.1.0')
message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
extra_defines += '-DENABLE_FP16=1'
if get_option('enable-avx')
extra_defines += '-DUSE_AVX=1'
add_project_arguments(['-march=native'], language: ['c','cpp'])
message('-march=native added for AVX hardware acceleration.')
endif
else
warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
endif
Expand Down
9 changes: 9 additions & 0 deletions nntrainer/graph/graph_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ GraphCore::getSortedNode(unsigned int ith) const {
return Sorted.at(ith);
}

const unsigned int GraphCore::getSortedNodeIdx(const std::string &name) const {
  /// Position of the node in the topologically sorted order.
  /// Throws std::out_of_range (from unordered_map::at) when @a name is
  /// not a node that went through topologicalSort().
  const auto &sorted_idx = sorted_node_map.at(name);
  return sorted_idx;
}

void GraphCore::makeAdjacencyList(
std::vector<std::list<std::shared_ptr<GraphNode>>> &adj) {
/** initialize the adj list */
Expand Down Expand Up @@ -93,6 +97,11 @@ void GraphCore::topologicalSort() {

if (Sorted.size() != node_list.size())
throw std::runtime_error("Internal error in topologicalSort");
unsigned int idx = 0;
for (auto n : Sorted) {
sorted_node_map[n->getName()] = idx;
idx++;
}
}

const std::shared_ptr<GraphNode> &
Expand Down
8 changes: 8 additions & 0 deletions nntrainer/graph/graph_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,13 @@ class GraphCore {
*/
const std::shared_ptr<GraphNode> &getSortedNode(unsigned int ith) const;

/**
* @brief getter of Sorted GraphNode index with name
* @param[in] name layer name
* @return index of the node in the topologically sorted order
*/
const unsigned int getSortedNodeIdx(const std::string &name) const;

/**
* @brief getter of GraphNode with node name
* @param[in] node name
Expand Down Expand Up @@ -252,6 +259,7 @@ class GraphCore {
std::vector<std::shared_ptr<GraphNode>>
node_list; /**< Unordered Node List */
std::unordered_map<std::string, int> node_map; /**< Unordered Node map */
std::unordered_map<std::string, int> sorted_node_map; /**< Node name → index in Sorted */
std::vector<std::shared_ptr<GraphNode>> Sorted; /**< Ordered Node List */
bool sorted; /** if the node_list is sorted */

Expand Down
127 changes: 95 additions & 32 deletions nntrainer/graph/network_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ void NetworkGraph::applyGradients(
continue;
}

if (rc.isGradientClipByGlobalNorm(i)) {
if (rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) {
/**
* @note the weights whose gradient are to be clipped by global norm will
* be clipped at once at the end of iteration and applied then.
Expand Down Expand Up @@ -393,56 +393,100 @@ sharedConstTensors NetworkGraph::incremental_forwarding(
return out;
}

void NetworkGraph::backwarding(
bool NetworkGraph::backwarding(
int iteration,
std::function<void(std::shared_ptr<LayerNode>, int)> &backwarding_op,
std::function<void(Weight &, int)> &apply_grad_clip_op,
std::function<bool(void *userdata)> stop_cb, void *userdata) const {
std::function<void(std::shared_ptr<LayerNode>, bool)> &forwarding_op,
std::function<bool(std::shared_ptr<LayerNode>, int)> &backwarding_op,
std::function<void(Weight &, int)> &lazy_apply_grad_op,
std::function<bool(void *userdata)> stop_cb, void *userdata) {
/**
* last layer backwarding is run out of this loop
*/
auto iter_begin = getBackwardingBeginIter();
auto iter_end = getBackwardingEndIter();
bool has_nan = false;

/// there is no layer to train, so backwarding is essentially noop
if (iter_begin == iter_end) {
return;
return true;
}

auto const &lptr_begin = (*iter_begin);
// graph_const_reverse_iterator
auto iter_ = iter_begin;

if (lptr_begin->requireLabel() == false)
throw std::runtime_error(
"Error: last layer does not accept label, we can't train");

for (auto iter = iter_begin; iter != iter_end && !stop_cb(userdata); iter++) {
auto &ln = *iter;
for (iter_ = iter_begin; iter_ != iter_end && !stop_cb(userdata); iter_++) {
auto &ln = *iter_;
PROFILE_TIME_START(profile_keys.at(ln->getType()));
backwarding_op(ln, iteration);
has_nan = backwarding_op(ln, iteration);
PROFILE_TIME_END(profile_keys.at(ln->getType()));

if (has_nan) {
std::cout << "Gradient has NaN" << std::endl;
break;
}
}

/** perform clipping of the gradients by global norm if any */
if (clip_weights.empty())
return;
if (has_nan) {
/** if the gradient has NaN:
* 1. reset (reduce) the loss scale,
* 2. re-run forwarding from the current node to cend(), unless stop_cb fires,
* 3. return false so the caller runs backwarding again.
*/
float scale = (*iter_)->getRunContext().getLossScale();
float s = scale > 1.5f ? scale - 0.5f : 1.0f;

resetLossScale(s);

auto f_iter = cbegin() + graph.getSortedNodeIdx((*iter_)->getName());

for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
auto &ln = *iter;
PROFILE_TIME_START(profile_keys.at(ln->getType()));
forwarding_op(*iter, true);
PROFILE_TIME_END(profile_keys.at(ln->getType()));
}

/** calculate the global norm */
Tensor global_norm_t(
TensorDim({1u, 1u, 1u, (unsigned int)clip_weights.size()}));
float *global_norm_data = global_norm_t.getData();
for (unsigned int idx = 0; idx < clip_weights.size(); idx++) {
auto const &w = clip_weights[idx];
global_norm_data[idx] = w->getGradientNorm();
return false;
}
float global_norm = global_norm_t.l2norm();
/** apply the gradient with the above global norm */
for (auto w : clip_weights) {
w->clipGradientByGlobalNorm(global_norm);

/** perform clipping of the gradients by global norm if any */
if (lazy_weights.empty())
return true;

if (is_clip_grad) {
/** calculate the global norm */
Tensor global_norm_t(
TensorDim({1u, 1u, 1u, (unsigned int)lazy_weights.size()}));
float *global_norm_data = global_norm_t.getData();
for (unsigned int idx = 0; idx < lazy_weights.size(); idx++) {
auto const &w = lazy_weights[idx];
global_norm_data[idx] = w->getGradientNorm();
}
float global_norm = global_norm_t.l2norm();
/** apply the gradient with the above global norm */
for (auto w : lazy_weights) {
w->clipGradientByGlobalNorm(global_norm);
}
}
/** apply the gradient with the above global norm */
for (auto w : clip_weights) {
apply_grad_clip_op(*w, iteration);
for (auto w : lazy_weights) {
lazy_apply_grad_op(*w, iteration);
}
nan_count++;

if (nan_count > 10) {
float scale = (*iter_)->getRunContext().getLossScale();
float s = scale + 2.0f;
resetLossScale(s);
nan_count = 0;
}

return true;
}

LayerNode *NetworkGraph::computeBackwardEnd() {
Expand Down Expand Up @@ -768,9 +812,10 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
* node is going to be used with in-place optimizations.
*/
auto out_specs = init_context.getOutSpecs();

/// @note try move inplace control to finalize
bool shared_var = false, shared_grad = false;
if (lnode->executeInPlace() != InPlace::NONE) {
if (lnode->executeInPlace() != InPlace::NONE && lnode->supportInPlace()) {
setInplaceSharedMemoryConfigByLayer(lnode, shared_var, shared_grad);
for (unsigned int i = 0; i < out_specs.size(); ++i) {
auto &s = out_specs.at(i);
Expand Down Expand Up @@ -879,7 +924,8 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
lnode->getTrainable(), shared_weight_names),
inputs, outputs,
tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
lnode->getTrainable(), shared_tensor_names));
lnode->getTrainable(), shared_tensor_names),
init_context.getLossScale());

return outputs;
}
Expand Down Expand Up @@ -1027,7 +1073,8 @@ NetworkGraph::refinalizeContext(const std::shared_ptr<LayerNode> &lnode,
// TODO: update weights spec for trainable based on layer trainable prop
weights, inputs, outputs,
tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
lnode->getTrainable(), shared_tensor_names));
lnode->getTrainable(), shared_tensor_names),
init_context.getLossScale());

return outputs;
}
Expand Down Expand Up @@ -1287,11 +1334,19 @@ int NetworkGraph::initialize(ExecutionMode mode,

/** select weights which would require clipping of the gradients by global
* norm if any */
clip_weights = tensor_manager->getWeights([](const Weight *w) {
lazy_weights = tensor_manager->getWeights([](const Weight *w) {
return w->hasGradient() && w->isGradientLastAccess() &&
w->isGradientClipByGlobalNorm();
(w->isGradientClipByGlobalNorm() || w->isMixedPrecision());
});

is_clip_grad = false;
for (auto w : lazy_weights) {
if (w->isGradientClipByGlobalNorm()) {
is_clip_grad = true;
break;
}
}

return ML_ERROR_NONE;
}

Expand Down Expand Up @@ -1556,10 +1611,18 @@ void NetworkGraph::requestOptimizerVariable(
const TensorDim &dim = w->getDim();
std::vector<TensorDim> dims = cb(dim);
w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables(
dims, w->getName(), TensorLifespan::MAX_LIFESPAN,
w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS));
dims, w->getName(), ":opt", TensorLifespan::MAX_LIFESPAN,
w->isGradientClipByGlobalNorm(), w->isMixedPrecision(),
Tensor::Initializer::ZEROS));
}
}
}

void NetworkGraph::resetLossScale(float scale) {
  /// Propagate the new loss scale into every layer node's run context so the
  /// whole graph uses a consistent scale on the next iteration.
  for (auto node_it = cbegin(); node_it != cend(); ++node_it) {
    (*node_it)->getRunContext().setLossScale(scale);
  }
}

} /* namespace nntrainer */
28 changes: 21 additions & 7 deletions nntrainer/graph/network_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ class NetworkGraph {
optimize_memory(true),
exec_mode(ExecutionMode::TRAIN),
tensor_format("NCHW"),
tensor_dtype(split("FP32-FP32", getRegex("\\-"))) {}
tensor_dtype(split("FP32-FP32", getRegex("\\-"))) {
nan_count = 0;
}

/**
* @brief Constructor of NeuralNetwork Graph Class
Expand All @@ -73,7 +75,9 @@ class NetworkGraph {
optimize_memory(true),
exec_mode(ExecutionMode::TRAIN),
tensor_format(tensor_format_),
tensor_dtype(split(tensor_dtype_, getRegex("\\-"))) {}
tensor_dtype(split(tensor_dtype_, getRegex("\\-"))) {
nan_count = 0;
}

/**
* @brief Destructor of the NeuralNetwork Graph class
Expand Down Expand Up @@ -206,13 +210,14 @@ class NetworkGraph {
* @param[in] backwarding_op operation for the backwarding
* @param[in] apply_grad_clip_op operation for applying the clip gradients
*/
void backwarding(
bool backwarding(
int iteration,
std::function<void(std::shared_ptr<LayerNode>, int)> &backwarding_op,
std::function<void(Weight &, int)> &apply_grad_clip_op,
std::function<void(std::shared_ptr<LayerNode>, bool)> &forwarding_op,
std::function<bool(std::shared_ptr<LayerNode>, int)> &backwarding_op,
std::function<void(Weight &, int)> &lazy_apply_grad_op,
std::function<bool(void *userdata)> stop_cb =
[](void *user_data) { return false; },
void *user_data = nullptr) const;
void *user_data = nullptr);

/**
* @brief get begin iterator for the graph
Expand Down Expand Up @@ -444,6 +449,12 @@ class NetworkGraph {
getLayerExecutionOrders(const std::shared_ptr<LayerNode> &lnode);
#endif // ENABLE_TEST

/**
* @brief reset the loss scale
* @param[in] scale
*/
void resetLossScale(float scale);

private:
std::map<std::string, std::string> sub_in_out; /** This is map to identify
input and output layer name of subgraph */
Expand Down Expand Up @@ -480,7 +491,10 @@ class NetworkGraph {
std::unordered_map<std::string, int>
profile_keys; /**< profile keys based on the layer type */
std::vector<Weight *>
clip_weights; /**< weights with global norm based clipping enabled */
lazy_weights; /**< weights with global norm based clipping enabled */
bool is_clip_grad;

unsigned int nan_count;

/**
* @brief topological sort
Expand Down
19 changes: 16 additions & 3 deletions nntrainer/layers/input_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ namespace nntrainer {
static constexpr size_t SINGLE_INOUT_IDX = 0;

InputLayer::InputLayer() :
Layer(),
input_props(props::Normalization(), props::Standardization()) {}
Layer(), input_props(props::Normalization(), props::Standardization()) {}

void InputLayer::setProperty(const std::vector<std::string> &values) {
auto remain_props = loadProperties(values, input_props);
Expand All @@ -47,7 +46,7 @@ void InputLayer::forwarding(RunLayerContext &context, bool training) {
Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
if (!context.executeInPlace()) {
Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
hidden_.copy(input_);
hidden_.copyData(input_);
}

if (std::get<props::Normalization>(input_props))
Expand All @@ -70,7 +69,21 @@ void InputLayer::finalize(InitLayerContext &context) {

std::vector<TensorDim> output_dims = context.getInputDimensions();

for (auto &d : output_dims) {
d.setDataType(context.getActivationDataType());
}

context.setOutputDimensions(output_dims);

is_inplace = true;

/**
* @note Input Layer assumes that the incoming Tensor is always FP32. Therefore,
* if the activation data type is not FP32, it does not support in-place
* operation.
*/
if (context.getActivationDataType() != ml::train::TensorDim::DataType::FP32)
is_inplace = false;
}

} /* namespace nntrainer */
Loading