diff --git a/.clang-format b/.clang-format index 17cfc2415..19d6c8bfb 100644 --- a/.clang-format +++ b/.clang-format @@ -2,7 +2,7 @@ Language: Cpp # BasedOnStyle: LLVM AccessModifierOffset: -2 -AlignAfterOpenBracket: Align +AlignAfterOpenBracket: BlockIndent AlignArrayOfStructures: None AlignConsecutiveMacros: None AlignConsecutiveAssignments: None @@ -26,8 +26,8 @@ AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: MultiLine AttributeMacros: - __capability -BinPackArguments: true -BinPackParameters: true +BinPackArguments: false +BinPackParameters: false BraceWrapping: AfterCaseLabel: false AfterClass: false diff --git a/example-models b/example-models index d40894b03..3cfbcfd06 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit d40894b03f840a32da43a5adea0531ffc1db216e +Subproject commit 3cfbcfd062f60492507d21ff0e91559b3bdd6550 diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_activation.h b/hls4ml/templates/catapult/nnet_utils/nnet_activation.h index fb72460b9..d610dab2c 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_activation.h @@ -112,10 +112,20 @@ template void relu1(data_T data[C // Sigmoid Activation // ************************************************* -template -void ac_sigmoid_pwl_wrapper(const ac_fixed(&input) /*[K]*/, - ac_fixed(&output) /*[K]*/) { +template < + /*unsigned K,*/ int W1, + int I1, + bool S1, + ac_q_mode Q1, + ac_o_mode O1, + int W2, + int I2, + bool S2, + ac_q_mode Q2, + ac_o_mode O2> +void ac_sigmoid_pwl_wrapper( + const ac_fixed(&input) /*[K]*/, ac_fixed(&output) /*[K]*/ +) { ac_fixed tmp; //[K]; ac_math::ac_sigmoid_pwl(input, tmp); output = tmp; @@ -541,12 +551,42 @@ void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #else // This is a workaround to help the template deduction to work correctly and fix the inconsistency that HLS4ML expects // softmax output to be signed but AC Math softmax knows it is always unsigned -template +template < + unsigned K, + int W1, + int I1, + bool S1, + ac_q_mode Q1, + ac_o_mode O1, + int W2, + int I2, + bool S2, + ac_q_mode Q2, + ac_o_mode O2> void ac_softmax_pwl_wrapper(const ac_fixed (&input)[K], ac_fixed (&output)[K]) { ac_fixed tmp[K]; - ac_math::ac_softmax_pwl(input, tmp); + ac_math::ac_softmax_pwl< + AC_TRN, + false, + 0, + 0, + AC_TRN, + AC_WRAP, + false, + 0, + 0, + AC_TRN, + AC_WRAP, + K, + W1, + I1, + S1, + Q1, + O1, + W2, + I2, + Q2, + O2>(input, tmp); for (unsigned int x = 0; x < K; x++) output[x] = tmp[x]; } @@ -785,8 +825,18 @@ void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } #else -template +template < + ac_q_mode pwl_Q = AC_TRN, + int W, + int I, + bool S, + ac_q_mode Q, + ac_o_mode O, + int outW, + int outI, + bool outS, + ac_q_mode outQ, + ac_o_mode outO> void ac_softplus_pwl_wrapper(const ac_fixed(&input), ac_fixed(&output)) { ac_fixed tmp; ac_math::ac_softplus_pwl(input, tmp); diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_array.h b/hls4ml/templates/catapult/nnet_utils/nnet_array.h old mode 100755 new mode 100644 index cd3b73cf7..a99d2cf22 --- a/hls4ml/templates/catapult/nnet_utils/nnet_array.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_array.h @@ -24,8 +24,10 @@ void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[ } template -void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width], - res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) { +void transpose_3d( + data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width], + res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width] +) { unsigned dims[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; unsigned dims_t[3]; dims_t[0] = dims[CONFIG_T::perm[0]]; diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h index 1db18043e..709824bfb 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h @@ -28,9 +28,12 @@ struct batchnorm_config { }; template -void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], - typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], - typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { +void normalize( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_in], + typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias] +) { data_T cache; // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases @@ -81,8 +84,9 @@ struct batchnorm_quantized_tanh_config { }; template -void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in], - data_T threshold[CONFIG_T::n_in]) { +void normalize_binary_tanh( + data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in], data_T threshold[CONFIG_T::n_in] +) { //#pragma HLS PIPELINE //#pragma HLS ARRAY_PARTITION variable=res complete @@ -101,8 +105,12 @@ void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CON } template -void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ac_int<2, true> res[CONFIG_T::n_in], - data_T threshold_hi[CONFIG_T::n_in], data_T threshold_lo[CONFIG_T::n_in]) { +void normalize_ternary_tanh( + data_T data[CONFIG_T::n_in], + ac_int<2, true> res[CONFIG_T::n_in], + data_T threshold_hi[CONFIG_T::n_in], + data_T threshold_lo[CONFIG_T::n_in] +) { //#pragma HLS PIPELINE //#pragma HLS ARRAY_PARTITION variable=res complete diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h index 48085f82d..98e734da3 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h @@ -14,8 +14,12 @@ namespace nnet { // **************************************************** template -void normalize(ac_channel &data, ac_channel &res, typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], - typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { +void normalize( + ac_channel &data, + ac_channel &res, + typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias] +) { //#pragma HLS ARRAY_PARTITION variable=scale complete //#pragma HLS ARRAY_PARTITION variable=bias complete @@ -42,7 +46,8 @@ void normalize(ac_channel &data, ac_channel &res, typename CONFIG norm_index = j % CONFIG_T::n_filt; } out_data[j] = CONFIG_T::template product::product( - in_data[j], scale[norm_index]) + + in_data[j], scale[norm_index] + ) + bias[norm_index]; } @@ -54,8 +59,11 @@ void normalize(ac_channel &data, ac_channel &res, typename CONFIG // Merged Batch Normalization and Quantized Tanh // **************************************************** template -void normalize_binary_tanh(ac_channel &data, ac_channel, CONFIG_T::n_in>> &res, - typename data_T::value_type threshold[CONFIG_T::n_in]) { +void normalize_binary_tanh( + ac_channel &data, + ac_channel, CONFIG_T::n_in>> &res, + typename data_T::value_type threshold[CONFIG_T::n_in] +) { //#pragma HLS ARRAY_PARTITION variable=threshold complete BinaryNormLoop: @@ -76,9 +84,12 @@ void normalize_binary_tanh(ac_channel &data, ac_channel -void normalize_ternary_tanh(ac_channel &data, ac_channel, CONFIG_T::n_in>> &res, - typename data_T::value_type threshold_hi[CONFIG_T::n_in], - typename data_T::value_type threshold_lo[CONFIG_T::n_in]) { +void normalize_ternary_tanh( + ac_channel &data, + ac_channel, CONFIG_T::n_in>> &res, + typename data_T::value_type threshold_hi[CONFIG_T::n_in], + typename data_T::value_type threshold_lo[CONFIG_T::n_in] +) { //#pragma HLS ARRAY_PARTITION variable=threshold_hi complete //#pragma HLS ARRAY_PARTITION variable=threshold_lo complete diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h b/hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h old mode 100755 new mode 100644 index e4db43682..4e30599e6 --- a/hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h @@ -8,19 +8,22 @@ namespace nnet { template class FillConv1DBuffer { public: - static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], - const unsigned partition) { + static void fill_buffer( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition + ) { // To be implemented in subclasses } }; template class FillConv2DBuffer { public: - static void - fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], - const unsigned partition) { + static void fill_buffer( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition + ) { // To be implemented in subclasses } }; diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d.h old mode 100755 new mode 100644 index 98e075d4a..a3717b731 --- a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d.h @@ -33,9 +33,12 @@ struct conv1d_config { }; template -void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void conv_1d_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { if (CONFIG_T::strategy == nnet::latency) { conv_1d_latency_cl(data, res, weights, biases); } else { @@ -44,10 +47,12 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO } template -void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void pointwise_conv_1d_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { assert(CONFIG_T::filt_width == 1); if (CONFIG_T::strategy == nnet::latency) { diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h old mode 100755 new mode 100644 index 0323b1ac4..df2b58dce --- a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h @@ -9,8 +9,8 @@ namespace nnet { // Computes multiplier limit // This function should not be synthesized into firmware template -int compute_multiplier_limit( - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt]) { +int compute_multiplier_limit(typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt] +) { int n_mult = 0; for (int ii = 0; ii < CONFIG_T::out_width; ii++) { for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { @@ -39,10 +39,12 @@ int compute_multiplier_limit( } // end compute_n_mult template -void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void conv_1d_latency_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_width]; typename CONFIG_T::accum_t acc[CONFIG_T::out_width][CONFIG_T::n_filt]; @@ -121,10 +123,12 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } template -void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void pointwise_conv_1d_latency_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { assert(CONFIG_T::filt_width == 1); typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan]; diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_resource.h index 143a1271b..cc1a0ddab 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_resource.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_resource.h @@ -7,8 +7,10 @@ namespace nnet { template -void im2col_1d(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_width]) { +void im2col_1d( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_width] +) { // int index = 0; for (int channel = CONFIG_T::n_chan; channel--; data += CONFIG_T::in_width) { //#pragma HLS PIPELINE II=1 rewind @@ -30,9 +32,12 @@ void im2col_1d(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } template -void conv_1d_full(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void conv_1d_full( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { data_T data_conv[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_width]; data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan]; res_T res_col[CONFIG_T::n_filt]; @@ -56,8 +61,11 @@ void conv_1d_full(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[ } template -void im2col_1d_cf_idx(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan], const int col) { +void im2col_1d_cf_idx( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan], + const int col +) { ChannelLoop: for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { //#pragma HLS PIPELINE II=1 rewind @@ -76,8 +84,11 @@ void im2col_1d_cf_idx(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } template -void im2col_1d_cf(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - data_T data_col[CONFIG_T::n_chan * CONFIG_T::filt_width], const int col) { +void im2col_1d_cf( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::n_chan * CONFIG_T::filt_width], + const int col +) { int index = 0; ChannelLoop: for (int channel = CONFIG_T::n_chan; channel--; data += CONFIG_T::in_width) { @@ -97,10 +108,12 @@ void im2col_1d_cf(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } template -void conv_1d_resource_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void conv_1d_resource_cf( + data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { const int nin = CONFIG_T::n_chan * CONFIG_T::filt_width; const int nout = CONFIG_T::n_filt; const int rufactor = CONFIG_T::reuse_factor; @@ -131,8 +144,11 @@ void conv_1d_resource_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], } template -void im2col_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan], const int col) { +void im2col_1d_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan], + const int col +) { int index = 0; KernelLoop: for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) { @@ -152,8 +168,9 @@ void im2col_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } template -void im2col_1d_pointwise_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], data_T data_col[CONFIG_T::n_chan], - const int col) { +void im2col_1d_pointwise_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], data_T data_col[CONFIG_T::n_chan], const int col +) { int index = 0; ChannelLoop: for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { @@ -170,10 +187,12 @@ void im2col_1d_pointwise_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } template -void conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void conv_1d_resource_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { const int nin = CONFIG_T::n_chan * CONFIG_T::filt_width; const int nout = CONFIG_T::n_filt; const int rufactor = CONFIG_T::reuse_factor; @@ -203,10 +222,12 @@ void conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } template -void pointwise_conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void pointwise_conv_1d_resource_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { assert(CONFIG_T::filt_width == 1); const int nin = CONFIG_T::n_chan; diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_stream.h index 48f6244ce..f5d9e0207 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_stream.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_stream.h @@ -16,15 +16,19 @@ void compute_scaled_indices_1d(const unsigned w_idx, ac_int::scale_index( - wp_idx + p); + wp_idx + p + ); pixel_idx[p] = CONFIG_T::pixels[sw_idx]; } } template -void conv_1d_encoded_cl(ac_channel &data, ac_channel &res, - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void conv_1d_encoded_cl( + ac_channel &data, + ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); ac_channel data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; @@ -52,15 +56,19 @@ void conv_1d_encoded_cl(ac_channel &data, ac_channel &res, //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor } compute_scaled_indices_1d(i_iw, pixel_idx); - compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, - biases, pixel_idx); + compute_output_encoded( + data.read(), data_window, res, res_pack, outputs_ready, weights, biases, pixel_idx + ); } } template -void conv_1d_buffer_cl(ac_channel &data, ac_channel &res, - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void conv_1d_buffer_cl( + ac_channel &data, + ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); constexpr int ce_reuse_factor = CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency); @@ -76,9 +84,12 @@ void conv_1d_buffer_cl(ac_channel &data, ac_channel &res, } template -void conv_1d_cl(ac_channel &data, ac_channel &res, - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void conv_1d_cl( + ac_channel &data, + ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { //#pragma HLS inline region switch (CONFIG_T::implementation) { case conv_implementation::linebuffer: diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d.h old mode 100755 new mode 100644 index 01476a044..f439c6400 --- a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d.h @@ -44,7 +44,8 @@ void conv_2d_cf( data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { if (CONFIG_T::strategy == nnet::latency) { conv_2d_latency_cf(data, res, weights, biases); } else { @@ -57,7 +58,8 @@ void conv_2d_cl( data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { if (CONFIG_T::strategy == nnet::latency) { conv_2d_latency_cl(data, res, weights, biases); } else { @@ -66,10 +68,12 @@ void conv_2d_cl( } template -void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void pointwise_conv_2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { assert(CONFIG_T::filt_width == 1); if (CONFIG_T::strategy == nnet::latency) { diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_latency.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_latency.h index 29dd8ca63..9c3155f7b 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_latency.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_latency.h @@ -9,8 +9,9 @@ namespace nnet { // Computes multiplier limit // This function should not be synthesized into firmware template -int compute_multiplier_limit_conv2d(typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * - CONFIG_T::n_chan * CONFIG_T::n_filt]) { +int compute_multiplier_limit_conv2d( + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt] +) { int n_mult = 0; for (int oh = 0; oh < CONFIG_T::out_height; oh++) { @@ -52,10 +53,12 @@ void conv_2d_latency_cf( data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { - typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * - CONFIG_T::filt_height * CONFIG_T::filt_width]; + typename CONFIG_T::accum_t mult + [CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * + CONFIG_T::filt_width]; typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]; //#pragma HLS ARRAY_PARTITION variable=mult complete dim=0 @@ -175,10 +178,12 @@ void conv_2d_latency_cl( data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { - typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * - CONFIG_T::filt_height * CONFIG_T::filt_width]; + typename CONFIG_T::accum_t mult + [CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * + CONFIG_T::filt_width]; typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]; //#pragma HLS ARRAY_PARTITION variable=mult complete dim=0 @@ -293,10 +298,12 @@ void conv_2d_latency_cl( } // end conv2d template -void pointwise_conv_2d_latency_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void pointwise_conv_2d_latency_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan]; typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]; diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_resource.h index c5e386b5e..55ffa355d 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_resource.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_resource.h @@ -7,9 +7,11 @@ namespace nnet { template -void im2col_2d(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - data_T data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_height * - CONFIG_T::out_width]) { +void im2col_2d( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col + [CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width] +) { const int output_h = (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom - (CONFIG_T::dilation_height * (CONFIG_T::filt_height - 1) + 1)) / CONFIG_T::stride_height + @@ -52,9 +54,10 @@ void conv_2d_full( data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - data_T data_conv[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_height * - CONFIG_T::out_width]; + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { + data_T data_conv + [CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]; data_T data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; res_T res_col[CONFIG_T::n_filt]; @@ -77,9 +80,12 @@ void conv_2d_full( } template -void im2col_2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], - data_T data_col[CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width], const int row, - const int col) { +void im2col_2d_cf( + data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + data_T data_col[CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width], + const int row, + const int col +) { const int channel_size = CONFIG_T::in_height * CONFIG_T::in_width; int index = 0; for (int channel = CONFIG_T::n_chan; channel--; data += channel_size) { @@ -111,7 +117,8 @@ void conv_2d_resource_cf( data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { const int nin = CONFIG_T::n_chan * CONFIG_T::filt_width; const int nout = CONFIG_T::n_filt; const int rufactor = CONFIG_T::reuse_factor; @@ -147,9 +154,12 @@ void conv_2d_resource_cf( } template -void im2col_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - data_T data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], const int row, - const int col) { +void im2col_2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const int row, + const int col +) { int index = 0; for (int kernel_row = 0; kernel_row < CONFIG_T::filt_height; kernel_row++) { int input_row = -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height + row * CONFIG_T::stride_height; @@ -173,8 +183,12 @@ void im2col_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ } template -void im2col_2d_pointwise_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - data_T data_col[CONFIG_T::n_chan], const int row, const int col) { +void im2col_2d_pointwise_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::n_chan], + const int row, + const int col +) { int index = 0; int input_row = -CONFIG_T::pad_top + row * CONFIG_T::stride_height; @@ -199,7 +213,8 @@ void conv_2d_resource_cl( data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { const int nin = CONFIG_T::n_chan * CONFIG_T::filt_width; const int nout = CONFIG_T::n_filt; const int rufactor = CONFIG_T::reuse_factor; @@ -233,10 +248,12 @@ void conv_2d_resource_cl( } template -void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void pointwise_conv_2d_resource_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); const int nin = CONFIG_T::n_chan; diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_stream.h index 7e76be12a..6dac73a08 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_stream.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_stream.h @@ -9,27 +9,31 @@ namespace nnet { template -void compute_scaled_indices_2d(const unsigned h_idx, const unsigned w_idx, - ac_int *pixel_idx) { - const unsigned sh_idx = CONFIG_T::template scale_index_height::scale_index(h_idx); +void compute_scaled_indices_2d( + const unsigned h_idx, const unsigned w_idx, ac_int *pixel_idx +) { + const unsigned sh_idx = CONFIG_T:: + template scale_index_height::scale_index(h_idx); unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); ComputeIndex: for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { // #pragma HLS UNROLL - unsigned sw_idx = CONFIG_T::template scale_index_width::scale_index(wp_idx + p); + unsigned sw_idx = + CONFIG_T::template scale_index_width:: + scale_index(wp_idx + p); pixel_idx[p] = CONFIG_T::pixels[sh_idx * CONFIG_T::min_width + sw_idx]; } } template void conv_2d_encoded_cl( - ac_channel &data, ac_channel &res, + ac_channel &data, + ac_channel &res, typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); assert(CONFIG_T::filt_height == CONFIG_T::filt_width); @@ -60,8 +64,9 @@ void conv_2d_encoded_cl( //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor } compute_scaled_indices_2d(i_ih, i_iw, pixel_idx); - compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, - biases, pixel_idx); + compute_output_encoded( + data.read(), data_window, res, res_pack, outputs_ready, weights, biases, pixel_idx + ); } } } @@ -69,9 +74,11 @@ void conv_2d_encoded_cl( // Line Buffer template void conv_2d_buffer_cl( - ac_channel &data, ac_channel &res, + ac_channel &data, + ac_channel &res, typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] @@ -99,9 +106,11 @@ void conv_2d_buffer_cl( template void conv_2d_cl( - ac_channel &data, ac_channel &res, + ac_channel &data, + ac_channel &res, typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { //#pragma HLS inline region switch (CONFIG_T::implementation) { case conv_implementation::linebuffer: diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv_stream.h index 4d92cbf69..47cc2cf2e 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv_stream.h @@ -77,10 +77,14 @@ template class scale_index_unscaled { }; template -void mult_buffer(ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], - res_T &res_pack, ac_channel &res_stream, unsigned &outputs_ready, - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void mult_buffer( + ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + res_T &res_pack, + ac_channel &res_stream, + unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { //#pragma HLS INLINE typename data_T::value_type data[CONFIG_T::kernel_size * CONFIG_T::n_chan]; @@ -97,10 +101,12 @@ void mult_buffer(ac_channel data_window[CONFIG_T::k //#pragma HLS INLINE region if (CONFIG_T::strategy == nnet::latency) { dense_latency( - data, res, weights, biases); + data, res, weights, biases + ); } else { dense_resource( - data, res, weights, biases); + data, res, weights, biases + ); } CastLoop: @@ -126,12 +132,16 @@ void mult_buffer(ac_channel data_window[CONFIG_T::k } template -void compute_output_encoded(const data_T &in_elem, - ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], - ac_channel &res, res_T &res_pack, unsigned &outputs_ready, - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt], - ac_int *pixel_idx) { +void compute_output_encoded( + const data_T &in_elem, + ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + ac_channel &res, + res_T &res_pack, + unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt], + ac_int *pixel_idx +) { //#pragma HLS INLINE constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; (void)ce_reuse_factor; @@ -158,8 +168,9 @@ void compute_output_encoded(const data_T &in_elem, // Line Buffer Implementation (Phil's) // ************************************************* template -void kernel_shift_1d(const data_T &in_elem, - typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan]) { +void kernel_shift_1d( + const data_T &in_elem, typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan] +) { //#pragma HLS inline //#pragma HLS PIPELINE II = 1 @@ -188,7 +199,8 @@ void kernel_shift_1d(const data_T &in_elem, template void kernel_shift_2d( typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan], - typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::filt_height * CONFIG_T::n_chan]) { + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::filt_height * CONFIG_T::n_chan] +) { //#pragma HLS inline // Shift kernel_window by one step to the left (manual shift operation) @@ -223,7 +235,8 @@ void shift_line_buffer( const data_T &in_elem, ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] [CONFIG_T::n_chan], - typename data_T::value_type kernel_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]) { + typename data_T::value_type kernel_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan] +) { //#pragma HLS PIPELINE @@ -245,8 +258,9 @@ void shift_line_buffer( LineBufferShift: for (unsigned i_ih = 1; i_ih < CONFIG_T::filt_height; i_ih++) { // #pragma HLS UNROLL - typename data_T::value_type pop_elem = line_buffer[i_ih - 1][i_ic].shift( - shift_buffer[CONFIG_T::filt_height - i_ih][i_ic]); // Shift the line buffer, return the popped pixel + typename data_T::value_type pop_elem = + line_buffer[i_ih - 1][i_ic].shift(shift_buffer[CONFIG_T::filt_height - i_ih][i_ic] + ); // Shift the line buffer, return the popped pixel shift_buffer[CONFIG_T::filt_height - i_ih - 1][i_ic] = pop_elem; // Popped element placed back into shift_buffer, one row up. } @@ -261,7 +275,8 @@ void compute_output_buffer_2d( [CONFIG_T::n_chan], ac_channel &res_stream, typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { //#pragma HLS INLINE // Thresholds @@ -294,10 +309,12 @@ void compute_output_buffer_2d( //#pragma HLS INLINE region if (CONFIG_T::strategy == nnet::latency) { dense_latency( - kernel_data, res_out, weights, biases); + kernel_data, res_out, weights, biases + ); } else { dense_resource( - kernel_data, res_out, weights, biases); + kernel_data, res_out, weights, biases + ); } // Pack output @@ -334,9 +351,11 @@ void compute_output_buffer_2d( // Conv 1D compute output template void compute_output_buffer_1d( - const data_T &in_elem, ac_channel &res_stream, + const data_T &in_elem, + ac_channel &res_stream, typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { //#pragma HLS INLINE // Thresholds @@ -365,10 +384,12 @@ void compute_output_buffer_1d( //#pragma HLS INLINE region if (CONFIG_T::strategy == nnet::latency) { dense_latency( - kernel_data, res_out, weights, biases); + kernel_data, res_out, weights, biases + ); } else { dense_resource( - kernel_data, res_out, weights, biases); + kernel_data, res_out, weights, biases + ); } // Pack output diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense.h index 64b927cc6..4e0aaa37d 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense.h @@ -33,9 +33,12 @@ struct dense_config { }; template -void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { +void dense( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { //#pragma HLS inline if (CONFIG_T::strategy == nnet::latency) { dense_latency(data, res, weights, biases); diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense_compressed.h index f3f27b6db..2df155f96 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_dense_compressed.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense_compressed.h @@ -28,8 +28,9 @@ namespace nnet { template -void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out], - typename CONFIG_T::accum_t weight) { +void fill_mult( + typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out], typename CONFIG_T::accum_t weight +) { for (unsigned k = 0; k < CONFIG_T::n_out; k++) { // #pragma HLS UNROLL if (k == index) @@ -38,9 +39,12 @@ void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult } template -void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { +void dense_compressed( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_nonzeros, CONFIG_T::reuse_factor); diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense_latency.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense_latency.h index 40e5cd2b9..a63aff2ff 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_dense_latency.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense_latency.h @@ -11,9 +11,12 @@ namespace nnet { template -void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { +void dense_latency( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; // Partial unroll config constexpr int prod1_unroll = diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense_resource.h index 5bcd1a54b..93abc32a5 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_dense_resource.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense_resource.h @@ -11,9 +11,12 @@ namespace nnet { template -void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { +void dense_resource_rf_leq_nin( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { const int rufactor = CONFIG_T::reuse_factor; const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); @@ -54,7 +57,8 @@ void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T:: //#pragma HLS UNROLL acc[out_index] += static_cast( - CONFIG_T::template product::product(data[in_index], weights[w_index])); + CONFIG_T::template product::product(data[in_index], weights[w_index]) + ); // Increment w_index w_index += rufactor; @@ -82,9 +86,12 @@ void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T:: } template -void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { +void dense_resource_rf_gt_nin_rem0( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out); const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); @@ -136,7 +143,8 @@ void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG for (unsigned int im = 0; im < block_factor; im++) { //#pragma HLS UNROLL acc[out_index] += static_cast( - CONFIG_T::template product::product(data[in_index], weights[w_index])); + CONFIG_T::template product::product(data[in_index], weights[w_index]) + ); w_index += rufactor; if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) @@ -160,9 +168,12 @@ void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG } template -void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { +void dense_resource_rf_gt_nin( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { const int rufactor = CONFIG_T::reuse_factor; const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); @@ -242,9 +253,12 @@ void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n } template -void dense_resource(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { +void dense_resource( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { //#pragma HLS INLINE region diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense_stream.h index 665d2f43f..8b5514aa0 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense_stream.h @@ -10,9 +10,12 @@ namespace nnet { template -void dense_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { +void dense_wrapper( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { //#pragma HLS INLINE region if (CONFIG_T::strategy == nnet::latency) { //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor @@ -23,9 +26,12 @@ void dense_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], } template -void dense(ac_channel &data_stream, ac_channel &res_stream, - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { +void dense( + ac_channel &data_stream, + ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out] +) { typename data_T::value_type data[CONFIG_T::n_in]; //#pragma HLS ARRAY_PARTITION variable=data complete diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_embed.h b/hls4ml/templates/catapult/nnet_utils/nnet_embed.h index 4cdf507f9..ac73e2f58 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_embed.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_embed.h @@ -21,8 +21,11 @@ struct embed_config { }; template -void embedding(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { +void embedding( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out] +) { //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor // This can save a few cycles, but it will create a large multiplexer due to diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_embed_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_embed_stream.h index 137810087..19212cd06 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_embed_stream.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_embed_stream.h @@ -8,8 +8,11 @@ namespace nnet { template -void embedding(ac_channel &data, ac_channel &res, - typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { +void embedding( + ac_channel &data, + ac_channel &res, + typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out] +) { data_T in_data = data.read(); constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; (void)ce_reuse_factor; diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_garnet.h b/hls4ml/templates/catapult/nnet_utils/nnet_garnet.h index 7451110fb..fb2b36cd8 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_garnet.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_garnet.h @@ -96,15 +96,15 @@ template typename CONFIG_T::edge_weight_t compute_edge_weight(t } template -inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, - exponent_T exponent) { +inline typename std::enable_if::value, dividend_T>::type +normalize_log2(dividend_T dividend, exponent_T exponent) { //#pragma HLS INLINE return dividend >> exponent; } template -inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, - exponent_T exponent) { +inline typename std::enable_if::value, dividend_T>::type +normalize_log2(dividend_T dividend, exponent_T exponent) { //#pragma HLS INLINE return dividend / std::pow(2., exponent); } @@ -300,8 +300,8 @@ template struct SingleVertexResSetter { }; template -inline void compute_weights_aggregates(data_getter_T const &data_getter, unsigned iv, arrays_local_T &arrays_local, - arrays_T &arrays) { +inline void +compute_weights_aggregates(data_getter_T const &data_getter, unsigned iv, arrays_local_T &arrays_local, arrays_T &arrays) { //#pragma HLS INLINE Aggregators: @@ -355,8 +355,9 @@ inline typename CONFIG_T::aggr_t compute_output_base_core(arrays_T const &arrays } template -inline void compute_output_base(arrays_T const &arrays, - typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]) { +inline void compute_output_base( + arrays_T const &arrays, typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators] +) { //#pragma HLS INLINE OutFeatures: @@ -371,10 +372,12 @@ inline void compute_output_base(arrays_T const &arrays, } template -inline void -compute_vertex_output(arrays_T const &arrays, unsigned iv, - typename CONFIG_T::aggr_t const output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators], - res_setter_T &res_setter) { +inline void compute_vertex_output( + arrays_T const &arrays, + unsigned iv, + typename CONFIG_T::aggr_t const output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators], + res_setter_T &res_setter +) { //#pragma HLS INLINE typename arrays_T::edge_weight_t edge_weights[CONFIG_T::n_aggregators]; @@ -467,8 +470,9 @@ void distribute(nvtx_T const nvtx, arrays_T const &arrays, res_T res[CONFIG_T::n } template -void set_output(output_biases_T const &output_transform_biases, arrays_T const &arrays, - res_T res[CONFIG_T::n_out_features]) { +void set_output( + output_biases_T const &output_transform_biases, arrays_T const &arrays, res_T res[CONFIG_T::n_out_features] +) { //#pragma HLS PIPELINE OutFeatures: @@ -533,8 +537,13 @@ void distribute_aggregate(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, c current_arrays.set_means_normalized(nvtx, means_accum); } -template +template < + class prev_layer_t, + class current_layer_t, + class last_layer_t, + class nvtx_T, + class prev_arrays_T, + class last_arrays_T> inline typename std::enable_if::value>::type sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { //#pragma HLS INLINE @@ -542,8 +551,13 @@ sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &las distribute_aggregate(nvtx, prev_arrays, last_arrays); } -template +template < + class prev_layer_t, + class current_layer_t, + class last_layer_t, + class nvtx_T, + class prev_arrays_T, + class last_arrays_T> inline typename std::enable_if::value>::type sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { //#pragma HLS INLINE @@ -601,9 +615,11 @@ struct garnet_config { // vertices -> vertices template -typename std::enable_if::type -garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { +typename std::enable_if::type garnet( + data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], + nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features] +) { //#pragma HLS DATAFLOW garnet_utils::WeightsAndMeans arrays; @@ -615,9 +631,11 @@ garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T // vertices -> out features template -typename std::enable_if::type -garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_out_features]) { +typename std::enable_if::type garnet( + data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], + nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features] +) { //#pragma HLS DATAFLOW garnet_utils::Means arrays; @@ -631,9 +649,11 @@ garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T // vertices -> vertices template -typename std::enable_if::type -garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { +typename std::enable_if::type garnet_stack( + data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], + nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features] +) { //#pragma HLS DATAFLOW typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; @@ -645,17 +665,20 @@ garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], garnet_utils::aggregate(data, nvtx[0], arrays_first); - garnet_utils::sublayer(nvtx[0], arrays_first, - arrays_last); + garnet_utils::sublayer( + nvtx[0], arrays_first, arrays_last + ); garnet_utils::distribute(nvtx[0], arrays_last, res); } // vertices -> out features template -typename std::enable_if::type -garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_out_features]) { +typename std::enable_if::type garnet_stack( + data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], + nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features] +) { //#pragma HLS DATAFLOW typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; @@ -667,8 +690,9 @@ garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], garnet_utils::aggregate(data, nvtx[0], arrays_first); - garnet_utils::sublayer(nvtx[0], arrays_first, - arrays_last); + garnet_utils::sublayer( + nvtx[0], arrays_first, arrays_last + ); garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); @@ -677,9 +701,11 @@ garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], /* Reference (dumb) implementation returning (Vertices, Features) */ template -typename std::enable_if::type -garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { +typename std::enable_if::type garnet_ref( + data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], + nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features] +) { typename CONFIG_T::edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; typename CONFIG_T::aggr_t propagated_features[CONFIG_T::n_vertices * CONFIG_T::n_propagate]; @@ -781,9 +807,11 @@ garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nv /* Reference (dumb) implementation returning (Features) - output averaged over vertices already */ template -typename std::enable_if::type -garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_out_features]) { +typename std::enable_if::type garnet_ref( + data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], + nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features] +) { typename CONFIG_T::aggr_t vertex_res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]; garnet_ref(data, nvtx, vertex_res); diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_image.h b/hls4ml/templates/catapult/nnet_utils/nnet_image.h old mode 100755 new mode 100644 index 26947fae0..b6dc45466 --- a/hls4ml/templates/catapult/nnet_utils/nnet_image.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_image.h @@ -16,8 +16,10 @@ struct resize_config { }; template -void resize_nearest(data_T image[CONFIG_T::height * CONFIG_T::width * CONFIG_T::n_chan], - data_T resized[CONFIG_T::new_height * CONFIG_T::new_width * CONFIG_T::n_chan]) { +void resize_nearest( + data_T image[CONFIG_T::height * CONFIG_T::width * CONFIG_T::n_chan], + data_T resized[CONFIG_T::new_height * CONFIG_T::new_width * CONFIG_T::n_chan] +) { int y_ratio = (int)((CONFIG_T::height << 16) / CONFIG_T::new_height) + 1; int x_ratio = (int)((CONFIG_T::width << 16) / CONFIG_T::new_width) + 1; int x2, y2; diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_math.h b/hls4ml/templates/catapult/nnet_utils/nnet_math.h index c25f7187b..80f7c2d15 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_math.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_math.h @@ -92,8 +92,10 @@ template void sincos_lut(const T &input, T output[2]) { if ((luTdex1 % (1 << (AP_MAX(T::width - T::iwidth - 12, 0)))) > (1 << (AP_MAX(T::width - T::iwidth - 13, 0)))) { luTdex = luTdex + 1; } - typedef ac_fixed + typedef ac_fixed< + AP_MAX((AP_MAX(T::width - T::iwidth - 3, 1) + T::width - T::iwidth - 12), 1), + AP_MAX(T::width - T::iwidth - 3, 1), + false> datatype; datatype x = (datatype)luTdex1; x = x >> AP_MAX(T::width - T::iwidth - 12, 0); diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_merge.h b/hls4ml/templates/catapult/nnet_utils/nnet_merge.h index 9cba03071..624b36ebb 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_merge.h @@ -105,8 +105,11 @@ void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T } template -void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], - res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { +void concatenate1d( + input1_T data1[CONFIG_T::n_elem1_0], + input2_T data2[CONFIG_T::n_elem2_0], + res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0] +) { for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { res[ii] = data1[ii]; } @@ -116,9 +119,11 @@ void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T: } template -void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { +void concatenate2d_0( + input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1] +) { for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; ii++) { res[ii] = data1[ii]; } @@ -128,9 +133,11 @@ void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], } template -void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { +void concatenate2d_1( + input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1] +) { for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + jj] = data1[ii * CONFIG_T::n_elem1_1 + jj]; @@ -143,9 +150,11 @@ void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], } template -void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { +void concatenate2d( + input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1] +) { if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { concatenate2d_1(data1, data2, res); } else { @@ -154,10 +163,13 @@ void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], } template -void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { +void concatenate3d_0( + input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2] +) { for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; ii++) { res[ii] = data1[ii]; } @@ -167,10 +179,13 @@ void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * } template -void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { +void concatenate3d_1( + input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2] +) { for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { @@ -192,10 +207,13 @@ void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * } template -void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { +void concatenate3d_2( + input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2] +) { for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { @@ -215,10 +233,13 @@ void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * } template -void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { +void concatenate3d( + input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2] +) { if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { concatenate3d_2(data1, data2, res); } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_mult.h b/hls4ml/templates/catapult/nnet_utils/nnet_mult.h old mode 100755 new mode 100644 index 7379eec48..48cfa4e87 --- a/hls4ml/templates/catapult/nnet_utils/nnet_mult.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_mult.h @@ -86,8 +86,8 @@ template class weight_exponential : public Product { // Construct the return type from the multiplication equivalent to the largest shifts // ap_int is the type if the multiplicand equivalent to the largest lshift << // ap_fixed is the type of the multiplicand equivalent to the largest rshift >> - using r_T = decltype(x_T(0) * (ac_int(1) + - ac_fixed(1))); + using r_T = + decltype(x_T(0) * (ac_int(1) + ac_fixed(1))); static r_T product(x_T a, w_T w) { // Shift product for exponential weights //#pragma HLS INLINE @@ -101,17 +101,17 @@ template class weight_exponential : public Product { } // namespace product template -inline typename std::enable_if>::value && - std::is_same>::value, - ac_int>::type +inline typename std::enable_if< + std::is_same>::value && std::is_same>::value, + ac_int>::type cast(typename CONFIG_T::accum_t x) { return (ac_int)(x - CONFIG_T::n_in / 2) * 2; } template -inline typename std::enable_if>::value && - !std::is_same>::value, - res_T>::type +inline typename std::enable_if< + std::is_same>::value && !std::is_same>::value, + res_T>::type cast(typename CONFIG_T::accum_t x) { return (res_T)x; } diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_padding.h b/hls4ml/templates/catapult/nnet_utils/nnet_padding.h old mode 100755 new mode 100644 index 47986523f..4c8cf4b95 --- a/hls4ml/templates/catapult/nnet_utils/nnet_padding.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_padding.h @@ -68,8 +68,10 @@ struct padding2d_config { }; template -void zeropad2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], - data_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { +void zeropad2d_cf( + data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + data_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width] +) { //#pragma HLS PIPELINE for (int k = 0; k < CONFIG_T::n_chan; k++) { @@ -101,8 +103,10 @@ void zeropad2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T: } template -void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], - res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { +void zeropad2d_cl( + data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width] +) { //#pragma HLS PIPELINE for (int i = 0; i < CONFIG_T::pad_top; i++) { diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h b/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h index d6ab38a96..6cc2d2975 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h @@ -200,8 +200,10 @@ template constexpr int pool_op_limit() { } template -void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { +void pooling2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt] +) { constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; (void)ce_reuse_factor; //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor @@ -236,9 +238,9 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ img_overlap++; } } else { - pool[kk * CONFIG_T::stride_width + ll] = - data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt + - (jj + ll - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; + pool[kk * CONFIG_T::stride_width + ll] = data + [(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt + + (jj + ll - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; img_overlap++; } } @@ -262,8 +264,10 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ } template -void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { +void pooling2d_cf( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt] +) { constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; (void)ce_reuse_factor; //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor @@ -298,9 +302,9 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ img_overlap++; } } else { - pool[kk * CONFIG_T::stride_width + ll] = - data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + - ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + jj - CONFIG_T::pad_left]; + pool[kk * CONFIG_T::stride_width + ll] = data + [(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + + ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + jj - CONFIG_T::pad_left]; img_overlap++; } } @@ -324,8 +328,9 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ } template -void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], - res_T res[CONFIG_T::n_filt]) { +void global_pooling2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt] +) { assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); assert(CONFIG_T::pool_width == CONFIG_T::stride_width); diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_pooling_stream.h index 051a27a54..ac0857a94 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_pooling_stream.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_pooling_stream.h @@ -34,9 +34,14 @@ template void init_pool_table(unsigned template void compute_pool_encoded_2d( - const unsigned h_idx, const unsigned w_idx, const data_T &in_elem, + const unsigned h_idx, + const unsigned w_idx, + const data_T &in_elem, ac_channel data_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt], - ac_channel &res, res_T &res_pack, unsigned &outputs_ready) { + ac_channel &res, + res_T &res_pack, + unsigned &outputs_ready +) { // Nearest H without unused pixels on the right constexpr unsigned nH = ((CONFIG_T::in_height - CONFIG_T::pool_height) / CONFIG_T::stride_height) * CONFIG_T::stride_height + @@ -90,7 +95,8 @@ void compute_pool_encoded_2d( for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { if (filt_mask > 0) data_window[c * CONFIG_T::pool_height * CONFIG_T::pool_width + filt_mask.to_uint() - 1].write( - in_elem[p * CONFIG_T::n_filt + c]); + in_elem[p * CONFIG_T::n_filt + c] + ); } if (filt_mask == CONFIG_T::pool_height * CONFIG_T::pool_width) { @@ -104,11 +110,13 @@ void compute_pool_encoded_2d( 1) { // Saves resources if we don't pack output, compiler will remove the else branch res_pack[c] = reduce_pool( - pool_window); + pool_window + ); } else { res_pack[outputs_ready * CONFIG_T::n_filt + c] = reduce_pool( - pool_window); + pool_window + ); } } if (res_T::size / CONFIG_T::n_filt == @@ -152,8 +160,9 @@ void pooling2d_encoded_cl(ac_channel &data, ac_channel &res) { if (res_T::size / CONFIG_T::n_filt == 1) { //#pragma HLS PIPELINE II=pack_factor } - compute_pool_encoded_2d(i_ih, i_iw, data.read(), data_window, res, res_pack, - outputs_ready); + compute_pool_encoded_2d( + i_ih, i_iw, data.read(), data_window, res, res_pack, outputs_ready + ); } } } @@ -162,10 +171,12 @@ void pooling2d_encoded_cl(ac_channel &data, ac_channel &res) { // Line Buffer Implementation (Phil's) // ************************************************* template -void compute_pool_buffer_2d(const data_T &in_elem, - ap_shift_reg - line_buffer[MAX(CONFIG_T::pool_height - 1, 1)][CONFIG_T::n_filt], - ac_channel &res) { +void compute_pool_buffer_2d( + const data_T &in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::pool_height - 1, 1)] + [CONFIG_T::n_filt], + ac_channel &res +) { //#pragma HLS INLINE const static int lShiftX = CONFIG_T::pool_width - 1; const static int lShiftY = CONFIG_T::pool_height - 1; @@ -200,8 +211,8 @@ void compute_pool_buffer_2d(const data_T &in_elem, // Compute Pooling res_pack[i_ic] = - reduce_pool( - pool_window); + reduce_pool(pool_window + ); } // Write to output @@ -266,9 +277,14 @@ template void pooling2d_cl(ac_cha // ************************************************* template -void compute_pool_encoded_1d(const unsigned w_idx, const data_T &in_elem, - ac_channel data_window[CONFIG_T::pool_width * CONFIG_T::n_filt], - ac_channel &res, res_T &res_pack, unsigned &outputs_ready) { +void compute_pool_encoded_1d( + const unsigned w_idx, + const data_T &in_elem, + ac_channel data_window[CONFIG_T::pool_width * CONFIG_T::n_filt], + ac_channel &res, + res_T &res_pack, + unsigned &outputs_ready +) { // Nearest W without unused pixels on the right constexpr unsigned nW = ((CONFIG_T::n_in - CONFIG_T::pool_width) / CONFIG_T::stride_width) * CONFIG_T::stride_width + CONFIG_T::pool_width; @@ -481,7 +497,8 @@ void compute_global_pool(const data_T &in_elem, typename CONFIG_T::accum_t data_ data_pack[p] = in_elem[p * CONFIG_T::n_filt + c]; } data_window[c] = reduce_global_pool( - data_window[c], data_pack); + data_window[c], data_pack + ); } } diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_recurrent.h b/hls4ml/templates/catapult/nnet_utils/nnet_recurrent.h old mode 100755 new mode 100644 index f08d4d105..95039bc75 --- a/hls4ml/templates/catapult/nnet_utils/nnet_recurrent.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_recurrent.h @@ -46,11 +46,16 @@ struct lstm_config { // Originall had a version allows for the state in each layer to be saved, moved this to above (this requires are LARGE // dense network at the end) template -void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - res_T s_newstate[CONFIG_T::n_state], typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { +void lstm( + bool reset_state, + data_T data[CONFIG_T::n_in], + res_T h_newstate[CONFIG_T::n_state], + res_T s_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4] +) { // Initialize the state variable -- will maintain state between function calls typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; @@ -88,11 +93,13 @@ void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG } CONFIG_T::template activation_recr::activation( - inputacc_ifo, tmpres_ifo); + inputacc_ifo, tmpres_ifo + ); // Now for the confusion matrix CONFIG_T::template activation::activation( - inputacc_c, tmpres_c); + inputacc_c, tmpres_c + ); // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { @@ -101,7 +108,8 @@ void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG } // Operation: h=act(s)*o CONFIG_T::template activation::activation( - s_newstate, s_actstate); + s_newstate, s_actstate + ); for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { //#pragma HLS UNROLL @@ -110,12 +118,16 @@ void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG } template -void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - res_T s_newstate[CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { +void lstm_static( + bool reset_state, + data_T data[CONFIG_T::n_in], + res_T h_newstate[CONFIG_T::n_state], + res_T s_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4] +) { static res_T h_state[CONFIG_T::n_state]; static res_T s_state[CONFIG_T::n_state]; // Initialize the state variable -- will maintain state between function calls @@ -148,8 +160,9 @@ void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate } nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_state, tmpres_state, param_r, - param_br); + nnet::dense( + h_state, tmpres_state, param_r, param_br + ); for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { //#pragma HLS UNROLL @@ -165,11 +178,13 @@ void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate } CONFIG_T::template activation_recr::activation( - inputacc_ifo, tmpres_ifo); + inputacc_ifo, tmpres_ifo + ); // Now for the confusion matrix CONFIG_T::template activation::activation( - inputacc_c, tmpres_c); + inputacc_c, tmpres_c + ); // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { @@ -179,7 +194,8 @@ void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate } // Operation: h=act(s)*o CONFIG_T::template activation::activation( - s_state, s_actstate); + s_state, s_actstate + ); for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { //#pragma HLS UNROLL @@ -189,11 +205,14 @@ void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate } template -void lstm_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { +void lstm_stack( + data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], + res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4] +) { res_T h_newstate[CONFIG_T::n_state]; res_T s_newstate[CONFIG_T::n_state]; @@ -214,11 +233,13 @@ void lstm_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CO data_in[j] = data[j + iloop * CONFIG_T::n_in]; } if (CONFIG_T::use_static) - nnet::lstm_static(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, - param_br); + nnet::lstm_static( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br + ); else - nnet::lstm(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, - param_br); + nnet::lstm( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br + ); if (CONFIG_T::n_sequence_out > 1) for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { //#pragma HLS UNROLL @@ -234,11 +255,14 @@ void lstm_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CO } template -void lstm_stack(ac_channel &data_stream, ac_channel &res_stream, - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { +void lstm_stack( + ac_channel &data_stream, + ac_channel &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4] +) { typename res_T::value_type h_newstate[CONFIG_T::n_state]; typename res_T::value_type s_newstate[CONFIG_T::n_state]; @@ -267,10 +291,12 @@ void lstm_stack(ac_channel &data_stream, ac_channel &res_stream, } if (CONFIG_T::use_static) nnet::lstm_static( - reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br + ); else nnet::lstm( - reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br + ); if (CONFIG_T::n_sequence_out > 1) { res_T res_pack; //#pragma HLS DATA_PACK variable=res_pack @@ -324,12 +350,16 @@ struct gru_config { }; template -void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], // TODO - Check the layout of the param - // weights - refer page in copy!! - typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { +void gru( + bool reset_state, + data_T data[CONFIG_T::n_in], + res_T h_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], // TODO - Check the layout of the param + // weights - refer page in copy!! + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3] +) { // Initialize the state variable -- will maintain state between function calls typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; @@ -349,8 +379,9 @@ void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_ //#pragma HLS ARRAY_PARTITION variable=inputacc_h complete nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_newstate, tmpres_state_zr, param_zr, - param_br); + nnet::dense( + h_newstate, tmpres_state_zr, param_zr, param_br + ); // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres // initialized with biases -- DONE @@ -361,8 +392,10 @@ void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_ } // Activation function Sub layer -- START - CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); + CONFIG_T::template activation_recr< + typename CONFIG_T::accum_t, + typename CONFIG_T::weight_t, + typename CONFIG_T::ACT_CONFIG_GRU>::activation(inputacc_zr, tmpres_zr); // Activation function Sub layer -- END @@ -380,8 +413,8 @@ void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_ } // Now run the activation on this guy - CONFIG_T::template activation::activation(inputacc_h, tmpres_h); + CONFIG_T::template activation:: + activation(inputacc_h, tmpres_h); // Mix the stat with the previous state for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { @@ -391,11 +424,15 @@ void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_ } template -void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { +void gru_static( + bool reset_state, + data_T data[CONFIG_T::n_in], + res_T h_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3] +) { // Initialize the state variable -- will maintain state between function calls static res_T h_state[CONFIG_T::n_state]; @@ -425,8 +462,9 @@ void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[ } nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_state, tmpres_state_zr, param_zr, - param_br); + nnet::dense( + h_state, tmpres_state_zr, param_zr, param_br + ); // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres // initialized with biases -- DONE @@ -437,8 +475,10 @@ void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[ } // Activation function Sub layer -- START - CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); + CONFIG_T::template activation_recr< + typename CONFIG_T::accum_t, + typename CONFIG_T::weight_t, + typename CONFIG_T::ACT_CONFIG_GRU>::activation(inputacc_zr, tmpres_zr); // Activation function Sub layer -- END @@ -456,8 +496,8 @@ void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[ } // Now run the activation on this guy - CONFIG_T::template activation::activation(inputacc_h, tmpres_h); + CONFIG_T::template activation:: + activation(inputacc_h, tmpres_h); // Mix the stat with the previous state for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { @@ -468,11 +508,14 @@ void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[ } template -void gru_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { +void gru_stack( + data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], + res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3] +) { res_T h_state[CONFIG_T::n_state]; data_T data_in[CONFIG_T::n_in]; @@ -509,11 +552,14 @@ void gru_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CON } template -void gru_stack(ac_channel &data_stream, ac_channel &res_stream, - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { +void gru_stack( + ac_channel &data_stream, + ac_channel &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3] +) { typename res_T::value_type h_newstate[CONFIG_T::n_state]; //#pragma HLS ARRAY_PARTITION variable=h_newstate complete @@ -538,10 +584,12 @@ void gru_stack(ac_channel &data_stream, ac_channel &res_stream, } if (CONFIG_T::use_static) nnet::gru_static( - reset_state, data_in, h_newstate, param, param_zr, param_b, param_br); + reset_state, data_in, h_newstate, param, param_zr, param_b, param_br + ); else - nnet::gru(reset_state, data_in, h_newstate, - param, param_zr, param_b, param_br); + nnet::gru( + reset_state, data_in, h_newstate, param, param_zr, param_b, param_br + ); if (CONFIG_T::n_sequence_out > 1) { res_T res_pack; //#pragma HLS DATA_PACK variable=res_pack diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv1d_stream.h index eb5ef9f7d..d19557243 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv1d_stream.h @@ -9,9 +9,12 @@ namespace nnet { template -void depthwise_conv_1d_encoded_cl(ac_channel &data, ac_channel &res, - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { +void depthwise_conv_1d_encoded_cl( + ac_channel &data, + ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan] +) { assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); ac_channel data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; @@ -39,15 +42,19 @@ void depthwise_conv_1d_encoded_cl(ac_channel &data, ac_channel &r //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor } compute_scaled_indices_1d(i_iw, pixel_idx); - compute_depthwise_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, - weights, biases, pixel_idx); + compute_depthwise_output_encoded( + data.read(), data_window, res, res_pack, outputs_ready, weights, biases, pixel_idx + ); } } template -void depthwise_conv_1d_buffer_cl(ac_channel &data, ac_channel &res, - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { +void depthwise_conv_1d_buffer_cl( + ac_channel &data, + ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan] +) { assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); constexpr int ce_reuse_factor = CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency); @@ -63,9 +70,12 @@ void depthwise_conv_1d_buffer_cl(ac_channel &data, ac_channel &re } template -void depthwise_conv_1d_cl(ac_channel &data, ac_channel &res, - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { +void depthwise_conv_1d_cl( + ac_channel &data, + ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan] +) { #pragma HLS inline recursive switch (CONFIG_T::implementation) { case conv_implementation::linebuffer: @@ -78,9 +88,12 @@ void depthwise_conv_1d_cl(ac_channel &data, ac_channel &res, } template -void pointwise_conv_1d_cl(ac_channel &data, ac_channel &res, - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void pointwise_conv_1d_cl( + ac_channel &data, + ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); assert(CONFIG_T::filt_width == 1); @@ -104,23 +117,28 @@ void pointwise_conv_1d_cl(ac_channel &data, ac_channel &res, } template -void separable_conv_1d_cl(ac_channel &data, ac_channel &res, - typename CONFIG_T::depthwise_config::weight_t - depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], - typename CONFIG_T::pointwise_config::weight_t - pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], - typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], - typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { +void separable_conv_1d_cl( + ac_channel &data, + ac_channel &res, + typename CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] +) { //#pragma HLS DATAFLOW ac_channel depthwise_res; unsigned res_depth = CONFIG_T::depthwise_config::out_width; //#pragma HLS STREAM variable=depthwise_res depth=res_depth - depthwise_conv_1d_cl(data, depthwise_res, depthwise_weights, - depthwise_biases); - pointwise_conv_1d_cl(depthwise_res, res, pointwise_weights, - pointwise_biases); + depthwise_conv_1d_cl( + data, depthwise_res, depthwise_weights, depthwise_biases + ); + pointwise_conv_1d_cl( + depthwise_res, res, pointwise_weights, pointwise_biases + ); } } // namespace nnet diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d.h b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d.h index d98dd8c31..67d89f76a 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d.h @@ -11,7 +11,8 @@ void depthwise_conv_2d_cl( data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_chan], typename CONFIG_T::weight_t depthwise_weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], - typename CONFIG_T::bias_t depthwise_biases[CONFIG_T::n_chan]) { + typename CONFIG_T::bias_t depthwise_biases[CONFIG_T::n_chan] +) { const int in_height = CONFIG_T::in_height; const int in_width = CONFIG_T::in_width; const int n_chan = CONFIG_T::n_chan; @@ -55,26 +56,32 @@ void depthwise_conv_2d_cl( } template -void separable_conv_2d_cl(data_T data[CONFIG_T::depthwise_config::in_height * CONFIG_T::depthwise_config::in_width * - CONFIG_T::depthwise_config::n_chan], - res_T res[CONFIG_T::pointwise_config::out_height * CONFIG_T::pointwise_config::out_width * - CONFIG_T::pointwise_config::n_filt], - typename CONFIG_T::depthwise_config::weight_t - depthwise_weights[CONFIG_T::depthwise_config::filt_height * - CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], - typename CONFIG_T::pointwise_config::weight_t - pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], - typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], - typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { +void separable_conv_2d_cl( + data_T data + [CONFIG_T::depthwise_config::in_height * CONFIG_T::depthwise_config::in_width * CONFIG_T::depthwise_config::n_chan], + res_T + res[CONFIG_T::pointwise_config::out_height * CONFIG_T::pointwise_config::out_width * + CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::weight_t depthwise_weights + [CONFIG_T::depthwise_config::filt_height * CONFIG_T::depthwise_config::filt_width * + CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] +) { //#pragma HLS INLINE region - dw_res_T depthwise_results[CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width * - CONFIG_T::depthwise_config::n_chan]; - depthwise_conv_2d_cl(data, depthwise_results, depthwise_weights, - depthwise_biases); - pointwise_conv_2d_cl(depthwise_results, res, pointwise_weights, - pointwise_biases); + dw_res_T depthwise_results + [CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width * + CONFIG_T::depthwise_config::n_chan]; + depthwise_conv_2d_cl( + data, depthwise_results, depthwise_weights, depthwise_biases + ); + pointwise_conv_2d_cl( + depthwise_results, res, pointwise_weights, pointwise_biases + ); } } // namespace nnet diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d_stream.h index a4f7d4faa..a74533c28 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d_stream.h @@ -11,9 +11,11 @@ namespace nnet { template void depthwise_conv_2d_encoded_cl( - ac_channel &data, ac_channel &res, + ac_channel &data, + ac_channel &res, typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan] +) { assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); assert(CONFIG_T::filt_height == CONFIG_T::filt_width); @@ -45,8 +47,9 @@ void depthwise_conv_2d_encoded_cl( // #pragma HLS PIPELINE II=CONFIG_T::reuse_factor // } compute_scaled_indices_2d(i_ih, i_iw, pixel_idx); - compute_depthwise_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, - weights, biases, pixel_idx); + compute_depthwise_output_encoded( + data.read(), data_window, res, res_pack, outputs_ready, weights, biases, pixel_idx + ); } } } @@ -54,9 +57,11 @@ void depthwise_conv_2d_encoded_cl( // Line Buffer Implementation (Phil's) template void depthwise_conv_2d_buffer_cl( - ac_channel &data, ac_channel &res, + ac_channel &data, + ac_channel &res, typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan] +) { assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); static ap_shift_reg line_buffer[CONFIG_T::filt_height - 1] @@ -84,9 +89,11 @@ void depthwise_conv_2d_buffer_cl( template void depthwise_conv_2d_cl( - ac_channel &data, ac_channel &res, + ac_channel &data, + ac_channel &res, typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan] +) { // #pragma HLS inline recursive switch (CONFIG_T::implementation) { case conv_implementation::linebuffer: @@ -99,9 +106,12 @@ void depthwise_conv_2d_cl( } template -void pointwise_conv_2d_cl(ac_channel &data, ac_channel &res, - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void pointwise_conv_2d_cl( + ac_channel &data, + ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); @@ -128,24 +138,29 @@ void pointwise_conv_2d_cl(ac_channel &data, ac_channel &res, } template -void separable_conv_2d_cl(ac_channel &data, ac_channel &res, - typename CONFIG_T::depthwise_config::weight_t - depthwise_weights[CONFIG_T::depthwise_config::filt_height * - CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], - typename CONFIG_T::pointwise_config::weight_t - pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], - typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], - typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { +void separable_conv_2d_cl( + ac_channel &data, + ac_channel &res, + typename CONFIG_T::depthwise_config::weight_t depthwise_weights + [CONFIG_T::depthwise_config::filt_height * CONFIG_T::depthwise_config::filt_width * + CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt] +) { // #pragma HLS DATAFLOW static ac_channel depthwise_res; unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; // #pragma HLS STREAM variable=depthwise_res depth=res_depth - depthwise_conv_2d_cl(data, depthwise_res, depthwise_weights, - depthwise_biases); - pointwise_conv_2d_cl(depthwise_res, res, pointwise_weights, - pointwise_biases); + depthwise_conv_2d_cl( + data, depthwise_res, depthwise_weights, depthwise_biases + ); + pointwise_conv_2d_cl( + depthwise_res, res, pointwise_weights, pointwise_biases + ); } } // namespace nnet diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv_stream.h index 753d260a7..ecd9aaf51 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv_stream.h @@ -9,9 +9,12 @@ namespace nnet { template -void depthwise_product(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { +void depthwise_product( + data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], + res_T res[CONFIG_T::n_chan], + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan] +) { // #pragma HLS INLINE typename CONFIG_T::accum_t mult[CONFIG_T::kernel_size * CONFIG_T::n_chan]; @@ -36,7 +39,8 @@ void depthwise_product(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], re for (int ii = 0; ii < CONFIG_T::kernel_size * CONFIG_T::n_chan; ii++) { // #pragma HLS UNROLL mult[ii] = CONFIG_T::mult_config::template product::product( - data[ii], weights[ii]); + data[ii], weights[ii] + ); } // Initialize accumulator with input biases @@ -66,10 +70,14 @@ void depthwise_product(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], re } template -void depthwise_mult_buffer(ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], - res_T &res_pack, ac_channel &res_stream, unsigned &outputs_ready, - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { +void depthwise_mult_buffer( + ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + res_T &res_pack, + ac_channel &res_stream, + unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan] +) { //#pragma HLS INLINE typename data_T::value_type data[CONFIG_T::kernel_size * CONFIG_T::n_chan]; @@ -114,10 +122,15 @@ void depthwise_mult_buffer(ac_channel data_window[C template void compute_depthwise_output_encoded( - const data_T &in_elem, ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], - ac_channel &res, res_T &res_pack, unsigned &outputs_ready, + const data_T &in_elem, + ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + ac_channel &res, + res_T &res_pack, + unsigned &outputs_ready, typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan], ac_int *pixel_idx) { + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan], + ac_int *pixel_idx +) { //#pragma HLS INLINE constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; @@ -142,9 +155,12 @@ void compute_depthwise_output_encoded( } template -void pointwise_mult_buffer(const data_T &data_pack, ac_channel &res_stream, - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { +void pointwise_mult_buffer( + const data_T &data_pack, + ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt] +) { //#pragma HLS INLINE typename data_T::value_type data[CONFIG_T::n_chan]; @@ -165,10 +181,12 @@ void pointwise_mult_buffer(const data_T &data_pack, ac_channel &res_strea //#pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { dense_latency( - data, res, weights, biases); + data, res, weights, biases + ); } else { dense_resource( - data, res, weights, biases); + data, res, weights, biases + ); } CastLoop: @@ -182,9 +200,12 @@ void pointwise_mult_buffer(const data_T &data_pack, ac_channel &res_strea // Line Buffer Implementation (Phil's) template -void compute_depthwise_output_buffer_1d(const data_T &in_elem, ac_channel &res_stream, - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { +void compute_depthwise_output_buffer_1d( + const data_T &in_elem, + ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan] +) { //#pragma HLS INLINE // Thresholds @@ -211,8 +232,9 @@ void compute_depthwise_output_buffer_1d(const data_T &in_elem, ac_channel // Dense multiply //#pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - depthwise_product(kernel_data, res_out, - weights, biases); + depthwise_product( + kernel_data, res_out, weights, biases + ); } else { assert("Resource strategy for DepthwiseConv1D is not supported." && false); } @@ -240,12 +262,14 @@ void compute_depthwise_output_buffer_1d(const data_T &in_elem, ac_channel } template -void compute_depthwise_output_buffer_2d(const data_T &in_elem, - ap_shift_reg - line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan], - ac_channel &res_stream, - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { +void compute_depthwise_output_buffer_2d( + const data_T &in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] + [CONFIG_T::n_chan], + ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan] +) { //#pragma HLS INLINE // Thresholds @@ -276,8 +300,9 @@ void compute_depthwise_output_buffer_2d(const data_T &in_elem, // Dense multiply //#pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - depthwise_product(kernel_data, res_out, - weights, biases); + depthwise_product( + kernel_data, res_out, weights, biases + ); } else { assert("Resource strategy for DepthwiseConv2D is not supported." && false); } diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_stream.h index c76bfba5a..c1217f609 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_stream.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_stream.h @@ -126,8 +126,9 @@ void broadcast_stream_1x1xC(ac_channel &data, ac_channel &res) { template void broadcast_stream_HxWx1(ac_channel &data, ac_channel &res) { - assert(CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && - CONFIG_T::in_width == CONFIG_T::out_width); + assert( + CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && CONFIG_T::in_width == CONFIG_T::out_width + ); BroadcastLoop: for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { //#pragma HLS PIPELINE @@ -146,8 +147,7 @@ template void broadcast_stream(ac_channel &data, ac_channel &res) { if (CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan) { broadcast_stream_1x1xC(data, res); - } else if (CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && - CONFIG_T::in_width == CONFIG_T::out_width) { + } else if (CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && CONFIG_T::in_width == CONFIG_T::out_width) { broadcast_stream_HxWx1(data, res); } } diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h index 082ae5dc8..d6d462410 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.h +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -19,8 +19,8 @@ struct MyProject { // kernel property method to config invocation interface auto get(sycl::ext::oneapi::experimental::properties_tag) { - return sycl::ext::oneapi::experimental::properties{sycl::ext::intel::experimental::streaming_interface<>, - sycl::ext::intel::experimental::pipelined<>}; + return sycl::ext::oneapi::experimental::properties{ + sycl::ext::intel::experimental::streaming_interface<>, sycl::ext::intel::experimental::pipelined<>}; } SYCL_EXTERNAL void operator()() const; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h index 13de5ab3b..f77f05a36 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -11,8 +11,8 @@ namespace nnet { // ************************************************* template void linear_stream() { LinearActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + [[intel::initiation_interval(1 + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -31,8 +31,8 @@ template void linear_stream // ************************************************* template void relu_stream() { ReLUActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + [[intel::initiation_interval(1 + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -58,10 +58,8 @@ template void leaky_relu_st constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; LeakyReLUActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { + [[intel::initiation_interval(pipeline + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -84,8 +82,8 @@ template void leaky_relu_st template void thresholded_relu_stream(typename CONFIG_T::param_t theta) { ThresholdedReLUActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + [[intel::initiation_interval(1 + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -113,10 +111,8 @@ template void elu_stream(ty constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; EluActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { + [[intel::initiation_interval(pipeline + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -145,8 +141,8 @@ template void selu_stream() #include "activation_tables/selu_table.tb" SeluActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + [[intel::initiation_interval(1 + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -178,10 +174,8 @@ template void prelu_stream( constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; PReLUActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { + [[intel::initiation_interval(pipeline + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -205,8 +199,8 @@ template void softplus_stre #include "activation_tables/softplus_table.tb" SoftplusActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + [[intel::initiation_interval(1 + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -235,8 +229,8 @@ template void softsign_stre static const int MAX_VALUE = 8; SoftsignActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + [[intel::initiation_interval(1 + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -281,10 +275,8 @@ template void softmax_stabl data_array[std::tuple_size::value_type>{}]; SoftmaxArrayLoop: - [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { + [[intel::initiation_interval(pipeline + )]] for (unsigned i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_pack = data_pipe::read(); SoftmaxArrayPackLoop: @@ -295,14 +287,18 @@ template void softmax_stabl // Find the max and compute all delta(x_i, x_max) Op_max::value_type::value_type> op_max; - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type x_max = - reduce::value_type::value_type, - std::tuple_size::value_type>{}, - Op_max::value_type::value_type>>(data_array, op_max); + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type x_max = reduce< + typename ExtractPipeType::value_type::value_type, + std::tuple_size::value_type>{}, + Op_max::value_type::value_type>>(data_array, op_max); // For the diffs, use the same type as the input but force rounding and saturation - [[intel::fpga_register]] ac_fixed::value_type::value_type::width, - ExtractPipeType::value_type::value_type::i_width, true, AC_RND, AC_SAT> + [[intel::fpga_register]] ac_fixed< + ExtractPipeType::value_type::value_type::width, + ExtractPipeType::value_type::value_type::i_width, + true, + AC_RND, + AC_SAT> d_xi_xmax[std::tuple_size::value_type>{}]; #pragma unroll for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { @@ -314,17 +310,18 @@ template void softmax_stabl typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; #pragma unroll for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - exp_res[j] = - exp_table[softmax_stable_idx_from_real_val::value_type::value_type, - CONFIG_T>(d_xi_xmax[j])]; + exp_res[j] = exp_table[softmax_stable_idx_from_real_val< + typename ExtractPipeType::value_type::value_type, + CONFIG_T>(d_xi_xmax[j])]; } // Explicitly sum the results with an adder tree. // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; - [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = - reduce::value_type>{}, - Op_add>(exp_res, op_add); + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = reduce< + typename CONFIG_T::exp_table_t, + std::tuple_size::value_type>{}, + Op_add>(exp_res, op_add); [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val(exp_sum)]; @@ -357,17 +354,16 @@ template void softmax_laten typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; SoftmaxExpLoop: - [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { + [[intel::initiation_interval(pipeline + )]] for (unsigned i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_pack = data_pipe::read(); SoftmaxExpPackLoop: #pragma unroll for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val< - typename ExtractPipeType::value_type::value_type, CONFIG_T>(in_pack[j])]; + typename ExtractPipeType::value_type::value_type, + CONFIG_T>(in_pack[j])]; } // Explicitly sum the results with an adder tree. @@ -404,10 +400,8 @@ template void softmax_legac data_cache[std::tuple_size::value_type>{}]; SoftmaxInitLoop: - [[intel::initiation_interval(1)]] for (unsigned s = 0; - s < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - s++) { + [[intel::initiation_interval(1 + )]] for (unsigned s = 0; s < CONFIG_T::n_in / std::tuple_size::value_type>{}; s++) { auto in_pack = data_pipe::read(); SoftmaxInitPackLoop: @@ -456,8 +450,8 @@ template void softmax_legac } template void softmax_argmax_stream() { - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + [[intel::initiation_interval(1 + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -470,7 +464,8 @@ template void softmax_argma [[intel::fpga_register]] int idx = 0; [[intel::initiation_interval(1)]] for (int i = 1; - i < std::tuple_size::value_type>{}; i++) { + i < std::tuple_size::value_type>{}; + i++) { if (in_data[i] > maximum) { maximum = in_data[i]; idx = i; @@ -514,10 +509,8 @@ template void dense_tanh_st constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; TanHActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { + [[intel::initiation_interval(pipeline + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -560,10 +553,8 @@ template void sigmoid_strea constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; SigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { + [[intel::initiation_interval(pipeline + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -604,10 +595,8 @@ template void hard_sigmoid_ constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; HardSigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { + [[intel::initiation_interval(pipeline + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -634,10 +623,8 @@ template void hard_tanh_str constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; HardSigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { + [[intel::initiation_interval(pipeline + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); typename ExtractPipeType::value_type out_data; @@ -662,8 +649,8 @@ template void hard_tanh_str // ************************************************* template void binary_tanh_stream() { BinaryTanHActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + [[intel::initiation_interval(1 + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { [[intel::fpga_register]] auto in_data = data_pipe::read(); [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; @@ -686,8 +673,8 @@ template void binary_tanh_s // ************************************************* template void ternary_tanh_stream() { TernaryTanHActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + [[intel::initiation_interval(1 + )]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { [[intel::fpga_register]] auto in_data = data_pipe::read(); [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h index f8e5bcb79..3039ef05b 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h @@ -29,8 +29,9 @@ struct batchnorm_config { }; template -void normalize(const data_T &data, res_T &res, const typename CONFIG_T::scale_t &scale, - const typename CONFIG_T::bias_t &bias) { +void normalize( + const data_T &data, res_T &res, const typename CONFIG_T::scale_t &scale, const typename CONFIG_T::bias_t &bias +) { // Calcuate result Result: #pragma unroll @@ -38,13 +39,15 @@ void normalize(const data_T &data, res_T &res, const typename CONFIG_T::scale_t if (CONFIG_T::n_filt == -1) { res[ires] = CONFIG_T::template product::product( - data[ires], scale[ires]) + + data[ires], scale[ires] + ) + bias[ires]; } else { int norm_index = ires % CONFIG_T::n_filt; res[ires] = CONFIG_T::template product::product( - data[ires], scale[norm_index]) + + data[ires], scale[norm_index] + ) + bias[norm_index]; } } @@ -82,8 +85,12 @@ void normalize_binary_tanh(const data_T &data, res_T &res, const typename CONFIG } template -void normalize_ternary_tanh(const data_T &data, res_T &res, const typename CONFIG_T::threshold_hi_t &threshold_hi, - const typename CONFIG_T::threshold_lo_t &threshold_lo) { +void normalize_ternary_tanh( + const data_T &data, + res_T &res, + const typename CONFIG_T::threshold_hi_t &threshold_hi, + const typename CONFIG_T::threshold_lo_t &threshold_lo +) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { ac_int<2, true> cache; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h index 128b3ac1a..259ae5026 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h @@ -17,8 +17,9 @@ void normalize_stream(typename CONFIG_T::scale_t scale, typename CONFIG_T::bias_ constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); constexpr unsigned pipeline = CONFIG_T::n_in / multiplier_limit; constexpr auto datasize = std::tuple_size::value_type>{}; - CONFIG_T::template product::value_type::value_type, - typename CONFIG_T::scale_t::value_type>::limit(multiplier_limit); + CONFIG_T::template product< + typename ExtractPipeType::value_type::value_type, + typename CONFIG_T::scale_t::value_type>::limit(multiplier_limit); BatchNormLoop: [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { @@ -33,10 +34,10 @@ void normalize_stream(typename CONFIG_T::scale_t scale, typename CONFIG_T::bias_ norm_index = i * datasize + j; else norm_index = j % CONFIG_T::n_filt; - out_data[j] = - CONFIG_T::template product::value_type::value_type, - typename CONFIG_T::scale_t::value_type>::product(in_data[j], scale[norm_index]) + - bias[norm_index]; + out_data[j] = CONFIG_T::template product< + typename ExtractPipeType::value_type::value_type, + typename CONFIG_T::scale_t::value_type>::product(in_data[j], scale[norm_index]) + + bias[norm_index]; } res_pipe::write(out_data); @@ -72,8 +73,9 @@ void normalize_binary_tanh_stream(typename CONFIG_T::threshold_t threshold) { } template -void normalize_ternary_tanh_stream(typename CONFIG_T::threshold_hi_t threshold_hi, - typename CONFIG_T::threshold_lo_t threshold_lo) { +void normalize_ternary_tanh_stream( + typename CONFIG_T::threshold_hi_t threshold_hi, typename CONFIG_T::threshold_lo_t threshold_lo +) { constexpr auto datasize = std::tuple_size::value_type>{}; TernaryNormLoop: diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h index 38560f120..db35afde0 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h @@ -44,14 +44,16 @@ struct conv1d_config { }; template -void conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { +void conv_1d_cl( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { conv_1d_resource_cl(data, res, weights, biases); } template -void pointwise_conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { +void pointwise_conv_1d_cl( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { assert(CONFIG_T::filt_width == 1); pointwise_conv_1d_resource_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h index 85009d4a3..03ce26078 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h @@ -37,8 +37,9 @@ void im2col_1d_cl(const data_T &data, data_col_T &data_col, const int col) { } template -void conv_1d_im2col_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { +void conv_1d_im2col_cl( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { // im2col performs no filter transformations; therefore, filter size remains constant assert(CONFIG_T::filt_width == CONFIG_T::impl_filt_width); @@ -86,8 +87,9 @@ inline void winograd_transform_input_tile_3x1_kernel(const data_T I[4], res_T D[ } template -void winograd_conv1d_3x1_kernel_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { +void winograd_conv1d_3x1_kernel_cl( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { // Ensure Winograd conditions are met assert(CONFIG_T::filt_width == 3); assert(CONFIG_T::stride_width == 1); @@ -175,8 +177,9 @@ void im2col_1d_pointwise_cl(const data_T &data, data_col_T &data_col, const int } template -void pointwise_conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { +void pointwise_conv_1d_resource_cl( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { assert(CONFIG_T::filt_width == 1); // Unroll factor for loop traversing input image, derived from parallelization_factor @@ -213,8 +216,9 @@ void pointwise_conv_1d_resource_cl(const data_T &data, res_T &res, const typenam // Top-level function - handles different implementations // **************************************************************** template -void conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { +void conv_1d_resource_cl( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { static constexpr bool winograd_conditions = // Winograd's minimal filtering algorithm not applicable to stride != 1 CONFIG_T::stride_width == 1 && diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h index 1ffd11774..3efe9ae43 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h @@ -58,7 +58,8 @@ void shift_line_buffer_1d( const data_T &in_elem, nnet::shift_reg line_buffer[CONFIG_T::n_chan], - typename data_T::value_type shift_buffer[CONFIG_T::n_chan]) { + typename data_T::value_type shift_buffer[CONFIG_T::n_chan] +) { // For every channel, insert the incoming pixel at end of the shift buffer UpdateBuffer: #pragma unroll @@ -87,8 +88,12 @@ void compute_output_buffer_1d( const data_T &in_elem, nnet::shift_reg line_buffer[CONFIG_T::n_chan], - data_window_T &kernel_window, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases, - int &pX, int &sX) { + data_window_T &kernel_window, + const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases, + int &pX, + int &sX +) { using res_T = typename ExtractPipeType::value_type; @@ -153,22 +158,25 @@ void conv_1d_cl_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::b // Input image left-side padding PaddingLeftWidth: for (int col = 0; col < CONFIG_T::pad_left; col++) { - compute_output_buffer_1d(padds, line_buffer, kernel_window, weights, - biases, pX, sX); + compute_output_buffer_1d( + padds, line_buffer, kernel_window, weights, biases, pX, sX + ); } // Read input image ReadInputWidth: for (int col = 0; col < CONFIG_T::in_width; col++) { - compute_output_buffer_1d(data_pipe::read(), line_buffer, - kernel_window, weights, biases, pX, sX); + compute_output_buffer_1d( + data_pipe::read(), line_buffer, kernel_window, weights, biases, pX, sX + ); } // Input image right-side padding PaddingRightWidth: for (int col = 0; col < CONFIG_T::pad_right; col++) { - compute_output_buffer_1d(padds, line_buffer, kernel_window, weights, - biases, pX, sX); + compute_output_buffer_1d( + padds, line_buffer, kernel_window, weights, biases, pX, sX + ); } } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h index 79b1508c5..8c7bc7384 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h @@ -50,14 +50,16 @@ struct conv2d_config { }; template -void conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { +void conv_2d_cl( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { conv_2d_resource_cl(data, res, weights, biases); } template -void pointwise_conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { +void pointwise_conv_2d_cl( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); pointwise_conv_2d_resource_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h index 7265d90e1..c61a2e4b5 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h @@ -47,8 +47,9 @@ void im2col_2d_cl(const data_T &data, data_col_T &data_col, const int row, const } template -void conv_2d_im2col_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { +void conv_2d_im2col_cl( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { // im2col performs no filter transformations; therefore, filter size remains constant assert(CONFIG_T::filt_height == CONFIG_T::impl_filt_height && CONFIG_T::filt_width == CONFIG_T::impl_filt_width); @@ -118,8 +119,9 @@ inline void winograd_transform_input_tile_3x3_kernel(const data_T I[16], res_T D } template -void winograd_conv2d_3x3_kernel_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { +void winograd_conv2d_3x3_kernel_cl( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { // Ensure Winograd conditions are met assert(CONFIG_T::filt_height == 3 && CONFIG_T::filt_width == 3); assert(CONFIG_T::stride_height == 1 && CONFIG_T::stride_width == 1); @@ -184,20 +186,24 @@ void winograd_conv2d_3x3_kernel_cl(const data_T &data, res_T &res, const typenam // Explicitly transform intermediate result Z = A'YA and save to output res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + col) + filter] += - static_cast(Y[0] + Y[1] + Y[2] + Y[4] + Y[5] + Y[6] + Y[8] + Y[9] + - Y[10]); + static_cast( + Y[0] + Y[1] + Y[2] + Y[4] + Y[5] + Y[6] + Y[8] + Y[9] + Y[10] + ); if ((col + 1) < CONFIG_T::out_height) res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + (col + 1)) + filter] += - static_cast(Y[1] - Y[2] - Y[3] + Y[5] - Y[6] - Y[7] + Y[9] - Y[10] - - Y[11]); + static_cast( + Y[1] - Y[2] - Y[3] + Y[5] - Y[6] - Y[7] + Y[9] - Y[10] - Y[11] + ); if ((row + 1) < CONFIG_T::out_width) res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + col) + filter] += - static_cast(Y[4] + Y[5] + Y[6] - Y[8] - Y[9] - Y[10] - Y[12] - - Y[13] - Y[14]); + static_cast( + Y[4] + Y[5] + Y[6] - Y[8] - Y[9] - Y[10] - Y[12] - Y[13] - Y[14] + ); if ((row + 1) < (CONFIG_T::out_width) && (col + 1) < CONFIG_T::out_height) res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + (col + 1)) + filter] += - static_cast(Y[5] - Y[6] - Y[7] - Y[9] + Y[10] + Y[11] + Y[15] - - Y[13] + Y[14]); + static_cast( + Y[5] - Y[6] - Y[7] - Y[9] + Y[10] + Y[11] + Y[15] - Y[13] + Y[14] + ); } } } @@ -231,8 +237,9 @@ void im2col_2d_pointwise_cl(const data_T &data, data_col_T &data_col, const int } template -void pointwise_conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { +void pointwise_conv_2d_resource_cl( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); // Unroll factors for loop traversing input image, derived from parallelization_factor @@ -272,8 +279,9 @@ void pointwise_conv_2d_resource_cl(const data_T &data, res_T &res, const typenam // Top-level function - handles different implementations // **************************************************************** template -void conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { +void conv_2d_resource_cl( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { static constexpr bool winograd_conditions = // Winograd's minimal filtering algorithm not applicable to stride != 1 CONFIG_T::stride_height == 1 && CONFIG_T::stride_width == 1 && diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h index 08f0eaa87..04a8f5119 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h @@ -16,8 +16,9 @@ namespace nnet { * Values from shift_buffer are inserted into kernel_window, updating the values to be convolved */ template -void kernel_shift_2d(typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan], - data_window_T &kernel_window) { +void kernel_shift_2d( + typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan], data_window_T &kernel_window +) { /* * Manually shift kernel_window by one step to the left * Not possible to use nnet::shift_reg as the kernel window is convolved with the kernel weights using dense matrix @@ -47,8 +48,9 @@ void kernel_shift_2d(typename data_T::value_type shift_buffer[CONFIG_T::filt_hei KernelPushChannel: #pragma unroll for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { - kernel_window[(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + col * CONFIG_T::filt_width * CONFIG_T::n_chan + - channel] = shift_buffer[col][channel]; + kernel_window + [(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + col * CONFIG_T::filt_width * CONFIG_T::n_chan + channel] = + shift_buffer[col][channel]; } } } @@ -69,7 +71,8 @@ void shift_line_buffer_2d( const data_T &in_elem, nnet::shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan], - typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan]) { + typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan] +) { // For every channel, insert the incoming pixel at end of the shift buffer UpdateBuffer: #pragma unroll @@ -114,8 +117,14 @@ void compute_output_buffer_2d( const data_T &in_elem, nnet::shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan], - data_window_T &kernel_window, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases, - int &pX, int &pY, int &sX, int &sY) { + data_window_T &kernel_window, + const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases, + int &pX, + int &pY, + int &sX, + int &sY +) { using res_T = typename ExtractPipeType::value_type; @@ -196,8 +205,9 @@ void conv_2d_cl_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::b [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::pad_top; row++) { PaddingTopWidth: for (int col = 0; col < CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right; col++) { - compute_output_buffer_2d(padds, line_buffer, kernel_window, - weights, biases, pX, pY, sX, sY); + compute_output_buffer_2d( + padds, line_buffer, kernel_window, weights, biases, pX, pY, sX, sY + ); } } @@ -206,22 +216,25 @@ void conv_2d_cl_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::b // Input image left-side padding PaddingLeftWidth: for (int col = 0; col < CONFIG_T::pad_left; col++) { - compute_output_buffer_2d(padds, line_buffer, kernel_window, - weights, biases, pX, pY, sX, sY); + compute_output_buffer_2d( + padds, line_buffer, kernel_window, weights, biases, pX, pY, sX, sY + ); } // Read input image ReadInputWidth: for (int col = 0; col < CONFIG_T::in_width; col++) { compute_output_buffer_2d( - data_pipe::read(), line_buffer, kernel_window, weights, biases, pX, pY, sX, sY); + data_pipe::read(), line_buffer, kernel_window, weights, biases, pX, pY, sX, sY + ); } // Input image right-side padding PaddingRightWidth: for (int col = 0; col < CONFIG_T::pad_right; col++) { - compute_output_buffer_2d(padds, line_buffer, kernel_window, - weights, biases, pX, pY, sX, sY); + compute_output_buffer_2d( + padds, line_buffer, kernel_window, weights, biases, pX, pY, sX, sY + ); } } @@ -230,8 +243,9 @@ void conv_2d_cl_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::b [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::pad_bottom; row++) { PaddingBottomWidth: for (int col = 0; col < CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right; col++) { - compute_output_buffer_2d(padds, line_buffer, kernel_window, - weights, biases, pX, pY, sX, sY); + compute_output_buffer_2d( + padds, line_buffer, kernel_window, weights, biases, pX, pY, sX, sY + ); } } } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h index dc7618908..39ce539e3 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -37,10 +37,13 @@ struct dense_config { }; template -void dense_rf_gt(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { - assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && - "The current Reuse Factor is not allowed"); +void dense_rf_gt( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { + assert( + (CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed" + ); assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); //#pragma ii CONFIG_T::reuse_factor [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; @@ -74,7 +77,8 @@ void dense_rf_gt(const data_T &data, res_T &res, const typename CONFIG_T::weight // Modified this tmp_acc[im] = CONFIG_T::template product::product( - data[data_index], weights[w_index]); + data[data_index], weights[w_index] + ); } [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; ResetMult: @@ -103,10 +107,13 @@ void dense_rf_gt(const data_T &data, res_T &res, const typename CONFIG_T::weight } } template -void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { - assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && - "The current Reuse Factor is not allowed"); +void dense_rf_lt( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { + assert( + (CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed" + ); assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN"); [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; @@ -127,7 +134,8 @@ void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight // Modified this mult[im] = CONFIG_T::template product::product( - data[in_index], weights[w_index]); + data[in_index], weights[w_index] + ); in_index += CONFIG_T::reuse_factor; if (in_index >= CONFIG_T::n_in) in_index = ir; @@ -152,8 +160,9 @@ void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight } } template -void dense_resource(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::bias_t &biases) { +void dense_resource( + const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases +) { if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { dense_rf_lt(data, res, weights, biases); } else { diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h index 92c9adc3b..8f37d1564 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h @@ -13,8 +13,10 @@ void dense_resource_stream(typename CONFIG_T::weight_t weights, typename CONFIG_ [[intel::fpga_register]] typename ExtractPipeType::value_type res; [[intel::fpga_register]] auto data = data_pipe::read(); - dense_resource::value_type, typename ExtractPipeType::value_type, - CONFIG_T>(data, res, weights, biases); + dense_resource< + typename ExtractPipeType::value_type, + typename ExtractPipeType::value_type, + CONFIG_T>(data, res, weights, biases); res_pipe::write(res); } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h index d1262f437..9e1a5e35d 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h @@ -89,7 +89,8 @@ void dot1d(const input1_T &data1, const input2_T &data2, res_T &res) { #pragma unroll multiplier_limit for (int i = 0; i < CONFIG_T::n_in; i++) { mult[i] = CONFIG_T::template product::product( - data1[i], data2[i]); + data1[i], data2[i] + ); } [[intel::fpga_register]] typename CONFIG_T::accum_t acc = 0; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h index 60028ea52..ad5e924d3 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h @@ -85,7 +85,8 @@ template ::value_type::value_type>( - (in_data1[j] + in_data2[j]) / (typename ExtractPipeType::value_type::value_type)2); + (in_data1[j] + in_data2[j]) / (typename ExtractPipeType::value_type::value_type)2 + ); } res_pipe::write(out_data); @@ -108,7 +109,8 @@ template ::value_type::value_type>( - (in_data1[j] > in_data2[j]) ? in_data1[j] : in_data2[j]); + (in_data1[j] > in_data2[j]) ? in_data1[j] : in_data2[j] + ); } res_pipe::write(out_data); @@ -131,7 +133,8 @@ template ::value_type::value_type>( - (in_data1[j] < in_data2[j]) ? in_data1[j] : in_data2[j]); + (in_data1[j] < in_data2[j]) ? in_data1[j] : in_data2[j] + ); } res_pipe::write(out_data); diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h index c7dfc2d7c..88883a1f2 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h @@ -87,17 +87,17 @@ template class weight_exponential : public Product { // TO-DO: These may need extra variants if ac_int types are used in more places template -inline typename std::enable_if>::value && - std::is_same>::value, - ac_int>::type +inline typename std::enable_if< + std::is_same>::value && std::is_same>::value, + ac_int>::type cast(typename CONFIG_T::accum_t x) { return static_cast>(((x - CONFIG_T::n_in / 2) * 2).to_ac_int()); } template -inline typename std::enable_if>::value && - !std::is_same>::value, - res_T>::type +inline typename std::enable_if< + std::is_same>::value && !std::is_same>::value, + res_T>::type cast(typename CONFIG_T::accum_t x) { return static_cast(x); } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h index d4ae91533..0886f44be 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h @@ -118,7 +118,9 @@ template void pooling1d_cl(const // Pooling operation res[(inp_col / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] = static_cast( pool_op( - pool, img_overlap)); + pool, img_overlap + ) + ); } } } @@ -139,7 +141,8 @@ template void global_pooling1d_cl } res[filt] = static_cast( - pool_op(pool)); + pool_op(pool) + ); } } @@ -210,9 +213,9 @@ template void pooling2d_cl(const img_overlap++; } else { // Current element is from input image - pool[pool_col * CONFIG_T::stride_width + pool_row] = - data[(inp_col + pool_col - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt + - (inp_width + pool_row - CONFIG_T::pad_left) * CONFIG_T::n_filt + filt]; + pool[pool_col * CONFIG_T::stride_width + pool_row] = data + [(inp_col + pool_col - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt + + (inp_width + pool_row - CONFIG_T::pad_left) * CONFIG_T::n_filt + filt]; img_overlap++; } } @@ -221,9 +224,11 @@ template void pooling2d_cl(const // Pooling operation res[(inp_col / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + (inp_width / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] = - static_cast( - pool_op(pool, img_overlap)); + static_cast(pool_op< + typename data_T::value_type, + CONFIG_T::pool_height * CONFIG_T::pool_width, + CONFIG_T::pool_op, + typename CONFIG_T::accum_t>(pool, img_overlap)); } } } @@ -246,9 +251,11 @@ template void global_pooling2d_cl pool[i] = data[i * CONFIG_T::n_filt + filt]; } - res[filt] = static_cast( - pool_op(pool)); + res[filt] = static_cast(pool_op< + typename data_T::value_type, + CONFIG_T::in_height * CONFIG_T::in_width, + CONFIG_T::pool_op, + typename CONFIG_T::accum_t>(pool)); } } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h index 9c30aab67..1c0894274 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h @@ -24,9 +24,13 @@ namespace nnet { * */ template -void compute_pool_buffer_1d(const data_T &in_elem, - nnet::shift_reg line_buffer[CONFIG_T::n_filt], - data_window_T &kernel_window, int &pX, int &sX) { +void compute_pool_buffer_1d( + const data_T &in_elem, + nnet::shift_reg line_buffer[CONFIG_T::n_filt], + data_window_T &kernel_window, + int &pX, + int &sX +) { using res_T = typename ExtractPipeType::value_type; @@ -59,7 +63,9 @@ void compute_pool_buffer_1d(const data_T &in_elem, // Step 3 - Pooling res_pack[filter] = static_cast( pool_op( - pool_window)); + pool_window + ) + ); } // Write result to output stream @@ -98,8 +104,9 @@ template void pooling1d_cl_ // Read input image ReadInputWidth: for (int col = 0; col < CONFIG_T::in_width; col++) { - compute_pool_buffer_1d(data_pipe::read(), line_buffer, kernel_window, - pX, sX); + compute_pool_buffer_1d( + data_pipe::read(), line_buffer, kernel_window, pX, sX + ); } } @@ -119,10 +126,16 @@ template void pooling1d_cl_ * */ template -void compute_pool_buffer_2d(const data_T &in_elem, - nnet::shift_reg - line_buffer[CONFIG_T::pool_height - 1][CONFIG_T::n_filt], - data_window_T &kernel_window, int &pX, int &pY, int &sX, int &sY) { +void compute_pool_buffer_2d( + const data_T &in_elem, + nnet::shift_reg line_buffer[CONFIG_T::pool_height - 1] + [CONFIG_T::n_filt], + data_window_T &kernel_window, + int &pX, + int &pY, + int &sX, + int &sY +) { using res_T = typename ExtractPipeType::value_type; @@ -154,9 +167,11 @@ void compute_pool_buffer_2d(const data_T &in_elem, } // Step 3 - Pooling - res_pack[filter] = static_cast( - pool_op(pool_window)); + res_pack[filter] = static_cast(pool_op< + typename data_T::value_type, + CONFIG_T::pool_height * CONFIG_T::pool_width, + CONFIG_T::pool_op, + typename CONFIG_T::accum_t>(pool_window)); } // Write result to output stream @@ -211,8 +226,9 @@ template void pooling2d_cl_ // Read input image ReadInputWidth: for (int col = 0; col < CONFIG_T::in_width; col++) { - compute_pool_buffer_2d(data_pipe::read(), line_buffer, - kernel_window, pX, pY, sX, sY); + compute_pool_buffer_2d( + data_pipe::read(), line_buffer, kernel_window, pX, pY, sX, sY + ); } } } @@ -239,7 +255,8 @@ template void compute_global_pool #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_filt; i++) { data_input[i] = reduce_global_pool( - data_input[i], in_elem[i]); + data_input[i], in_elem[i] + ); } } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h index 4c20f28d1..20be7a855 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h @@ -98,9 +98,14 @@ struct gru_config { }; template -void gru_cell(const data_T &x, h_T &h, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::recurrent_weight_t &recurrent_weights, const typename CONFIG_T::bias_t &bias, - const typename CONFIG_T::recurrent_bias_t &recurrent_bias) { +void gru_cell( + const data_T &x, + h_T &h, + const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::recurrent_weight_t &recurrent_weights, + const typename CONFIG_T::bias_t &bias, + const typename CONFIG_T::recurrent_bias_t &recurrent_bias +) { static constexpr int recurrent_unroll_factor = CONFIG_T::n_units / CONFIG_T::reuse_factor; // A matrix containing the values of matrix product between input (x) and weights (weights), for update, reset and // candidate state gates, for each of the units @@ -113,8 +118,9 @@ void gru_cell(const data_T &x, h_T &h, const typename CONFIG_T::weight_t &weight // A matrix containing the values of matrix product between previou state (h) and recurrent weights (recurrent_weights), // for update, reset and candidate state gates, for each of the units [[intel::fpga_register]] accum_array_T mat_mul_h_wr; - nnet::dense_resource(h, mat_mul_h_wr, recurrent_weights, - recurrent_bias); + nnet::dense_resource( + h, mat_mul_h_wr, recurrent_weights, recurrent_bias + ); // A vector containing both the values of z(t) and r(t) for every state using z_activ_array_T = array; @@ -129,8 +135,8 @@ void gru_cell(const data_T &x, h_T &h, const typename CONFIG_T::weight_t &weight // Activation on z(t) and r(t) [[intel::fpga_register]] z_activ_array_T z_r_act; - CONFIG_T::template activation_recr::activation(z_r, z_r_act); + CONFIG_T::template activation_recr:: + activation(z_r, z_r_act); // A matrix containing the values of Hadamard product between r(t) = z_r_act[n_units:2*n_units] and h(t-1) = h using h_activ_array_T = array; @@ -150,8 +156,9 @@ void gru_cell(const data_T &x, h_T &h, const typename CONFIG_T::weight_t &weight // Activation on candidate state [[intel::fpga_register]] h_activ_array_T h_cand_act; - CONFIG_T::template activation::activation(h_cand, - h_cand_act); + CONFIG_T::template activation::activation( + h_cand, h_cand_act + ); // Update state #pragma unroll recurrent_unroll_factor @@ -161,9 +168,14 @@ void gru_cell(const data_T &x, h_T &h, const typename CONFIG_T::weight_t &weight } template -void gru(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, - const typename CONFIG_T::recurrent_weight_t &recurrent_weights, const typename CONFIG_T::bias_t &bias, - const typename CONFIG_T::recurrent_bias_t &recurrent_bias) { +void gru( + const data_T &data, + res_T &res, + const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::recurrent_weight_t &recurrent_weights, + const typename CONFIG_T::bias_t &bias, + const typename CONFIG_T::recurrent_bias_t &recurrent_bias +) { using h_T = array; [[intel::fpga_register]] data_T x; @@ -229,8 +241,14 @@ struct simpleRNN_config { }; template -void simple_rnn_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, const typename CONFIG_T::weight_t &kernel, - const typename CONFIG_T::recurrent_weight_t &rec_kernel, const typename CONFIG_T::bias_t &bias) { +void simple_rnn_cell( + const in_T &inputs, + h_T &hidden_state, + h_T &hidden_state_o, + const typename CONFIG_T::weight_t &kernel, + const typename CONFIG_T::recurrent_weight_t &rec_kernel, + const typename CONFIG_T::bias_t &bias +) { using accum_array_T = array; // Weight multiplication @@ -243,8 +261,9 @@ void simple_rnn_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, // Hidden state [[intel::fpga_register]] accum_array_T hiddenCand; - multiply_U(hidden_state, hiddenCand, - rec_kernel); + multiply_U( + hidden_state, hiddenCand, rec_kernel + ); // Vector addition [[intel::fpga_register]] accum_array_T afterAdd; @@ -255,8 +274,13 @@ void simple_rnn_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, } template -void simple_rnn(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &kernel, - const typename CONFIG_T::recurrent_weight_t &rec_kernel, const typename CONFIG_T::bias_t &bias) { +void simple_rnn( + const data_T &data, + res_T &res, + const typename CONFIG_T::weight_t &kernel, + const typename CONFIG_T::recurrent_weight_t &rec_kernel, + const typename CONFIG_T::bias_t &bias +) { using in_T = array; using h_T = array; @@ -345,13 +369,25 @@ struct lstm_config { }; template -void lstm_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, h_T &cell_state, h_T &cell_state_o, - const typename CONFIG_T::weight_i_t &WI, const typename CONFIG_T::weight_f_t &WF, - const typename CONFIG_T::weight_c_t &WC, const typename CONFIG_T::weight_o_t &WO, - const typename CONFIG_T::recurrent_weight_i_t &RWI, const typename CONFIG_T::recurrent_weight_f_t &RWF, - const typename CONFIG_T::recurrent_weight_c_t &RWC, const typename CONFIG_T::recurrent_weight_o_t &RWO, - const typename CONFIG_T::bias_i_t &BI, const typename CONFIG_T::bias_f_t BF, - const typename CONFIG_T::bias_c_t &BC, const typename CONFIG_T::bias_o_t BO) { +void lstm_cell( + const in_T &inputs, + h_T &hidden_state, + h_T &hidden_state_o, + h_T &cell_state, + h_T &cell_state_o, + const typename CONFIG_T::weight_i_t &WI, + const typename CONFIG_T::weight_f_t &WF, + const typename CONFIG_T::weight_c_t &WC, + const typename CONFIG_T::weight_o_t &WO, + const typename CONFIG_T::recurrent_weight_i_t &RWI, + const typename CONFIG_T::recurrent_weight_f_t &RWF, + const typename CONFIG_T::recurrent_weight_c_t &RWC, + const typename CONFIG_T::recurrent_weight_o_t &RWO, + const typename CONFIG_T::bias_i_t &BI, + const typename CONFIG_T::bias_f_t BF, + const typename CONFIG_T::bias_c_t &BC, + const typename CONFIG_T::bias_o_t BO +) { using accum_array_T = array; @@ -398,15 +434,17 @@ void lstm_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, h_T & add_bias(i_afterW, i_afterBias, BI); // Hidden Candidate - multiply_U(hidden_state, i_hiddenCand, - RWI); + multiply_U( + hidden_state, i_hiddenCand, RWI + ); // Vector addition add_vectors(i_afterBias, i_hiddenCand, i_afterAdd); // Activation CONFIG_T::template activation_recr::activation( - i_afterAdd, gate_i); + i_afterAdd, gate_i + ); //-----------Gate F Calculations // Weight multiplication @@ -416,15 +454,17 @@ void lstm_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, h_T & add_bias(f_afterW, f_afterBias, BF); // Hidden Candidate - multiply_U(hidden_state, f_hiddenCand, - RWF); + multiply_U( + hidden_state, f_hiddenCand, RWF + ); // Vector addition add_vectors(f_afterBias, f_hiddenCand, f_afterAdd); // Activation CONFIG_T::template activation_recr::activation( - f_afterAdd, gate_f); + f_afterAdd, gate_f + ); //-----------Gate C Calculations // Weight multiplication @@ -434,15 +474,17 @@ void lstm_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, h_T & add_bias(c_afterW, c_afterBias, BC); // Hidden Candidate - multiply_U(hidden_state, c_hiddenCand, - RWC); + multiply_U( + hidden_state, c_hiddenCand, RWC + ); // Vector addition add_vectors(c_afterBias, c_hiddenCand, c_afterAdd); // Activation CONFIG_T::template activation::activation( - c_afterAdd, gate_c); + c_afterAdd, gate_c + ); //-----------gate I and C multiply // Vector multiplication @@ -456,15 +498,17 @@ void lstm_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, h_T & add_bias(o_afterW, o_afterBias, BO); // Hidden Candidate - multiply_U(hidden_state, o_hiddenCand, - RWO); + multiply_U( + hidden_state, o_hiddenCand, RWO + ); // Vector addition add_vectors(o_afterBias, o_hiddenCand, o_afterAdd); // Activation CONFIG_T::template activation_recr::activation( - o_afterAdd, gate_o); + o_afterAdd, gate_o + ); //-----------Cell State Calculation // Vector multiplication @@ -476,7 +520,8 @@ void lstm_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, h_T & //-----------Forget gate Calculation // Activation CONFIG_T::template activation::activation( - cell_act_add, gate_forget); + cell_act_add, gate_forget + ); // Vector multiplication multiply_vectors(gate_o, gate_forget, h); @@ -490,12 +535,22 @@ void lstm_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, h_T & } template -void lstm(const data_T &data, res_T &res, const typename CONFIG_T::weight_i_t &WI, const typename CONFIG_T::weight_f_t &WF, - const typename CONFIG_T::weight_c_t &WC, const typename CONFIG_T::weight_o_t &WO, - const typename CONFIG_T::recurrent_weight_i_t &RWI, const typename CONFIG_T::recurrent_weight_f_t &RWF, - const typename CONFIG_T::recurrent_weight_c_t &RWC, const typename CONFIG_T::recurrent_weight_o_t &RWO, - const typename CONFIG_T::bias_i_t &BI, const typename CONFIG_T::bias_f_t &BF, - const typename CONFIG_T::bias_c_t &BC, const typename CONFIG_T::bias_o_t &BO) { +void lstm( + const data_T &data, + res_T &res, + const typename CONFIG_T::weight_i_t &WI, + const typename CONFIG_T::weight_f_t &WF, + const typename CONFIG_T::weight_c_t &WC, + const typename CONFIG_T::weight_o_t &WO, + const typename CONFIG_T::recurrent_weight_i_t &RWI, + const typename CONFIG_T::recurrent_weight_f_t &RWF, + const typename CONFIG_T::recurrent_weight_c_t &RWC, + const typename CONFIG_T::recurrent_weight_o_t &RWO, + const typename CONFIG_T::bias_i_t &BI, + const typename CONFIG_T::bias_f_t &BF, + const typename CONFIG_T::bias_c_t &BC, + const typename CONFIG_T::bias_o_t &BO +) { // Note: currently this does not support recurrent bias @@ -533,8 +588,9 @@ void lstm(const data_T &data, res_T &res, const typename CONFIG_T::weight_i_t &W } // Do LSTM - lstm_cell(in, hidden_state_temp, h, cell_state_temp, c, WI, WF, WC, WO, RWI, RWF, RWC, RWO, BI, - BF, BC, BO); + lstm_cell( + in, hidden_state_temp, h, cell_state_temp, c, WI, WF, WC, WO, RWI, RWF, RWC, RWO, BI, BF, BC, BO + ); // Write result #pragma unroll diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h index 7429419cd..831fd1f41 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h @@ -7,8 +7,12 @@ namespace nnet { template -void gru_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::recurrent_weight_t recurrent_weights, - typename CONFIG_T::bias_t bias, typename CONFIG_T::recurrent_bias_t recurrent_bias) { +void gru_stream( + typename CONFIG_T::weight_t weights, + typename CONFIG_T::recurrent_weight_t recurrent_weights, + typename CONFIG_T::bias_t bias, + typename CONFIG_T::recurrent_bias_t recurrent_bias +) { using data_T = typename ExtractPipeType::value_type; using res_T = typename ExtractPipeType::value_type; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h index 2c4991a13..fbd436c80 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h @@ -21,8 +21,8 @@ template void transpose_2d(const template void transpose_3d(const data_T &data, res_T &res) { static constexpr unsigned dim_data[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; - static constexpr unsigned dim_res[3] = {dim_data[CONFIG_T::perm[0]], dim_data[CONFIG_T::perm[1]], - dim_data[CONFIG_T::perm[2]]}; + static constexpr unsigned dim_res[3] = { + dim_data[CONFIG_T::perm[0]], dim_data[CONFIG_T::perm[1]], dim_data[CONFIG_T::perm[2]]}; int index_data[3] = {0}, index_res[3] = {0}; @@ -37,7 +37,8 @@ template void transpose_3d(const res[index_res[0] * dim_res[1] * dim_res[2] + index_res[1] * dim_res[2] + index_res[2]] = static_cast( - data[index_data[0] * dim_data[1] * dim_data[2] + index_data[1] * dim_data[2] + index_data[2]]); + data[index_data[0] * dim_data[1] * dim_data[2] + index_data[1] * dim_data[2] + index_data[2]] + ); } } } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h index 8cf883c1d..b88103da9 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h @@ -26,10 +26,20 @@ template constexpr T zero_array() { // This is a helper to extract the value_type of a pipe template struct ExtractPipeType { typedef T value_type; }; -template