clang-format change: one arg per line for long function call/def #1126

Status: Open · wants to merge 2 commits into base: main
6 changes: 3 additions & 3 deletions .clang-format
@@ -2,7 +2,7 @@
 Language: Cpp
 # BasedOnStyle: LLVM
 AccessModifierOffset: -2
-AlignAfterOpenBracket: Align
+AlignAfterOpenBracket: BlockIndent
 AlignArrayOfStructures: None
 AlignConsecutiveMacros: None
 AlignConsecutiveAssignments: None
@@ -26,8 +26,8 @@ AlwaysBreakBeforeMultilineStrings: false
 AlwaysBreakTemplateDeclarations: MultiLine
 AttributeMacros:
   - __capability
-BinPackArguments: true
-BinPackParameters: true
+BinPackArguments: false
+BinPackParameters: false
 BraceWrapping:
   AfterCaseLabel: false
   AfterClass: false
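For illustration, here is how these three settings change a long declaration (adapted from the nnet_batchnorm.h changes below; a sketch of the expected output, not a verbatim clang-format run): with `BinPackArguments`/`BinPackParameters: false`, any argument or parameter list that does not fit on one line gets exactly one entry per line, and `AlignAfterOpenBracket: BlockIndent` indents the list by one level and puts the closing parenthesis on its own line.

```cpp
// Before: Align + bin-packing fills each continuation line,
// aligning subsequent parameters under the opening parenthesis.
void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in],
               typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
               typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) {
    // ...
}

// After: BlockIndent + no bin-packing gives one parameter per line,
// with the closing parenthesis on its own line.
void normalize(
    data_T data[CONFIG_T::n_in],
    res_T res[CONFIG_T::n_in],
    typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
    typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]
) {
    // ...
}
```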
2 changes: 1 addition & 1 deletion example-models
70 changes: 60 additions & 10 deletions hls4ml/templates/catapult/nnet_utils/nnet_activation.h
@@ -112,10 +112,20 @@ template <class data_T, class res_T, typename CONFIG_T> void relu1(data_T data[C
 // Sigmoid Activation
 // *************************************************

-template </*unsigned K,*/ int W1, int I1, bool S1, ac_q_mode Q1, ac_o_mode O1, int W2, int I2, bool S2, ac_q_mode Q2,
-          ac_o_mode O2>
-void ac_sigmoid_pwl_wrapper(const ac_fixed<W1, I1, S1, Q1, O1>(&input) /*[K]*/,
-                            ac_fixed<W2, I2, S2, Q2, O2>(&output) /*[K]*/) {
+template <
+    /*unsigned K,*/ int W1,
+    int I1,
+    bool S1,
+    ac_q_mode Q1,
+    ac_o_mode O1,
+    int W2,
+    int I2,
+    bool S2,
+    ac_q_mode Q2,
+    ac_o_mode O2>
+void ac_sigmoid_pwl_wrapper(
+    const ac_fixed<W1, I1, S1, Q1, O1>(&input) /*[K]*/, ac_fixed<W2, I2, S2, Q2, O2>(&output) /*[K]*/
+) {
     ac_fixed<W2, I2, false, Q2, O2> tmp; //[K];
     ac_math::ac_sigmoid_pwl<AC_TRN, W1, I1, true, Q1, O1, W2, I2, Q2, O2>(input, tmp);
     output = tmp;
@@ -541,12 +551,42 @@ void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
 #else
 // This is a workaround to help the template deduction to work correctly and fix the inconsistency that HLS4ML expects
 // softmax output to be signed but AC Math softmax knows it is always unsigned
-template <unsigned K, int W1, int I1, bool S1, ac_q_mode Q1, ac_o_mode O1, int W2, int I2, bool S2, ac_q_mode Q2,
-          ac_o_mode O2>
+template <
+    unsigned K,
+    int W1,
+    int I1,
+    bool S1,
+    ac_q_mode Q1,
+    ac_o_mode O1,
+    int W2,
+    int I2,
+    bool S2,
+    ac_q_mode Q2,
+    ac_o_mode O2>
 void ac_softmax_pwl_wrapper(const ac_fixed<W1, I1, S1, Q1, O1> (&input)[K], ac_fixed<W2, I2, S2, Q2, O2> (&output)[K]) {
     ac_fixed<W2, I2, false, Q2, O2> tmp[K];
-    ac_math::ac_softmax_pwl<AC_TRN, false, 0, 0, AC_TRN, AC_WRAP, false, 0, 0, AC_TRN, AC_WRAP, K, W1, I1, S1, Q1, O1, W2,
-                            I2, Q2, O2>(input, tmp);
+    ac_math::ac_softmax_pwl<
+        AC_TRN,
+        false,
+        0,
+        0,
+        AC_TRN,
+        AC_WRAP,
+        false,
+        0,
+        0,
+        AC_TRN,
+        AC_WRAP,
+        K,
+        W1,
+        I1,
+        S1,
+        Q1,
+        O1,
+        W2,
+        I2,
+        Q2,
+        O2>(input, tmp);
     for (unsigned int x = 0; x < K; x++)
         output[x] = tmp[x];
 }
@@ -785,8 +825,18 @@ void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
 }

 #else
-template <ac_q_mode pwl_Q = AC_TRN, int W, int I, bool S, ac_q_mode Q, ac_o_mode O, int outW, int outI, bool outS,
-          ac_q_mode outQ, ac_o_mode outO>
+template <
+    ac_q_mode pwl_Q = AC_TRN,
+    int W,
+    int I,
+    bool S,
+    ac_q_mode Q,
+    ac_o_mode O,
+    int outW,
+    int outI,
+    bool outS,
+    ac_q_mode outQ,
+    ac_o_mode outO>
 void ac_softplus_pwl_wrapper(const ac_fixed<W, I, S, Q, O>(&input), ac_fixed<outW, outI, outS, outQ, outO>(&output)) {
     ac_fixed<outW, outI, false, outQ, outO> tmp;
     ac_math::ac_softplus_pwl<AC_TRN, W, I, S, Q, O, outW, outI, outQ, outO>(input, tmp);
6 changes: 4 additions & 2 deletions hls4ml/templates/catapult/nnet_utils/nnet_array.h
100755 → 100644
@@ -24,8 +24,10 @@ void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[
 }

 template <class data_T, class res_T, typename CONFIG_T>
-void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width],
-                  res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) {
+void transpose_3d(
+    data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width],
+    res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]
+) {
     unsigned dims[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width};
     unsigned dims_t[3];
     dims_t[0] = dims[CONFIG_T::perm[0]];
22 changes: 15 additions & 7 deletions hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h
@@ -28,9 +28,12 @@ struct batchnorm_config {
 };

 template <class data_T, class res_T, typename CONFIG_T>
-void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in],
-               typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
-               typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) {
+void normalize(
+    data_T data[CONFIG_T::n_in],
+    res_T res[CONFIG_T::n_in],
+    typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
+    typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]
+) {
     data_T cache;

     // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
@@ -81,8 +84,9 @@ struct batchnorm_quantized_tanh_config {
 };

 template <class data_T, typename CONFIG_T>
-void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in],
-                           data_T threshold[CONFIG_T::n_in]) {
+void normalize_binary_tanh(
+    data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in], data_T threshold[CONFIG_T::n_in]
+) {
     //#pragma HLS PIPELINE
     //#pragma HLS ARRAY_PARTITION variable=res complete

@@ -101,8 +105,12 @@ void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CON
 }

 template <class data_T, typename CONFIG_T>
-void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ac_int<2, true> res[CONFIG_T::n_in],
-                            data_T threshold_hi[CONFIG_T::n_in], data_T threshold_lo[CONFIG_T::n_in]) {
+void normalize_ternary_tanh(
+    data_T data[CONFIG_T::n_in],
+    ac_int<2, true> res[CONFIG_T::n_in],
+    data_T threshold_hi[CONFIG_T::n_in],
+    data_T threshold_lo[CONFIG_T::n_in]
+) {
     //#pragma HLS PIPELINE
     //#pragma HLS ARRAY_PARTITION variable=res complete

27 changes: 19 additions & 8 deletions hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h
@@ -14,8 +14,12 @@ namespace nnet {
 // ****************************************************

 template <class data_T, class res_T, typename CONFIG_T>
-void normalize(ac_channel<data_T> &data, ac_channel<res_T> &res, typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
-               typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) {
+void normalize(
+    ac_channel<data_T> &data,
+    ac_channel<res_T> &res,
+    typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
+    typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]
+) {
     //#pragma HLS ARRAY_PARTITION variable=scale complete
     //#pragma HLS ARRAY_PARTITION variable=bias complete

@@ -42,7 +46,8 @@ void normalize(ac_channel<data_T> &data, ac_channel<res_T> &res, typename CONFIG
                 norm_index = j % CONFIG_T::n_filt;
             }
             out_data[j] = CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t>::product(
-                              in_data[j], scale[norm_index]) +
+                              in_data[j], scale[norm_index]
+                          ) +
                           bias[norm_index];
         }

@@ -54,8 +59,11 @@
 // Merged Batch Normalization and Quantized Tanh
 // ****************************************************
 template <class data_T, typename CONFIG_T>
-void normalize_binary_tanh(ac_channel<data_T> &data, ac_channel<nnet::array<ac_int<1, false>, CONFIG_T::n_in>> &res,
-                           typename data_T::value_type threshold[CONFIG_T::n_in]) {
+void normalize_binary_tanh(
+    ac_channel<data_T> &data,
+    ac_channel<nnet::array<ac_int<1, false>, CONFIG_T::n_in>> &res,
+    typename data_T::value_type threshold[CONFIG_T::n_in]
+) {
     //#pragma HLS ARRAY_PARTITION variable=threshold complete

 BinaryNormLoop:
@@ -76,9 +84,12 @@
 }

 template <class data_T, typename CONFIG_T>
-void normalize_ternary_tanh(ac_channel<data_T> &data, ac_channel<nnet::array<ac_int<2, true>, CONFIG_T::n_in>> &res,
-                            typename data_T::value_type threshold_hi[CONFIG_T::n_in],
-                            typename data_T::value_type threshold_lo[CONFIG_T::n_in]) {
+void normalize_ternary_tanh(
+    ac_channel<data_T> &data,
+    ac_channel<nnet::array<ac_int<2, true>, CONFIG_T::n_in>> &res,
+    typename data_T::value_type threshold_hi[CONFIG_T::n_in],
+    typename data_T::value_type threshold_lo[CONFIG_T::n_in]
+) {
     //#pragma HLS ARRAY_PARTITION variable=threshold_hi complete
     //#pragma HLS ARRAY_PARTITION variable=threshold_lo complete

17 changes: 10 additions & 7 deletions hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h
100755 → 100644
@@ -8,19 +8,22 @@ namespace nnet {

 template <class data_T, typename CONFIG_T> class FillConv1DBuffer {
   public:
-    static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
-                            data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan],
-                            const unsigned partition) {
+    static void fill_buffer(
+        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+        data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan],
+        const unsigned partition
+    ) {
         // To be implemented in subclasses
     }
 };

 template <class data_T, typename CONFIG_T> class FillConv2DBuffer {
   public:
-    static void
-    fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
-                data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan],
-                const unsigned partition) {
+    static void fill_buffer(
+        data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
+        data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan],
+        const unsigned partition
+    ) {
         // To be implemented in subclasses
     }
 };
19 changes: 12 additions & 7 deletions hls4ml/templates/catapult/nnet_utils/nnet_conv1d.h
100755 → 100644
@@ -33,9 +33,12 @@ struct conv1d_config {
 };

 template <class data_T, class res_T, typename CONFIG_T>
-void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
-                typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
-                typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+void conv_1d_cl(
+    data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+    typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]
+) {
     if (CONFIG_T::strategy == nnet::latency) {
         conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     } else {
@@ -44,10 +47,12 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO
 }

 template <class data_T, class res_T, typename CONFIG_T>
-void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
-                          res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
-                          typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
-                          typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+void pointwise_conv_1d_cl(
+    data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+    typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]
+) {
     assert(CONFIG_T::filt_width == 1);

     if (CONFIG_T::strategy == nnet::latency) {
24 changes: 14 additions & 10 deletions hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h
100755 → 100644
@@ -9,8 +9,8 @@ namespace nnet {
 // Computes multiplier limit
 // This function should not be synthesized into firmware
 template <typename CONFIG_T>
-int compute_multiplier_limit(
-    typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt]) {
+int compute_multiplier_limit(typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt]
+) {
     int n_mult = 0;
     for (int ii = 0; ii < CONFIG_T::out_width; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
@@ -39,10 +39,12 @@ int compute_multiplier_limit(
 } // end compute_n_mult

 template <class data_T, class res_T, typename CONFIG_T>
-void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
-                        res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
-                        typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
-                        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+void conv_1d_latency_cl(
+    data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+    typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]
+) {

     typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_width];
     typename CONFIG_T::accum_t acc[CONFIG_T::out_width][CONFIG_T::n_filt];
@@ -121,10 +123,12 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
 }

 template <class data_T, class res_T, typename CONFIG_T>
-void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
-                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
-                                  typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
-                                  typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+void pointwise_conv_1d_latency_cl(
+    data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+    typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]
+) {
     assert(CONFIG_T::filt_width == 1);

     typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan];
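To reproduce the formatting locally, a file can be reformatted in place with `clang-format -i --style=file <file>`, run from inside the repository so the tool picks up this .clang-format. Note that `AlignAfterOpenBracket: BlockIndent` requires a reasonably recent clang-format (to our knowledge, version 14 or newer).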