Add DSP and MVE support to transpose conv int8 (ARM-software#103)
* Adds new support functions to read and pad two int8 values
* Adds new support functions that combine read-and-pad with an addition
* Adds DSP optimizations for arm_nn_mat_mult_nt_t_s8_s32
* Adds MVE optimizations for arm_nn_mat_mult_nt_t_s8_s32
* Adds MVE requantization to arm_transpose_conv_s8
* Adds a new unit test

Signed-off-by: Ryan O'Shea <[email protected]>
ArmRyan authored Feb 12, 2024
1 parent 2a999a2 commit 9eacdff
Showing 16 changed files with 719 additions and 42 deletions.
1 change: 1 addition & 0 deletions ARM.CMSIS-NN.pdsc
@@ -67,6 +67,7 @@
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_transpose_conv_get_buffer_sizes_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_transpose_conv_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c"/>
<file category="source" name="Source/ConcatenationFunctions/arm_concatenation_s8_x.c"/>
42 changes: 40 additions & 2 deletions Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 19 January 2024
* $Revision: V.18.0.0
* $Date: 31 January 2024
* $Revision: V.18.1.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
@@ -920,6 +920,44 @@ __STATIC_FORCEINLINE const int8_t *read_and_pad(const int8_t *source, int32_t *o
return source;
}

/**
* @brief read and expand one s8 word into two s16 words with ordering and addition.
*/
__STATIC_FORCEINLINE void read_pad_and_add_s8(const int8_t *source, int32_t *out1, int32_t *out2, const uint32_t add)
{
int32_t inA = arm_nn_read_s8x4(source);
int32_t inAbuf1 = SXTAB16_RORn(add, (uint32_t)inA, 8);
int32_t inAbuf2 = SXTAB16(add, inA);

#ifndef ARM_MATH_BIG_ENDIAN
*out2 = (int32_t)(PKHTB(inAbuf1, inAbuf2, 16));
*out1 = (int32_t)(PKHBT(inAbuf2, inAbuf1, 16));
#else
*out1 = (int32_t)(PKHTB(inAbuf1, inAbuf2, 16));
*out2 = (int32_t)(PKHBT(inAbuf2, inAbuf1, 16));
#endif
}
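
For reference, a plain-C sketch of what read_pad_and_add_s8 computes on a little-endian target follows. It assumes the add argument packs the same 16-bit offset into both halfwords (typically built with PKHBT(offset, offset, 16)); the helper name and code below are illustrative only and are not part of this commit.

/* Illustrative model of read_pad_and_add_s8 (little-endian), assuming
 * 'add' holds the same 16-bit offset in both halfwords. */
static void read_pad_and_add_s8_model(const int8_t *source, int32_t *out1, int32_t *out2, int16_t offset)
{
    const int16_t h0 = (int16_t)(source[0] + offset);
    const int16_t h1 = (int16_t)(source[1] + offset);
    const int16_t h2 = (int16_t)(source[2] + offset);
    const int16_t h3 = (int16_t)(source[3] + offset);

    /* Elements 0 and 1 are packed into out1, elements 2 and 3 into out2,
     * each as two s16 lanes of one 32-bit word. */
    *out1 = (int32_t)((uint32_t)(uint16_t)h0 | ((uint32_t)(uint16_t)h1 << 16));
    *out2 = (int32_t)((uint32_t)(uint16_t)h2 | ((uint32_t)(uint16_t)h3 << 16));
}

This two-lane s16 packing matches the operand format consumed by dual 16-bit multiply-accumulate intrinsics such as SMLAD.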

/**
* @brief read and expand two bytes into one word with ordering.
*/
__STATIC_FORCEINLINE void read_and_pad_s8x2(const int8_t *source, int32_t *out)
{
int16_t in = arm_nn_read_s8x2(source);
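/* Spread the two bytes across the two halfwords (0x0000BBAA -> 0x00BB00AA)
 * so that SXTB16 can sign-extend both of them in a single step. */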
int32_t inA = (in & 0x00FF) | ((in & 0xFF00) << 8);
*out = SXTB16(inA);
}

/**
* @brief read and expand two bytes into one word with ordering and addition.
*/
__STATIC_FORCEINLINE void read_pad_and_add_s8x2(const int8_t *source, int32_t *out, const uint32_t add)
{
int16_t in = arm_nn_read_s8x2(source);
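/* Same byte spread as in read_and_pad_s8x2; SXTAB16 then sign-extends each
 * byte and adds the corresponding halfword of 'add' to it. */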
int32_t inA = (in & 0x00FF) | ((in & 0xFF00) << 8);
*out = SXTAB16(add, inA);
}

/**
* @brief read and expand one s8 word into two s16 words with no additional ordering.
*/
4 changes: 2 additions & 2 deletions README.md
@@ -24,10 +24,10 @@ Processors with Arm Helium Technology use the Arm M-profile Vector Extension(MVE
Examples are Cortex-M55 or Cortex-M85 configured with MVE.

| Operator | C <br> int8 | C<br>int16 | C<br>int4* | DSP<br>int8 | DSP<br>int16 | DSP<br>int4* | MVE<br>int8 | MVE<br>int16 |
| --------------- | ----------- | ---------- |------------| ------------| -------------|--------------| ------------| -------------|
| --------------- | ----------- | ---------- |------------|-------------| -------------|--------------|-------------| -------------|
| Conv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| DepthwiseConv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| TransposeConv2D | Yes | No | No | No | No | No | No | No |
| TransposeConv2D | Yes | No | No | Yes | No | No | Yes | No |
| Fully Connected | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| Add | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| Mul | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
34 changes: 26 additions & 8 deletions Source/ConvolutionFunctions/arm_transpose_conv_s8.c
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates <[email protected]>
* SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <[email protected]>
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -19,10 +19,10 @@
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_transpose_conv_s8.c
* Description: s8 version of convolution using symmetric quantization.
* Description: s8 version of transpose convolution using symmetric quantization.
*
* $Date: 5 October 2023
* $Revision: V.1.0.0
* $Date: 31 January 2024
* $Revision: V.1.1.0
*
* Target : Arm(R) M-Profile Architecture
*
@@ -172,11 +172,30 @@ arm_cmsis_nn_status arm_transpose_conv_s8(const cmsis_nn_context *ctx,
}
}
}

img_data = img_buf_ptr;
for (int i = 0; i < output_x * output_y; i++)
{
for (int i_output_ch = 0; i_output_ch < output_ch; i_output_ch++)
#if defined(ARM_MATH_MVEI)
int output_ch_idx = 0;
int8_t *ip_out_data = output_data_ptr;
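/* Requantize four output channels per iteration using the per-channel
 * multipliers and shifts. vctp32q() builds a tail predicate so the final
 * iteration copes with an output_ch that is not a multiple of 4, and
 * vstrbq_p_s32() narrows each clamped 32-bit lane to its low byte on store. */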
for (int32_t i_channel_rmdr = output_ch; i_channel_rmdr > 0; i_channel_rmdr -= 4)
{
mve_pred16_t p = vctp32q((uint32_t)i_channel_rmdr);
int32x4_t result = vldrwq_z_s32(&img_data[output_ch_idx], p);
result = arm_requantize_mve_32x4(result,
vldrwq_z_s32(&output_multiplier[output_ch_idx], p),
vldrwq_z_s32(&output_shift[output_ch_idx], p));
result = vaddq_n_s32(result, out_offset);
result = vmaxq_s32(result, vdupq_n_s32(activation_min));
result = vminq_s32(result, vdupq_n_s32(activation_max));
vstrbq_p_s32(ip_out_data, result, p);
ip_out_data += 4;
output_ch_idx += 4;
}
output_data_ptr += output_ch;
#else
int i_output_ch = 0;
for (; i_output_ch < output_ch; i_output_ch++)
{
int32_t result =
arm_nn_requantize(img_data[i_output_ch], output_multiplier[i_output_ch], output_shift[i_output_ch]);
@@ -185,13 +204,12 @@ arm_cmsis_nn_status arm_transpose_conv_s8(const cmsis_nn_context *ctx,
result = MIN(result, activation_max);
*output_data_ptr++ = (int8_t)result;
}
#endif
img_data += output_ch;
}

input_data_ptr += (input_size * input_ch);
batch_cnt--;
}

/* Return to application */
return ARM_CMSIS_NN_SUCCESS;
}