Add DSP and MVE support to transpose conv int8 (ARM-software#103)
* Adds new support functions to read and pad two int8 values
* Adds new support functions that combine read-and-pad with an addition
* Adds DSP optimizations for arm_nn_mat_mult_nt_t_s8_s32
* Adds MVE optimizations for arm_nn_mat_mult_nt_t_s8_s32
* Adds MVE requantization to arm_transpose_conv_s8
* Adds a new unit test

Signed-off-by: Ryan O'Shea <[email protected]>
ArmRyan authored Feb 12, 2024
1 parent 2a999a2 commit 9eacdff
Showing 16 changed files with 719 additions and 42 deletions.
1 change: 1 addition & 0 deletions ARM.CMSIS-NN.pdsc
@@ -67,6 +67,7 @@
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_transpose_conv_get_buffer_sizes_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_transpose_conv_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c"/>
<file category="source" name="Source/ConcatenationFunctions/arm_concatenation_s8_x.c"/>
42 changes: 40 additions & 2 deletions Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 19 January 2024
* $Revision: V.18.0.0
* $Date: 31 January 2024
* $Revision: V.18.1.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
@@ -920,6 +920,44 @@ __STATIC_FORCEINLINE const int8_t *read_and_pad(const int8_t *source, int32_t *o
return source;
}

/**
* @brief read and expand one s8 word into two s16 words with ordering and addition.
*/
__STATIC_FORCEINLINE void read_pad_and_add_s8(const int8_t *source, int32_t *out1, int32_t *out2, const uint32_t add)
{
int32_t inA = arm_nn_read_s8x4(source);
int32_t inAbuf1 = SXTAB16_RORn(add, (uint32_t)inA, 8);
int32_t inAbuf2 = SXTAB16(add, inA);

#ifndef ARM_MATH_BIG_ENDIAN
*out2 = (int32_t)(PKHTB(inAbuf1, inAbuf2, 16));
*out1 = (int32_t)(PKHBT(inAbuf2, inAbuf1, 16));
#else
*out1 = (int32_t)(PKHTB(inAbuf1, inAbuf2, 16));
*out2 = (int32_t)(PKHBT(inAbuf2, inAbuf1, 16));
#endif
}
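
For reference, a plain-C sketch of what read_pad_and_add_s8 computes on a little-endian target follows. It assumes the add argument packs the same 16-bit offset into both halfwords (typically built with PKHBT(offset, offset, 16)); the helper name and code below are illustrative only and are not part of this commit.

/* Illustrative model of read_pad_and_add_s8 (little-endian), assuming
 * 'add' holds the same 16-bit offset in both halfwords. */
static void read_pad_and_add_s8_model(const int8_t *source, int32_t *out1, int32_t *out2, int16_t offset)
{
    const int16_t h0 = (int16_t)(source[0] + offset);
    const int16_t h1 = (int16_t)(source[1] + offset);
    const int16_t h2 = (int16_t)(source[2] + offset);
    const int16_t h3 = (int16_t)(source[3] + offset);

    /* Elements 0 and 1 are packed into out1, elements 2 and 3 into out2,
     * each as two s16 lanes of one 32-bit word. */
    *out1 = (int32_t)((uint32_t)(uint16_t)h0 | ((uint32_t)(uint16_t)h1 << 16));
    *out2 = (int32_t)((uint32_t)(uint16_t)h2 | ((uint32_t)(uint16_t)h3 << 16));
}

This two-lane s16 packing matches the operand format consumed by dual 16-bit multiply-accumulate intrinsics such as SMLAD.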

/**
* @brief read and expand two bytes into one word with ordering.
*/
__STATIC_FORCEINLINE void read_and_pad_s8x2(const int8_t *source, int32_t *out)
{
int16_t in = arm_nn_read_s8x2(source);
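/* Spread the two bytes across the two halfwords (0x0000BBAA -> 0x00BB00AA)
 * so that SXTB16 can sign-extend both of them in a single step. */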
int32_t inA = (in & 0x00FF) | ((in & 0xFF00) << 8);
*out = SXTB16(inA);
}

/**
* @brief read and expand two bytes into one word with ordering and addition.
*/
__STATIC_FORCEINLINE void read_pad_and_add_s8x2(const int8_t *source, int32_t *out, const uint32_t add)
{
int16_t in = arm_nn_read_s8x2(source);
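/* Same byte spread as in read_and_pad_s8x2; SXTAB16 then sign-extends each
 * byte and adds the corresponding halfword of 'add' to it. */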
int32_t inA = (in & 0x00FF) | ((in & 0xFF00) << 8);
*out = SXTAB16(add, inA);
}

/**
* @brief read and expand one s8 word into two s16 words with no additional ordering.
*/
4 changes: 2 additions & 2 deletions README.md
@@ -24,10 +24,10 @@ Processors with Arm Helium Technology use the Arm M-profile Vector Extension(MVE
Examples are Cortex-M55 or Cortex-M85 configured with MVE.

| Operator | C <br> int8 | C<br>int16 | C<br>int4* | DSP<br>int8 | DSP<br>int16 | DSP<br>int4* | MVE<br>int8 | MVE<br>int16 |
| --------------- | ----------- | ---------- |------------| ------------| -------------|--------------| ------------| -------------|
| --------------- | ----------- | ---------- |------------|-------------| -------------|--------------|-------------| -------------|
| Conv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| DepthwiseConv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| TransposeConv2D | Yes | No | No | No | No | No | No | No |
| TransposeConv2D | Yes | No | No | Yes | No | No | Yes | No |
| Fully Connected | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| Add | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| Mul | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
34 changes: 26 additions & 8 deletions Source/ConvolutionFunctions/arm_transpose_conv_s8.c
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates <[email protected]>
* SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <[email protected]>
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -19,10 +19,10 @@
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_transpose_conv_s8.c
* Description: s8 version of convolution using symmetric quantization.
* Description: s8 version of transpose convolution using symmetric quantization.
*
* $Date: 5 October 2023
* $Revision: V.1.0.0
* $Date: 31 January 2024
* $Revision: V.1.1.0
*
* Target : Arm(R) M-Profile Architecture
*
@@ -172,11 +172,30 @@ arm_cmsis_nn_status arm_transpose_conv_s8(const cmsis_nn_context *ctx,
}
}
}

img_data = img_buf_ptr;
for (int i = 0; i < output_x * output_y; i++)
{
for (int i_output_ch = 0; i_output_ch < output_ch; i_output_ch++)
#if defined(ARM_MATH_MVEI)
int output_ch_idx = 0;
int8_t *ip_out_data = output_data_ptr;
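/* Requantize four output channels per iteration using the per-channel
 * multipliers and shifts. vctp32q() builds a tail predicate so the final
 * iteration copes with an output_ch that is not a multiple of 4, and
 * vstrbq_p_s32() narrows each clamped 32-bit lane to its low byte on store. */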
for (int32_t i_channel_rmdr = output_ch; i_channel_rmdr > 0; i_channel_rmdr -= 4)
{
mve_pred16_t p = vctp32q((uint32_t)i_channel_rmdr);
int32x4_t result = vldrwq_z_s32(&img_data[output_ch_idx], p);
result = arm_requantize_mve_32x4(result,
vldrwq_z_s32(&output_multiplier[output_ch_idx], p),
vldrwq_z_s32(&output_shift[output_ch_idx], p));
result = vaddq_n_s32(result, out_offset);
result = vmaxq_s32(result, vdupq_n_s32(activation_min));
result = vminq_s32(result, vdupq_n_s32(activation_max));
vstrbq_p_s32(ip_out_data, result, p);
ip_out_data += 4;
output_ch_idx += 4;
}
output_data_ptr += output_ch;
#else
int i_output_ch = 0;
for (; i_output_ch < output_ch; i_output_ch++)
{
int32_t result =
arm_nn_requantize(img_data[i_output_ch], output_multiplier[i_output_ch], output_shift[i_output_ch]);
@@ -185,13 +204,12 @@ arm_cmsis_nn_status arm_transpose_conv_s8(const cmsis_nn_context *ctx,
result = MIN(result, activation_max);
*output_data_ptr++ = (int8_t)result;
}
#endif
img_data += output_ch;
}

input_data_ptr += (input_size * input_ch);
batch_cnt--;
}

/* Return to application */
return ARM_CMSIS_NN_SUCCESS;
}