From 81beffff243b4bf4bc58515ad3d97f99cf883906 Mon Sep 17 00:00:00 2001 From: Rushi-cad Date: Mon, 21 Oct 2024 10:08:36 -0700 Subject: [PATCH] Adding atan2 operator kernel optimization --- backends/cadence/aot/functions_hifi.yaml | 5 + backends/cadence/hifi/kernels/CMakeLists.txt | 1 + backends/cadence/hifi/kernels/kernels.h | 6 + .../cadence/hifi/operators/CMakeLists.txt | 1 + backends/cadence/hifi/operators/op_atan2.cpp | 211 ++++++++++++ .../third-party/nnlib/xa_nn_broadcast_32.c | 313 ++++++++++++++++++ 6 files changed, 537 insertions(+) create mode 100644 backends/cadence/hifi/operators/op_atan2.cpp create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 582bf178bf..40dc9f60f4 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -26,6 +26,11 @@ kernels: - arg_meta: null kernel_name: impl::HiFi::add_out + +- op: atan2.out + kernels: + - arg_meta: null + kernel_name: impl::HiFi::atan2_out - op: bmm.out kernels: diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 90cd814e1e..5c610c15b4 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -9,6 +9,7 @@ add_library( cadence_kernels kernels.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 7601d96944..bc7aa70185 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -16,6 +16,12 @@ #include "xa_nnlib_kernels_api.h" /* Potential NNLIB function/APIs */ +extern "C" WORD32 xa_nn_broadcast_32_32( WORD32* __restrict__ p_out, + const int *const out_shape, + WORD32* __restrict__ p_in, + const int * const in_shape, + int num_dims); + extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, const WORD32 *const p_out_shape, const FLOAT32 * __restrict__ p_inp1, diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 0bd117771f..c93dfca59a 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -21,6 +21,7 @@ endif() # ATen compliant ops that are needed to run this model. set(_aten_ops__srcs "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_atan2.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp" diff --git a/backends/cadence/hifi/operators/op_atan2.cpp b/backends/cadence/hifi/operators/op_atan2.cpp new file mode 100644 index 0000000000..6d804fd230 --- /dev/null +++ b/backends/cadence/hifi/operators/op_atan2.cpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::runtime::KernelRuntimeContext; +using torch::executor::Error; + +namespace impl { +namespace HiFi { +namespace native { + +Tensor& atan2_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + constexpr auto name = "atan2.out"; + constexpr int kNnlibMaxDim = 16; + int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); + bool optimized = 1; + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? out.dim() : max_dim; + + if (out_type != ScalarType::Float) + optimized = 0; + + if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) + optimized = 0; + + WORD32 num_elm = out.numel(); + + if (!optimized) { + if (broadcast) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + WORD32* __restrict__ ptr2 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)a.const_data_ptr(); + WORD32* __restrict__ pin2 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[max_dim]; + WORD32 p_inp1_shape[max_dim]; + WORD32 p_inp2_shape[max_dim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = max_dim - out_dim; + int off_a = max_dim - a_dim; + int off_b = max_dim - b_dim; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i + off_o] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i + off_a] = a.size(i); + for (int i = 0; i < b_dim; i++) + p_inp2_shape[i + off_b] = b.size(i); + + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + xa_nn_broadcast_32_32(ptr2, p_out_shape, pin2, p_inp2_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr2; + + vecatan2f(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + free(ptr2); + } else if (a_is_broadcasted && (!b_is_broadcasted)) { + FLOAT32* __restrict__ ptr1 = + (FLOAT32* __restrict__)malloc(num_elm * sizeof(WORD32)); + + FLOAT32* __restrict__ pin1 = + (FLOAT32* __restrict__)a.const_data_ptr(); + + WORD32 p_out_shape[max_dim]; + WORD32 p_inp1_shape[max_dim]; + + for (int i = 0; i < max_dim; i++) { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = max_dim - out_dim; + int off_a = max_dim - a_dim; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i + off_o] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i + off_a] = a.size(i); + + xa_nn_broadcast_32_32( + (WORD32*)ptr1, p_out_shape, (WORD32*)pin1, p_inp1_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + vecatan2f(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else if (b_is_broadcasted && (!a_is_broadcasted)) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[max_dim]; + WORD32 p_inp1_shape[max_dim]; + + for (int i = 0; i < max_dim; i++) { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = max_dim - out_dim; + int off_b = max_dim - b_dim; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i + off_o] = out.size(i); + for (int i = 0; i < b_dim; i++) + p_inp1_shape[i + off_b] = b.size(i); + + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr1; + + vecatan2f(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else { + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + vecatan2f(p_out, p_inp1, p_inp2, num_elm); + } + return out; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_FLOATH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + torch::executor:: + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_OUT casted_a = static_cast(val_a); + CTYPE_OUT casted_b = static_cast(val_b); + return static_cast(std::atan2(casted_a, casted_b)); + }, + a, + b, + out); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c new file mode 100644 index 0000000000..cad3f1a25b --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c @@ -0,0 +1,313 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ +/* + * xa_nn_broadcast_8_8.c + */ + +#include "xa_nnlib_common.h" +//#include "xa_nn_basic_state.h" + +#include +#include + +#include "stdio.h" + +/* + * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c + */ + +#define NUMDIMS_MAX 8 + +typedef struct bcast_expansion_struct_{ + size_t load_num_elem; + int replicate_loadedElm_times; + int repeat_operation; +} bcast_expansion_rule ; + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src); + +void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) +{ + char *dest = (char *)dest1; + char *src = (char *)src1; + int n = (int)n1; + ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; + int i; + void *orig_dest = dest; + + if (n < 32) { + return memcpy(dest, src, n); + } + + if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned + s_align_addr = (ae_int16x4 *) src; + d_align_addr = (ae_int16x4 *) dest; + for (i=0; i>3; i++) { + d_align_addr[i] = s_align_addr[i]; + } + + for (i=(n&~7); i>3; i++) { + AE_LA16X4_IP(t, s_align, s_align_addr); + AE_LA16X4_IP(t2, s_align, s_align_addr); + AE_SA16X4_IP(t, d_align, d_align_addr); + AE_SA16X4_IP(t2, d_align, d_align_addr); + } + AE_SA64POS_FP(d_align, d_align_addr); + ae_int16 *s_src = (ae_int16 *) src; + ae_int16 *s_dest = (ae_int16 *) dest; + for (i=8*i; i8, -1); + + int i = 0; + + /* Check for valid IO shapes */ + for(i=0; i=0){ + + /* Find the sub-matrix size */ + while(in_shape[dim] != 1 && dim>=0){ + num_elem_load *= out_shape[dim]; + dim--; + } + + /* Find the number of times this sub-matrix needs to be copied */ + num_copy_times = 1; + while(in_shape[dim] == 1 && dim>=0){ + num_copy_times *= out_shape[dim]; + dim--; + } + + /* Find the number of times the above copy needs to be repeated */ + num_repeat = 1; + while(in_shape[dim] != 1 && dim>=0){ + num_repeat *= 1 * out_shape[dim]; + dim--; + } + + bcast_expansion_steps[k].load_num_elem = num_elem_load; + bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; + bcast_expansion_steps[k].repeat_operation = num_repeat; + k++; + + num_elem_load = num_elem_load * num_copy_times * num_repeat; + } + + res = broadcast_node_32(bcast_expansion_steps, num_dims-1, + p_out, p_in); + (void)res; /* Unused return value */ + + return 0; +} + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src) { + int step_itr=0, rep_itr=0; + int i=0, j=0, k=0; + bcast_expansion_rule *step = NULL; + + // ignore steps that are null + while(steps[step_id].repeat_operation == 0 && step_id>0){ + step_id--; + } + + // step is now the parent node for this iteration + step = &steps[step_id]; + size_t numLoadedElm = step->load_num_elem; + + WORD32 *cp_dst = dst; + WORD32 *cp_src = src; + WORD32 *cp_src_temp=NULL; + WORD32 *cp_dst_temp=NULL; + + if(numLoadedElm>32){ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; jrepeat_operation; j++){ + for(i=0; ireplicate_loadedElm_times; i++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } + else{ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + for(k=0; k<(int)numLoadedElm; k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + } + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; j < step->repeat_operation; j++){ + for(i=0; i < step->replicate_loadedElm_times; i++){ + for(k=0; k<(int)(numLoadedElm); k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + + } + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } +}