From db2afc1b39fb958a3bfa63b020f896afd028a057 Mon Sep 17 00:00:00 2001
From: dijopaul <87994875+dijopaul@users.noreply.github.com>
Date: Tue, 20 Aug 2024 12:19:44 +0530
Subject: [PATCH] Dijopaul optimization2 (#9)

* Adding sub, sigmoid, permute, view_copy ops
* Rushikesh code (#8)
* Adding operators NNLIB integrated operators
* Adding operators NNLIB integrated operators
* Adding concat32_32
* Adding concat32_32
* Code cleanup
* Delete backends/cadence/aot/functions.yaml
---------
Co-authored-by: Rushi-cad
---------
Co-authored-by: Rushi-cad
---
 backends/cadence/aot/functions.yaml | 136 --
 backends/cadence/aot/functions_hifi.yaml | 172 +-
 backends/cadence/hifi/kernels/CMakeLists.txt | 8 +
 backends/cadence/hifi/kernels/kernels.h | 112 +-
 .../cadence/hifi/operators/CMakeLists.txt | 28 +-
 backends/cadence/hifi/operators/op_atan2.cpp | 212 ++
 .../cadence/hifi/operators/op_bitwise_and.cpp | 310 +++
 .../cadence/hifi/operators/op_bitwise_or.cpp | 309 +++
 .../cadence/hifi/operators/op_bitwise_xor.cpp | 311 +++
 backends/cadence/hifi/operators/op_cat.cpp | 140 ++
 backends/cadence/hifi/operators/op_eq.cpp | 175 ++
 .../hifi/operators/op_floor_divide.cpp | 130 ++
 backends/cadence/hifi/operators/op_fmod.cpp | 262 +++
 backends/cadence/hifi/operators/op_ge.cpp | 177 ++
 backends/cadence/hifi/operators/op_gt.cpp | 177 ++
 backends/cadence/hifi/operators/op_le.cpp | 177 ++
 backends/cadence/hifi/operators/op_lt.cpp | 178 ++
 backends/cadence/hifi/operators/op_ne.cpp | 176 ++
 backends/cadence/hifi/operators/op_pow.cpp | 380 +++
 .../cadence/hifi/operators/op_remainder.cpp | 233 ++
 backends/cadence/hifi/operators/op_rsqrt.cpp | 7 +-
 backends/cadence/hifi/operators/op_where.cpp | 193 +-
 .../hifi/operators/quantized_conv_out.cpp | 479 +++-
 .../hifi/operators/quantized_matmul_out.cpp | 186 ++
 .../hifi/operators/quantized_relu_out.cpp | 2 +-
 .../third-party/nnlib/xa_nn_broadcast_32.c | 313 +++
 .../hifi/third-party/nnlib/xa_nn_concat_32.c | 172 ++
 .../nnlib/xa_nn_elm_fmod_broadcast_f32.c | 525 +++++
 .../nnlib/xa_nn_elm_logicalxor_bool_bool.c | 52 +
 .../nnlib/xa_nn_elm_remainder_broadcast_f32.c | 525 +++++
 .../nnlib/xa_nn_greater_lesser_equal_f32.c | 2029 +++++++++++++++++
 31 files changed, 8005 insertions(+), 281 deletions(-)
 delete mode 100644 backends/cadence/aot/functions.yaml
 create mode 100644 backends/cadence/hifi/operators/op_atan2.cpp
 create mode 100644 backends/cadence/hifi/operators/op_bitwise_and.cpp
 create mode 100644 backends/cadence/hifi/operators/op_bitwise_or.cpp
 create mode 100644 backends/cadence/hifi/operators/op_bitwise_xor.cpp
 create mode 100644 backends/cadence/hifi/operators/op_cat.cpp
 create mode 100644 backends/cadence/hifi/operators/op_eq.cpp
 create mode 100644 backends/cadence/hifi/operators/op_floor_divide.cpp
 create mode 100644 backends/cadence/hifi/operators/op_fmod.cpp
 create mode 100644 backends/cadence/hifi/operators/op_ge.cpp
 create mode 100644 backends/cadence/hifi/operators/op_gt.cpp
 create mode 100644 backends/cadence/hifi/operators/op_le.cpp
 create mode 100644 backends/cadence/hifi/operators/op_lt.cpp
 create mode 100644 backends/cadence/hifi/operators/op_ne.cpp
 create mode 100644 backends/cadence/hifi/operators/op_pow.cpp
 create mode 100644 backends/cadence/hifi/operators/op_remainder.cpp
 create mode 100644 backends/cadence/hifi/operators/quantized_matmul_out.cpp
 create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
 create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
 create mode 100644
backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml deleted file mode 100644 index f79d5f870d..0000000000 --- a/backends/cadence/aot/functions.yaml +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This yaml file contains operators that are also defined by the ATen library. -# For lean mode: -# - Codegen'd target `executorch_generated_lib` will be reading all the information -# from this file, including operator schema and kernel metadata. -# - Selective build target `codegen:executorch_defined_ops` now is selecting all the -# operators in this file, by dumping all the op names into `selected_operators.yaml`. -# -# See the README.md file in executorch/kernels/portable for a description of the syntax used -# by this file. - - -# aten ops -- op: _to_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::to_copy_out - -- op: _softmax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::softmax_out - -- op: add.out - kernels: - - arg_meta: null - kernel_name: torch::executor::add_out - -- op: bmm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::bmm_out - -- op: cat.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cat_out - -- op: clone.out - kernels: - - arg_meta: null - kernel_name: torch::executor::clone_out - -- op: div.out - kernels: - - arg_meta: null - kernel_name: torch::executor::div_out - -- op: div.out_mode - kernels: - - arg_meta: null - kernel_name: torch::executor::div_out_mode - -- op: embedding.out - kernels: - - arg_meta: null - kernel_name: torch::executor::embedding_out - -- op: full.out - kernels: - - arg_meta: null - kernel_name: torch::executor::full_out - -- op: mul.out - kernels: - - arg_meta: null - kernel_name: torch::executor::mul_out - -- op: permute_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::permute_copy_out - -- op: sigmoid.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sigmoid_out - -- op: slice_copy.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::slice_copy_Tensor_out - -- op: split_with_sizes_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::split_with_sizes_copy_out - -- op: sub.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sub_out - -- op: view_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::view_copy_out - -- op: where.self_out - kernels: - - arg_meta: null - kernel_name: torch::executor::where_out - -# custom ops -- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) - variants: function - kernels: - - arg_meta: null - kernel_name: impl::reference::quantize_per_tensor_out - -- func: cadence::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) 
- variants: function - kernels: - - arg_meta: null - kernel_name: impl::reference::dequantize_per_tensor_out - -- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::reference::quantized_conv_out - -- func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::reference::quantized_layer_norm_out - -- func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::reference::quantized_linear_out - -- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::reference::quantized_relu_out diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 84205afe32..47744f6592 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -56,6 +56,61 @@ kernels: - arg_meta: null kernel_name: torch::executor::div_out_mode + +- op: floor_divide.out + kernels: + - arg_meta: null + kernel_name: torch::executor::floor_divide_out + +- op: remainder.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::remainder_Tensor_out + +- op: remainder.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::remainder_Scalar_out + +- op: fmod.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::fmod_Tensor_out + +- op: fmod.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::fmod_Scalar_out + +- op: bitwise_and.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::bitwise_and_Scalar_out + +- op: bitwise_and.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::bitwise_and_Tensor_out + +- op: bitwise_or.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::bitwise_or_Scalar_out + +- op: bitwise_or.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::bitwise_or_Tensor_out + +- op: bitwise_xor.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::bitwise_xor_Scalar_out + +- op: bitwise_xor.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::bitwise_xor_Tensor_out - op: embedding.out kernels: @@ -82,6 +137,11 @@ - arg_meta: null kernel_name: torch::executor::mul_out +- op: mul.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::mul_scalar_out + - op: permute_copy.out kernels: - arg_meta: null @@ -117,10 +177,105 @@ - arg_meta: null kernel_name: torch::executor::where_out +- op: scalar_tensor.out + kernels: + - arg_meta: null + kernel_name: torch::executor::scalar_tensor_out + - op: rsqrt.out kernels: - arg_meta: null - kernel_name: torch::executor::rsqrt_out + kernel_name: torch::executor::rsqrt_out + +- op: ge.Scalar_out + kernels: + - arg_meta: null + kernel_name: 
torch::executor::ge_scalar_out + +- op: ge.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::ge_tensor_out + +- op: gt.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::gt_scalar_out + +- op: gt.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::gt_tensor_out + +- op: le.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::le_scalar_out + +- op: le.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::le_tensor_out + +- op: lt.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::lt_scalar_out + +- op: lt.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::lt_tensor_out + +- op: eq.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::eq_scalar_out + +- op: eq.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::eq_tensor_out + +- op: ne.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::ne_scalar_out + +- op: ne.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::ne_tensor_out + +- op: pow.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::pow_Scalar_out + +- op: pow.Tensor_Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::pow_Tensor_Scalar_out + +- op: pow.Tensor_Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::pow_Tensor_Tensor_out + +- op: atan2.out + kernels: + - arg_meta: null + kernel_name: torch::executor::atan2_out + +- op: empty.out + kernels: + - arg_meta: null + kernel_name: torch::executor::empty_out + +- op: gelu.out + kernels: + - arg_meta: null + kernel_name: torch::executor::gelu_out - op: empty.out kernels: @@ -154,6 +309,21 @@ kernels: - arg_meta: null kernel_name: impl::HiFi::quantized_linear_out + +- func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_matmul_out + +- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_relu_out + +- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv_out - func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor(a!) 
kernels: diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index afec8fb3e0..93709049db 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -18,6 +18,14 @@ add_library( ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_clamp_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_broadcast_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_floor_div_broadcast_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_floor_div_broadcast_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c ) target_include_directories( diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 7816cf6588..a579135e5d 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -15,12 +15,105 @@ /* For NNLIB APIs */ #include "xa_nnlib_kernels_api.h" -extern "C" WORD32 xa_nn_broadcast_32_32( WORD32* __restrict__ p_out, /* pointer to write broadcasted output data to */ - const int *const out_shape, /* output shape resulting after broadcast */ +/* new functions in nnlib */ +extern "C" WORD32 xa_nn_elm_where_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + const unsigned char *__restrict__ p_condition, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 * __restrict__ p_inp2, + const WORD32 *const p_inp2_shape, + const unsigned char *__restrict__ p_condition, + const WORD32 *const p_condition_shape + ); + +extern "C" WORD32 xa_nn_elm_logicalxor_boolxbool_bool( + WORD8 * __restrict__ p_out, + const WORD8 * __restrict__ p_inp1, + const WORD8 * __restrict__ p_inp2, + WORD32 num_elm); - WORD32* __restrict__ p_in, /* pointer to unextended input data */ - const int * const in_shape, /* input shape */ - int num_dims); +extern "C" WORD32 xa_nn_elm_remainder_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_remainder_broadcast_4D_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 * __restrict__ p_inp2, + const WORD32 *const p_inp2_shape); + +extern "C" WORD32 xa_nn_elm_fmod_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_fmod_broadcast_4D_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp1, + 
const WORD32 *const p_inp1_shape, + const FLOAT32 * __restrict__ p_inp2, + const WORD32 *const p_inp2_shape); + +extern "C" WORD32 xa_nn_elm_floor_div_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_floor_div_broadcast_4D_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 * __restrict__ p_inp2, + const WORD32 *const p_inp2_shape); + +extern "C" WORD32 xa_nn_broadcast_32_32( + WORD32* __restrict__ p_out, + const int *const out_shape, + WORD32* __restrict__ p_in, + const int * const in_shape, + int num_dims); + +extern "C" WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32( + WORD8 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + WORD32 kernel_type); + +extern "C" WORD32 xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32( + WORD8 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 * __restrict__ p_inp2, + const WORD32 *const p_inp2_shape, + WORD32 kernel_type); + +extern "C" WORD32 xa_nn_concat_32_32( + WORD32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const WORD32 **pp_inps, + const WORD32 *const *pp_inps_shape, + WORD32 num_out_dims, + WORD32 num_inp, + WORD32 num_inp_dims, + WORD32 axis); extern "C" WORD32 xa_nn_elm_clamp_f32xf32xf32_f32(FLOAT32 * __restrict__ p_out, const FLOAT32 * __restrict__ p_inp, @@ -48,12 +141,6 @@ extern "C" WORD32 xa_nn_elm_minimum_f32xf32_f32(FLOAT32 * __restrict__ p_out, const FLOAT32 * __restrict__ p_inp2, WORD32 num_elm); -extern "C" WORD32 xa_nn_elm_where_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - const unsigned char *__restrict__ p_condition, - WORD32 num_elm); - extern "C" WORD32 xa_nn_elm_maximum_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, const WORD32 *const p_out_shape, const FLOAT32 * __restrict__ p_inp1, @@ -68,6 +155,7 @@ extern "C" WORD32 xa_nn_elm_minimum_broadcast_4D_f32xf32_f32(FLOAT32 * __restric const FLOAT32 * __restrict__ p_inp2, const WORD32 *const p_inp2_shape); + extern "C" WORD32 xa_nn_elm_floor_div_f32xf32_f32( FLOAT32 * __restrict__ p_out, const FLOAT32 * __restrict__ p_inp1, @@ -101,7 +189,7 @@ extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict_ const WORD32 *const p_condition_shape ); -/* new functions in nnlib */ + extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, const WORD32 *const p_out_shape, const FLOAT32 * __restrict__ p_inp1, diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 4056958e3d..d0a357f640 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -38,14 +38,31 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_cat.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp" 
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_permute_copy.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_floor_divide.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_fmod.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_lt.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_le.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_gt.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_ge.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_eq.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_ne.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_atan2.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_remainder.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bitwise_and.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bitwise_or.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bitwise_xor.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_scalar_tensor.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp" @@ -55,9 +72,10 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp") - + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp") + add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) @@ -73,8 +91,8 @@ target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/.. add_library( custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" - "quantized_conv_out.cpp" "quantized_relu_out.cpp") + "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" "quantized_matmul_out.cpp" "quantized_relu_out.cpp" "quantized_conv_out.cpp") + target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} ${_common_include_directories}) diff --git a/backends/cadence/hifi/operators/op_atan2.cpp b/backends/cadence/hifi/operators/op_atan2.cpp new file mode 100644 index 0000000000..5524b02510 --- /dev/null +++ b/backends/cadence/hifi/operators/op_atan2.cpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#include"kernels.h" + +#define NNLIB_OPT 0 + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& +atan2_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + +#if NNLIB_OPT + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool both_is_broadcasted = (a_is_broadcasted && b_is_broadcasted); + + WORD32 num_elm = out.numel(); + + if(both_is_broadcasted) + { + WORD32* __restrict__ ptr1 = (WORD32* __restrict__ )malloc(num_elm * sizeof(WORD32)); + WORD32* __restrict__ ptr2 = (WORD32* __restrict__ )malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = (WORD32* __restrict__)a.const_data_ptr(); + WORD32* __restrict__ pin2 = (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_broadcast_32_32(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + out.dim()); + + val = xa_nn_broadcast_32_32(ptr2, + p_out_shape, + pin2, + p_inp2_shape, + out.dim()); + + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)ptr1; + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)ptr2; + + vecatan2f (p_out, + p_inp1, + p_inp2, + num_elm ); + + free(ptr1); + free(ptr2); + } + else if(a_is_broadcasted && (!b_is_broadcasted)) + { + FLOAT32* __restrict__ ptr1 = (FLOAT32* __restrict__ )malloc((num_elm + 2) * sizeof(WORD32)); + + FLOAT32* __restrict__ pin1 = (FLOAT32* __restrict__)a.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + + WORD32 val = xa_nn_broadcast_32_32((WORD32 *)ptr1, + p_out_shape, + (WORD32 *)pin1, + p_inp1_shape, + 4); + + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)ptr1; + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + vecatan2f (p_out, + p_inp1, + p_inp2, + num_elm); + + free(ptr1); + } + else if(b_is_broadcasted && (!a_is_broadcasted)) + { + WORD32* __restrict__ ptr1 = (WORD32* __restrict__ )malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + 
p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = b.size(i); + + WORD32 val = xa_nn_broadcast_32_32(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + out.dim()); + + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)ptr1; + + vecatan2f (p_out, + p_inp1, + p_inp2, + num_elm ); + + free(ptr1); + } + else + { + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + vecatan2f (p_out, + p_inp1, + p_inp2, + num_elm ); + } +#else + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REALHB_TYPES(a_type, ctx, "atan2.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "atan2.out", CTYPE_B, [&]() { + ET_SWITCH_FLOATH_TYPES(out_type, ctx, "atan2.out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_OUT casted_a = static_cast(val_a); + CTYPE_OUT casted_b = static_cast(val_b); + return static_cast(std::atan2(casted_a, casted_b)); + }, + a, + b, + out); + }); + }); + }); + +#endif + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_bitwise_and.cpp b/backends/cadence/hifi/operators/op_bitwise_and.cpp new file mode 100644 index 0000000000..bb71e8843b --- /dev/null +++ b/backends/cadence/hifi/operators/op_bitwise_and.cpp @@ -0,0 +1,310 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
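In the boolean fast path of the bitwise operators that follow, the tensors' bool storage is treated as raw WORD8 data and handed to the NNLIB logical kernels, with the element count always taken from the output tensor. Below is a minimal sketch of the matching-shape case only, assuming a, b and out already have identical sizes; the pointer names are illustrative and not taken from the patch.

// Sketch of the no-broadcast branch used by bitwise_and_Tensor_out below.
const WORD8* p_a = (const WORD8*)a.const_data_ptr();
const WORD8* p_b = (const WORD8*)b.const_data_ptr();
WORD8* p_o = (WORD8*)out.mutable_data_ptr();
xa_nn_elm_logicaland_boolxbool_bool(p_o, p_a, p_b, (WORD32)out.numel());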
+ */ + +// patternlint-disable-next-line executorch-cpp-nostdinc +#include + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +Tensor& bitwise_and_Tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + + if(common_type == ScalarType::Bool) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool both_broadcasted = a_is_broadcasted && b_is_broadcasted; + + if(both_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + WORD8* __restrict__ ptr2 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ pin1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ pin2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + xa_nn_broadcast_8_8( + ptr1, + p_out_shape, + pin1, + p_inp1_shape, + 4); + + xa_nn_broadcast_8_8( + ptr2, + p_out_shape, + pin2, + p_inp2_shape, + 4); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)ptr1; + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)ptr2; + + xa_nn_elm_logicaland_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else if(a_is_broadcasted && !b_is_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ pin1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + + xa_nn_broadcast_8_8(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + 4); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)ptr1; + + xa_nn_elm_logicaland_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else if(!a_is_broadcasted && b_is_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ pinp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * 
__restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + xa_nn_broadcast_8_8( + ptr1, + p_out_shape, + pinp2, + p_inp2_shape, + 4); + + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)ptr1; + + xa_nn_elm_logicaland_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else + { + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 num_elm = out.numel(); + + xa_nn_elm_logicaland_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + } + else + { + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_INT_TYPES_AND( + Bool, a_type, ctx, "bitwise_and.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_INT_TYPES_AND( + Bool, b_type, ctx, "bitwise_and.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES_AND( + Bool, + out_type, + ctx, + "bitwise_and.Tensor_out", + CTYPE_OUT, + [&]() { + internal::BitwiseOpInner< + can_cast::value, + std::bit_and, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + } + + return out; +} + +Tensor& bitwise_and_Scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_INT_TYPES_AND( + Bool, a_type, ctx, "bitwise_and.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_INTB_TYPES( + b_type, ctx, "bitwise_and.Scalar_out", CTYPE_B, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + ET_SWITCH_INT_TYPES_AND( + Bool, + common_type, + ctx, + "bitwise_and.Scalar_out", + CTYPE_IN, + [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, + out_type, + ctx, + "bitwise_and.Scalar_out", + CTYPE_OUT, + [&]() { + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = + static_cast(val_a); + CTYPE_IN b_casted = + static_cast(val_b); + CTYPE_IN value = std::bit_and()( + a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_bitwise_or.cpp b/backends/cadence/hifi/operators/op_bitwise_or.cpp new file mode 100644 index 0000000000..33e378a9b2 --- /dev/null +++ b/backends/cadence/hifi/operators/op_bitwise_or.cpp @@ 
-0,0 +1,309 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// patternlint-disable-next-line executorch-cpp-nostdinc +#include + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +Tensor& bitwise_or_Tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + + if(common_type == ScalarType::Bool) + { + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool both_broadcasted = a_is_broadcasted && b_is_broadcasted; + + if(both_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + WORD8* __restrict__ ptr2 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ pin1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ pin2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + xa_nn_broadcast_8_8( + ptr1, + p_out_shape, + pin1, + p_inp1_shape, + 4); + + xa_nn_broadcast_8_8( + ptr2, + p_out_shape, + pin2, + p_inp2_shape, + 4); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)ptr1; + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)ptr2; + + xa_nn_elm_logicalor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else if(a_is_broadcasted && !b_is_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ pin1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + + xa_nn_broadcast_8_8(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + 4); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)ptr1; + + xa_nn_elm_logicalor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else if(!a_is_broadcasted && b_is_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ 
)malloc(num_elm); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ pinp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + xa_nn_broadcast_8_8( + ptr1, + p_out_shape, + pinp2, + p_inp2_shape, + 4); + + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)ptr1; + + xa_nn_elm_logicalor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else + { + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 num_elm = out.numel(); + + xa_nn_elm_logicalor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + } + else + { + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_INT_TYPES_AND( + Bool, a_type, ctx, "bitwise_or.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_INT_TYPES_AND( + Bool, b_type, ctx, "bitwise_or.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES_AND( + Bool, + out_type, + ctx, + "bitwise_or.Tensor_out", + CTYPE_OUT, + [&]() { + internal::BitwiseOpInner< + can_cast::value, + std::bit_or, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + } + + return out; +} + +Tensor& bitwise_or_Scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_INT_TYPES_AND( + Bool, a_type, ctx, "bitwise_or.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_INTB_TYPES( + b_type, ctx, "bitwise_or.Scalar_out", CTYPE_B, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + ET_SWITCH_INT_TYPES_AND( + Bool, + common_type, + ctx, + "bitwise_or.Scalar_out", + CTYPE_IN, + [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, + out_type, + ctx, + "bitwise_or.Scalar_out", + CTYPE_OUT, + [&]() { + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = + static_cast(val_a); + CTYPE_IN b_casted = + static_cast(val_b); + CTYPE_IN value = + std::bit_or()(a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git 
a/backends/cadence/hifi/operators/op_bitwise_xor.cpp b/backends/cadence/hifi/operators/op_bitwise_xor.cpp new file mode 100644 index 0000000000..e051f0d0fd --- /dev/null +++ b/backends/cadence/hifi/operators/op_bitwise_xor.cpp @@ -0,0 +1,311 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// patternlint-disable-next-line executorch-cpp-nostdinc +#include + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +Tensor& bitwise_xor_Tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + + if(common_type == ScalarType::Bool) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool both_broadcasted = a_is_broadcasted && b_is_broadcasted; + + if(both_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + WORD8* __restrict__ ptr2 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ pin1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ pin2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + xa_nn_broadcast_8_8( + ptr1, + p_out_shape, + pin1, + p_inp1_shape, + 4); + + xa_nn_broadcast_8_8( + ptr2, + p_out_shape, + pin2, + p_inp2_shape, + 4); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)ptr1; + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)ptr2; + + xa_nn_elm_logicalxor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else if(a_is_broadcasted && !b_is_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ pin1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + + xa_nn_broadcast_8_8(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + 4); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)ptr1; + + 
xa_nn_elm_logicalxor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else if(!a_is_broadcasted && b_is_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ pinp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + xa_nn_broadcast_8_8( + ptr1, + p_out_shape, + pinp2, + p_inp2_shape, + 4); + + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)ptr1; + + xa_nn_elm_logicalxor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else + { + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 num_elm = out.numel(); + + xa_nn_elm_logicalxor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + } + else + { + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_INT_TYPES_AND( + Bool, a_type, ctx, "bitwise_xor.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_INT_TYPES_AND( + Bool, b_type, ctx, "bitwise_xor.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES_AND( + Bool, + out_type, + ctx, + "bitwise_xor.Tensor_out", + CTYPE_OUT, + [&]() { + internal::BitwiseOpInner< + can_cast::value, + std::bit_xor, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + } + + return out; +} + +Tensor& bitwise_xor_Scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_INT_TYPES_AND( + Bool, a_type, ctx, "bitwise_xor.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_INTB_TYPES( + b_type, ctx, "bitwise_xor.Scalar_out", CTYPE_B, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + ET_SWITCH_INT_TYPES_AND( + Bool, + common_type, + ctx, + "bitwise_xor.Scalar_out", + CTYPE_IN, + [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, + out_type, + ctx, + "bitwise_xor.Scalar_out", + CTYPE_OUT, + [&]() { + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = + static_cast(val_a); + CTYPE_IN b_casted = + static_cast(val_b); + CTYPE_IN value = std::bit_xor()( + a_casted, b_casted); + 
+ return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_cat.cpp b/backends/cadence/hifi/operators/op_cat.cpp new file mode 100644 index 0000000000..39304ce46b --- /dev/null +++ b/backends/cadence/hifi/operators/op_cat.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +Tensor& cat_out( + RuntimeContext& ctx, + exec_aten::ArrayRef tensors, + int64_t dim, + Tensor& out) { + + if(out.scalar_type() == ScalarType::Float){ + WORD32 num_inp = tensors.size(); + WORD32 num_inp_dims = out.dim(); + WORD32 num_out_dims = num_inp_dims; + WORD32 axis = dim; + + WORD32 inp_shape[16][16]; + WORD32 p_out_shape[16] = {0}; + + WORD32 *ptr_shape[16]; + const WORD32 *ptr[16]; + + int k = 0; + for(int i = 0; i < num_inp; i++) + { + if(tensors[i].numel() == 0) + continue; + ptr[k] = (const WORD32 *)tensors[i].const_data_ptr(); + for(int j = 0; j < num_inp_dims; j++) + { + inp_shape[k][j] = tensors[i].size(j); + } + ptr_shape[k] = inp_shape[k]; + k++; + } + + num_inp = k; + + for(int i = 0; i < num_out_dims; i++) + { + p_out_shape[i] = out.size(i); + } + + const WORD32 **pp_inps = &ptr[0]; + + WORD32 * p_out = (WORD32 *)out.mutable_data_ptr(); + + const WORD32 *const *pp_inps_shape = (const WORD32 *const *)&ptr_shape[0]; + + WORD32 val = xa_nn_concat_32_32(p_out + ,p_out_shape + ,pp_inps + ,pp_inps_shape + ,num_out_dims + ,num_inp + ,num_inp_dims + ,axis); + + return out; + } + else { + + if (dim < 0) { + dim += out.dim(); + } + + ET_KERNEL_CHECK(ctx, check_cat_args(tensors, dim, out), InvalidArgument, out); + + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + get_cat_out_target_size(tensors, dim, expected_out_size, &expected_out_dim); + ET_CHECK( + resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok); + + // Special handling when all inputs are 1D-empty tensors for aten consistency + // In that case, just return an 1D-empty tensor without checking dim + bool all_1d_empty = true; + for (size_t i = 0; i < tensors.size(); ++i) { + if (tensors[i].numel() != 0 || tensors[i].dim() != 1) { + all_1d_empty = false; + break; + } + } + if (all_1d_empty) { + return out; + } + + const size_t outer = getLeadingDims(out, dim); + const size_t dim_stride = getTrailingDims(out, dim); + const size_t ninputs = tensors.size(); + + const auto out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "cat", CTYPE_OUT, [&] { + CTYPE_OUT* out_ptr = out.mutable_data_ptr(); + for (size_t i = 0; i < outer; ++i) { + for (size_t j = 0; j < ninputs; ++j) { + const auto in_type = tensors[j].scalar_type(); + ET_SWITCH_REAL_TYPES_AND(Bool, in_type, ctx, "cat", CTYPE_IN, [&] { + if (tensors[j].numel() == 0) { + return; + } + size_t inner = tensors[j].size(dim) * dim_stride; + + const CTYPE_IN* const in_ptr = + tensors[j].const_data_ptr() + i * inner; + + for (size_t k = 0; k < inner; ++k) { + out_ptr[k] = static_cast(in_ptr[k]); + } + out_ptr += inner; + }); + } + } + }); + } + + return out; +} + +} // 
namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_eq.cpp b/backends/cadence/hifi/operators/op_eq.cpp new file mode 100644 index 0000000000..8b66dc85a3 --- /dev/null +++ b/backends/cadence/hifi/operators/op_eq.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& eq_tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape, + 4); + } + else + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm, + 4); + } + } + else + { + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "eq.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "eq.Scalar_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == + promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "eq.Scalar_out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted == b_casted; + return static_cast(value); + }, + a, + b, + out); + }); + }); + }); + } + + return out; +} + +Tensor& eq_scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // 
Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "eq.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "eq.Scalar_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "eq.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted == b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_floor_divide.cpp b/backends/cadence/hifi/operators/op_floor_divide.cpp new file mode 100644 index 0000000000..0514df0ca2 --- /dev/null +++ b/backends/cadence/hifi/operators/op_floor_divide.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FloorDivideInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FloorDivideInner { + static void + run(const Tensor& a, const Tensor& b, Tensor& out, bool& div_by_zero_error) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [&div_by_zero_error](const CTYPE_A val_a, const CTYPE_B val_b) { + if (is_integral_type::value) { + if (val_b == 0) { + div_by_zero_error = true; + return static_cast(0); + } + } + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = utils::floor_divide(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&, bool&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FloorDivideInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& floor_divide_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ET_KERNEL_CHECK(ctx, tensor_is_real_type(out), InvalidArgument, out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), 
InvalidArgument, out); + + auto div_by_zero_error = false; + + ET_SWITCH_REAL_TYPES_AND( + Bool, a_type, ctx, "floor_divide.out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "floor_divide.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES( + out_type, ctx, "floor_divide.out", CTYPE_OUT, [&]() { + FloorDivideInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out, div_by_zero_error); + }); + }); + }); + + ET_KERNEL_CHECK_MSG( + ctx, + !div_by_zero_error, + InvalidArgument, + out, + "Floor divide operation encountered integer division by zero"); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_fmod.cpp b/backends/cadence/hifi/operators/op_fmod.cpp new file mode 100644 index 0000000000..a665cda0e0 --- /dev/null +++ b/backends/cadence/hifi/operators/op_fmod.cpp @@ -0,0 +1,262 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FmodInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FmodInner { + static void + run(const Tensor& a, const Tensor& b, Tensor& out, bool& div_by_zero_error) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [&div_by_zero_error](const CTYPE_A val_a, const CTYPE_B val_b) { + if (is_integral_type::value) { + if (val_b == 0) { + div_by_zero_error = true; + return static_cast(0); + } + } + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::fmod(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&, bool&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FmodInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& fmod_Tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 
p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_fmod_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape); + } + else + { + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_fmod_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm); + } + } + else + { + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + auto div_by_zero_error = false; + + ET_SWITCH_REAL_TYPES_AND( + Bool, a_type, ctx, "fmod.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "fmod.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES( + out_type, ctx, "fmod.Tensor_out", CTYPE_OUT, [&]() { + FmodInner< + !std::is_same::value && + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out, div_by_zero_error); + }); + }); + }); + + ET_KERNEL_CHECK_MSG( + ctx, + !div_by_zero_error, + InvalidArgument, + out, + "Fmod operation encountered integer division by zero"); + } + + return out; +} + +Tensor& fmod_Scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + // Check for integer division by zero + if (isIntegralType(common_type, /*includeBool=*/true)) { + auto is_zero = false; + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "fmod.Scalar_out", CTYPE_B, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + is_zero = (val_b == 0); + }); + + ET_KERNEL_CHECK_MSG( + ctx, + !is_zero, + InvalidArgument, + out, + "Fmod operation encountered integer division by zero"); + } + + ET_SWITCH_REAL_TYPES_AND( + Bool, a_type, ctx, "fmod.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES( + b_type, ctx, "fmod.Scalar_out", CTYPE_B, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + ET_SWITCH_REAL_TYPES( + common_type, ctx, "fmod.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES( + out_type, ctx, "fmod.Scalar_out", CTYPE_OUT, [&]() { + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = + static_cast(val_a); + CTYPE_IN b_casted = + static_cast(val_b); + CTYPE_IN value = std::fmod(a_casted, b_casted); + + 
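+                              // Note: std::fmod keeps the sign of the dividend;
+                              // the result is cast to the output dtype below.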
return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_ge.cpp b/backends/cadence/hifi/operators/op_ge.cpp new file mode 100644 index 0000000000..f81b981442 --- /dev/null +++ b/backends/cadence/hifi/operators/op_ge.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& ge_tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape, + 0); + } + else + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm, + 0); + } + } + else + { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "ge.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "ge.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == + promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "ge.Tensor_out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted >= b_casted; + return 
static_cast(value); + }, + a, + b, + out); + }); + }); + }); + } + + return out; +} + +Tensor& ge_scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "ge.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "ge.Scalar_out", CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, common_type, ctx, "ge.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "ge.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted >= b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_gt.cpp b/backends/cadence/hifi/operators/op_gt.cpp new file mode 100644 index 0000000000..4f0d6aec32 --- /dev/null +++ b/backends/cadence/hifi/operators/op_gt.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& gt_tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape, + 1); + } + else + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm, + 1); + } + } + else + { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "gt.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "gt.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == + promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "gt.Tensor_out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted > b_casted; + return static_cast(value); + }, + a, + b, + out); + }); + }); + }); + } + + return out; +} + +Tensor& gt_scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "gt.Scalar_out", CTYPE_A, [&]() { + 
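+    // Nested dtype switches: resolve the scalar dtype, the promoted compute
+    // type and the output dtype, then extract the scalar once and compare
+    // every element of `a` against it in a single unary map.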
ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "gt.Scalar_out", CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, common_type, ctx, "gt.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "gt.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted > b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_le.cpp b/backends/cadence/hifi/operators/op_le.cpp new file mode 100644 index 0000000000..70673399bb --- /dev/null +++ b/backends/cadence/hifi/operators/op_le.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& le_tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape, + 2); + } + else + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm, + 2); + } + } + else + { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "le.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, 
ctx, "le.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == + promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "le.Tensor_out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted <= b_casted; + return static_cast(value); + }, + a, + b, + out); + }); + }); + }); + } + + return out; +} + +Tensor& le_scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "le.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "le.Scalar_out", CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, common_type, ctx, "le.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "le.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted <= b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_lt.cpp b/backends/cadence/hifi/operators/op_lt.cpp new file mode 100644 index 0000000000..315e145795 --- /dev/null +++ b/backends/cadence/hifi/operators/op_lt.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& lt_tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape, + 3); + } + else + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm, + 3); + } + } + else + { + + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "lt.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "lt.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == + promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "lt.Tensor_out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted < b_casted; + return static_cast(value); + }, + a, + b, + out); + }); + }); + }); + } + + return out; +} + +Tensor& lt_scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "lt.Scalar_out", CTYPE_A, [&]() { + 
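+    // Same dispatch pattern as the other scalar comparisons: promote, extract
+    // the scalar once, then map `a < b` over the input tensor.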
ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "lt.Scalar_out", CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, common_type, ctx, "lt.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "lt.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted < b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_ne.cpp b/backends/cadence/hifi/operators/op_ne.cpp new file mode 100644 index 0000000000..8a00211f6c --- /dev/null +++ b/backends/cadence/hifi/operators/op_ne.cpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& ne_tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape, + 5); + } + else + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm, + 5); + } + } + else + { + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "ne.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "ne.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = 
+ typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == + promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "ne.Tensor_out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted != b_casted; + return static_cast(value); + }, + a, + b, + out); + }); + }); + }); + } + + return out; +} + +Tensor& ne_scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "ne.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "ne.Scalar_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "ne.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted != b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp new file mode 100644 index 0000000000..fee28706f5 --- /dev/null +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -0,0 +1,380 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include"kernels.h" + +#define NNLIB_OPT 0 + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& pow_Tensor_Tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { +#if NNLIB_OPT + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool both_is_broadcasted = (a_is_broadcasted && b_is_broadcasted); + + WORD32 num_elm = out.numel(); + + if(both_is_broadcasted) + { + WORD32* __restrict__ ptr1 = (WORD32* __restrict__ )malloc(num_elm * sizeof(WORD32)); + WORD32* __restrict__ ptr2 = (WORD32* __restrict__ )malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = (WORD32* __restrict__)a.const_data_ptr(); + WORD32* __restrict__ pin2 = (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_broadcast_32_32(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + out.dim()); + + val = xa_nn_broadcast_32_32(ptr2, + p_out_shape, + pin2, + p_inp2_shape, + out.dim()); + + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)ptr1; + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)ptr2; + + vecpowf (p_out, + p_inp1, + p_inp2, + num_elm ); + + free(ptr1); + free(ptr2); + } + else if(a_is_broadcasted && (!b_is_broadcasted)) + { + FLOAT32* __restrict__ ptr1 = (FLOAT32* __restrict__ )malloc((num_elm + 2) * sizeof(WORD32)); + + FLOAT32* __restrict__ pin1 = (FLOAT32* __restrict__)a.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + + WORD32 val = xa_nn_broadcast_32_32((WORD32 
*)ptr1, + p_out_shape, + (WORD32 *)pin1, + p_inp1_shape, + 4); + + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)ptr1; + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + vecpowf (p_out, + p_inp1, + p_inp2, + num_elm); + + free(ptr1); + } + else if(b_is_broadcasted && (!a_is_broadcasted)) + { + WORD32* __restrict__ ptr1 = (WORD32* __restrict__ )malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = b.size(i); + + WORD32 val = xa_nn_broadcast_32_32(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + out.dim()); + + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)ptr1; + + vecpowf (p_out, + p_inp1, + p_inp2, + num_elm ); + + free(ptr1); + } + else + { + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + vecpowf (p_out, + p_inp1, + p_inp2, + num_elm ); + } +#else + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK( + ctx, common_type != exec_aten::ScalarType::Bool, InvalidArgument, out); + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_REALHB_TYPES(a_type, ctx, "pow.Tensor_Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES( + b_type, ctx, "pow.Tensor_Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALH_TYPES( + out_type, ctx, "pow.Tensor_Tensor_out", CTYPE_OUT, [&]() { + PowInner< + !std::is_same::value && + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); +#endif + return out; +} + +Tensor& pow_Tensor_Scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = + utils::promote_type_with_scalar(a_type, b, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, "pow.Tensor_Scalar_out", CTYPE_A, [&]() { 
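+    // Compute std::pow in the promoted type and store the result in the
+    // output dtype; the scalar exponent is extracted once per dispatch.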
+ ET_SWITCH_SCALAR_OBJ_TYPES( + b_type, ctx, "pow.Tensor_Scalar_out", CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES( + common_type, ctx, "pow.Tensor_Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES( + out_type, ctx, "pow.Tensor_Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +Tensor& pow_Scalar_out( + RuntimeContext& ctx, + const Scalar& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, b.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = utils::get_scalar_dtype(a); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = + utils::promote_type_with_scalar(b_type, a, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_SCALAR_OBJ_TYPES(a_type, ctx, "pow.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "pow.Scalar_out", CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, "pow.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES( + out_type, ctx, "pow.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_A val_a = 0; + utils::extract_scalar(a, &val_a); + + apply_unary_map_fn( + [val_a](const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + b.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_remainder.cpp b/backends/cadence/hifi/operators/op_remainder.cpp new file mode 100644 index 0000000000..c1fe48fc7d --- /dev/null +++ b/backends/cadence/hifi/operators/op_remainder.cpp @@ -0,0 +1,233 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct RemainderInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct RemainderInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = utils::remainder_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct RemainderInner + : public ReportCanCastBug {}; + +} // namespace +Tensor& remainder_Tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_remainder_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape); + } + else + { + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_remainder_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm); + } + } + else + { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_REAL_TYPES_AND( + Bool, a_type, ctx, "remainder.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, 
"remainder.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES( + out_type, ctx, "remainder.Tensor_out", CTYPE_OUT, [&]() { + RemainderInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + } + + return out; +} + +Tensor& remainder_Scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_REAL_TYPES_AND( + Bool, a_type, ctx, "remainder.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES( + b_type, ctx, "remainder.Scalar_out", CTYPE_B, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + ET_SWITCH_REAL_TYPES( + common_type, ctx, "remainder.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES( + out_type, + ctx, + "remainder.Scalar_out", + CTYPE_OUT, + [&]() { + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = + static_cast(val_a); + CTYPE_IN b_casted = + static_cast(val_b); + CTYPE_IN value = utils::remainder_override( + a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_rsqrt.cpp b/backends/cadence/hifi/operators/op_rsqrt.cpp index e5e60bf783..4cf295f98a 100644 --- a/backends/cadence/hifi/operators/op_rsqrt.cpp +++ b/backends/cadence/hifi/operators/op_rsqrt.cpp @@ -35,13 +35,10 @@ Tensor& rsqrt_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { p_out, p_inp, num_elm); - - return out; - } + return out; + } else - { return internal::unary_ufunc_realhb_to_floath(rsqrt, ctx, in, out); - } } diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp index 64c7fa42e2..d08f16a2b0 100644 --- a/backends/cadence/hifi/operators/op_where.cpp +++ b/backends/cadence/hifi/operators/op_where.cpp @@ -45,107 +45,122 @@ Tensor& where_out( "Unhandled dtype %s for where.self_out", torch::executor::toString(cond_type)); - /*logic to find broadcast*/ - const int a_is_broadcasted = !out.sizes().equals(a.sizes()); - const int b_is_broadcasted = !out.sizes().equals(b.sizes()); - const int cond_is_broadcasted = !out.sizes().equals(cond.sizes()); - const int broadcast = (a_is_broadcasted || b_is_broadcasted || cond_is_broadcasted); + /*logic to find broadcast*/ + const int a_is_broadcasted = !out.sizes().equals(a.sizes()); + const int b_is_broadcasted = !out.sizes().equals(b.sizes()); + const int cond_is_broadcasted = !out.sizes().equals(cond.sizes()); + const int broadcast = (a_is_broadcasted || b_is_broadcasted || cond_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = cond.dim() > max_dim ? cond.dim() : max_dim; + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + bool check = 0; + + for(int i = 0; i < max_dim; i++) + { + if(cond.size(i) > b.size(i)) + check = 1; - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = cond.dim() > max_dim ? cond.dim() : max_dim; - max_dim = out.dim() > max_dim ? out.dim() : max_dim; - bool fall_back = 0; - if((a_type != ScalarType::Float) || (b_type != ScalarType::Float) || (cond_type != ScalarType::Float)) - fall_back = 1; - if((broadcast == 1) && (max_dim > NNLIB_MAX_DIM)) - fall_back = 1; - -if(!fall_back) -{ - const float* a_data = a.const_data_ptr(); - const float* b_data = b.const_data_ptr(); - float* out_data = out.mutable_data_ptr(); - const unsigned char* con = cond.const_data_ptr(); - - if(broadcast == 1) + if(check == 1) + break; + } + + bool fall_back = 0; + if((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + fall_back = 1; + + if((broadcast == 1) && (max_dim > NNLIB_MAX_DIM)) + fall_back = 1; + + if((!fall_back) && (!check)) + { + const float* a_data = a.const_data_ptr(); + const float* b_data = b.const_data_ptr(); + float* out_data = out.mutable_data_ptr(); + const unsigned char* con = cond.const_data_ptr(); + + if(broadcast == 1) + { + int out_shape[NNLIB_MAX_DIM]; + int inp1_shape[NNLIB_MAX_DIM]; + int inp2_shape[NNLIB_MAX_DIM]; + int con_shape[NNLIB_MAX_DIM]; + + for(int i = 0; i < NNLIB_MAX_DIM; i++) { - int out_shape[NNLIB_MAX_DIM]; - int inp1_shape[NNLIB_MAX_DIM]; - int inp2_shape[NNLIB_MAX_DIM]; - int con_shape[NNLIB_MAX_DIM]; - - for(int i = 0; i < NNLIB_MAX_DIM; i++) - { - con_shape[i] = 1; - out_shape[i] = 1; - inp1_shape[i] = 1; - inp2_shape[i] = 1; - } - - int off_o = NNLIB_MAX_DIM - out.dim(); - int off_a = NNLIB_MAX_DIM - a.dim(); - int off_b = NNLIB_MAX_DIM - b.dim(); - int off_c = NNLIB_MAX_DIM - cond.dim(); - - for(int i = 0; i < out.dim(); i++){ - out_shape[i+off_o] = out.size(i);} - - for(int i = 0; i < a.dim(); i++) - inp1_shape[i+off_a] = a.size(i); - for(int i = 0; i < b.dim(); i++) - inp2_shape[i+off_b] = b.size(i); - for(int i = 0; i < cond.dim(); i++) - con_shape[i+off_c] = cond.size(i); - - /* Add fallback if broadcast and condition dimension are larger than inputs dimension, this code doesn't support that*/ - - if(con_shape[0] != out_shape[0] || con_shape[1] != out_shape[1] || con_shape[2] != out_shape[2] || con_shape[3] != out_shape[3]) - { - void* p_scratch = malloc(out_shape[0]*out_shape[1]*out_shape[2]*out_shape[3]); - const unsigned char *p_brd_cond = (const unsigned char*)p_scratch; - xa_nn_broadcast_8_8((WORD8* __restrict__) p_brd_cond, out_shape, (const WORD8* __restrict__) con, con_shape, 4); + con_shape[i] = 1; + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = NNLIB_MAX_DIM - out.dim(); + int off_a = NNLIB_MAX_DIM - a.dim(); + int off_b = NNLIB_MAX_DIM - b.dim(); + int off_c = NNLIB_MAX_DIM - cond.dim(); + + for(int i = 0; i < out.dim(); i++) + out_shape[i+off_o] = out.size(i); + + for(int i = 0; i < a.dim(); i++) + inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + inp2_shape[i+off_b] = b.size(i); + for(int i = 0; i < cond.dim(); i++) + con_shape[i+off_c] = cond.size(i); + + /* Add fallback if broadcast and condition dimension are larger than inputs dimension, this code doesn't support that*/ + + if(con_shape[0] != out_shape[0] || con_shape[1] != out_shape[1] || con_shape[2] != out_shape[2] || con_shape[3] != out_shape[3]) + { + void* p_scratch = malloc(out_shape[0]*out_shape[1]*out_shape[2]*out_shape[3]); + const unsigned char *p_brd_cond = (const unsigned 
char*)p_scratch; + xa_nn_broadcast_8_8((WORD8* __restrict__) p_brd_cond, out_shape, (const WORD8* __restrict__) con, con_shape, 4); + + for(int i = 0; i < 4; i++) + { + con_shape[i] = out_shape[i]; + } + xa_nn_elm_where_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, + b_data, inp2_shape, p_brd_cond, con_shape); + free(p_scratch); - for(int i = 0; i < 4; i++) - { - con_shape[i] = out_shape[i]; - } - xa_nn_elm_where_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, - b_data, inp2_shape, p_brd_cond, con_shape); - free(p_scratch); - } - else - { - xa_nn_elm_where_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape, con, con_shape); - } } else { - xa_nn_elm_where_f32xf32_f32(out_data, a_data, b_data, con, out.numel()); + xa_nn_elm_where_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape, con, con_shape); } -} -else -{ - ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { - using CTYPE_OUT = - typename torch::executor::promote_types::type; - apply_ternary_elementwise_fn( - [](const CTYPE_A val_a, const CTYPE_B val_b, const uint8_t val_c) { - CTYPE_OUT a_casted = static_cast(val_a); - CTYPE_OUT b_casted = static_cast(val_b); - return val_c ? a_casted : b_casted; - }, - a, - b, - cond, - out); + } + else + { + xa_nn_elm_where_f32xf32_f32(out_data, a_data, b_data, con, out.numel()); + } + } + else + { + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + using CTYPE_OUT = + typename torch::executor::promote_types::type; + apply_ternary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b, const uint8_t val_c) { + CTYPE_OUT a_casted = static_cast(val_a); + CTYPE_OUT b_casted = static_cast(val_b); + return val_c ? a_casted : b_casted; + }, + a, + b, + cond, + out); + }); }); - }); -} + } return out; } } // namespace native } // namespace executor } // namespace torch + diff --git a/backends/cadence/hifi/operators/quantized_conv_out.cpp b/backends/cadence/hifi/operators/quantized_conv_out.cpp index 8b4c37a215..6706374708 100644 --- a/backends/cadence/hifi/operators/quantized_conv_out.cpp +++ b/backends/cadence/hifi/operators/quantized_conv_out.cpp @@ -12,12 +12,18 @@ #include #include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x))+(bytes-1))&(~(bytes-1))) +#define NNLIB_OPT 0 + namespace impl { namespace HiFi { namespace native { using Tensor = exec_aten::Tensor; using RuntimeContext = torch::executor::RuntimeContext; +using ScalarType = exec_aten::ScalarType; + // This implements a generic 2d conv kernel that operates on raw pointers. // The version handles both quantized and fp32 convolutions. @@ -108,7 +114,11 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( int woff = _wh * ww + _ww; float lhs = in_plane[ioff] - in_zero_point; float rhs = weight_plane[woff] - - (quantized ? weight_zero_point[0] : 0); + + (quantized ? 0 : 0); + /*float rhs = weight_plane[woff] - + (quantized ? 
weight_zero_point[0] : 0);*/ + acc += lhs * rhs; } } @@ -122,13 +132,19 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( if (((_h + d0 * _wh - p0) >= 0) && ((_h + d0 * _wh - p0) < h) && ((_w + d1 * _ww - p1) >= 0) && - ((_w + d1 * _ww - p1 < w))) { + ((_w + d1 * _ww - p1) < w)) { + //((_w + d1 * _ww - p1 < w))) { + int ioff = (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1); int woff = _wh * ww + _ww; float lhs = in_plane[ioff] - in_zero_point; float rhs = weight_plane[woff] - - (quantized ? weight_zero_point[0] : 0); + + (quantized ? 0 : 0); + /*float rhs = weight_plane[woff] - + (quantized ? weight_zero_point[0] : 0);*/ + acc += lhs * rhs; } } @@ -174,6 +190,365 @@ void quantized_conv_out( bool channel_last, Tensor& out) { bool conv1d = input.dim() == 3; + +#if NNLIB_OPT + + if(input.scalar_type() == ScalarType::Char) + { + WORD8* __restrict__ p_out = (WORD8* __restrict__ )out.mutable_data_ptr(); + WORD8* __restrict__ p_inp = (WORD8* __restrict__ )input.const_data_ptr(); + WORD8* __restrict__ p_kernel = (WORD8* __restrict__ )weight.const_data_ptr(); + WORD32* __restrict__ p_bias = (WORD32* __restrict__ )bias.const_data_ptr(); + + WORD32 input_height = conv1d ? 1 : input.size(2); + WORD32 input_width = conv1d ? input.size(2) : input.size(3); + WORD32 input_channels = input.size(1); + WORD32 kernel_height = conv1d ? 1 : weight.size(2); + WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3); + WORD32 kernel_channels = weight.size(1); + WORD32 out_channels = weight.size(0); + WORD32 out_height = conv1d ? 1 : out.size(2); + WORD32 out_width = conv1d ? out.size(2) : out.size(3); + WORD32 batches = input.size(0); + + WORD32 x_stride = stride[1]; + WORD32 y_stride = stride[0]; + WORD32 x_padding = padding[1]; + WORD32 y_padding = padding[0]; + WORD32 dilation_width = 1; + WORD32 dilation_height = 1; + + WORD32 * kernel_bias_ptr = (WORD32 *)weight_zero_point.const_data_ptr(); + + WORD32 input_zero_bias = -in_zero_point; + WORD32 kernel_zero_bias = -kernel_bias_ptr[0]; + + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + + float out_scale = 1. 
/ output_scale; + + for(int i = 0; i < out_channels; i++) + { + out_multiplier32[i] = bias_scale.const_data_ptr()[0] * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 inp_precision = 8; + WORD32 kernel_precision = 8; + pVOID p_scratch = nullptr; + WORD8 *ptr_scratch; + + WORD32 scratch_size = 0; + + WORD32 out_data_format = 1; + WORD32 inp_data_format = 0; + + WORD8 *ptr1 = (WORD8 *)malloc(((input.size(0) * input_channels * input_height * input_width) + 8) * sizeof(WORD8)); + WORD8 *ptr2 = (WORD8 *)malloc(((out_channels * kernel_channels * kernel_height * kernel_width) + 8) * sizeof(WORD8)); + + WORD8 *pin = (WORD8 *)ALIGN_PTR(ptr1, 8); + WORD8 *pkernel = (WORD8 *)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[4]; + p_inp_shape[0] = input.size(0); + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_height; + p_inp_shape[3] = input_width; + + WORD32 p_out_shape[4]; + p_out_shape[0] = input.size(0); + p_out_shape[1] = input_height; + p_out_shape[2] = input_width; + p_out_shape[3] = input_channels; + + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; + + WORD32 num_out_dims = 4; + WORD32 num_inp_dims = 4; + + WORD32 t = xa_nn_transpose_8_8(pin + ,p_out_shape + ,p_inp + ,p_inp_shape + ,p_permute_vec + ,num_out_dims + ,num_inp_dims); + + WORD32 p_inp_shape1[4]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_height; + p_inp_shape1[3] = kernel_width; + + WORD32 p_out_shape1[4]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_height; + p_out_shape1[2] = kernel_width; + p_out_shape1[3] = kernel_channels; + + WORD32 p_permute_vec1[4] = {0, 2, 3, 1}; + + WORD32 num_out_dims1 = 4; + WORD32 num_inp_dims1 = 4; + + WORD32 t1 = xa_nn_transpose_8_8(pkernel + ,p_out_shape1 + ,p_kernel + ,p_inp_shape1 + ,p_permute_vec1 + ,num_out_dims1 + ,num_inp_dims1); + + scratch_size = xa_nn_conv2d_getsize( + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + kernel_channels, + dilation_height, + dilation_width, + y_stride, + y_padding, + x_stride, + x_padding, + out_height, + out_width, + out_channels, + inp_precision, + kernel_precision, + out_data_format); + + scratch_size=scratch_size<0?0:scratch_size; + + ptr_scratch = (WORD8 *)malloc(scratch_size + 16); + + p_scratch = (xa_codec_handle_t)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; ++_n) { + WORD8 *in_batch = pin + _n * input_channels * input_height * input_width; + WORD8 *out_batch = p_out + _n * out_channels * out_height * out_width; + + WORD32 val = xa_nn_conv2d_per_chan_sym8sxasym8s + (out_batch + ,in_batch + ,pkernel + ,p_bias + ,input_height + ,input_width + ,input_channels + ,kernel_height + ,kernel_width + ,kernel_channels + ,dilation_height + ,dilation_width + ,out_channels + ,x_stride + ,y_stride + ,x_padding + ,y_padding + ,out_height + ,out_width + ,input_zero_bias + ,out_multiplier32 + ,out_shift32 + ,out_zero_bias + ,out_data_format + ,p_scratch + ); + } + + free(ptr1); + free(ptr2); + free(ptr_scratch); + } + else if(input.scalar_type() == ScalarType::Byte) + { + printf("UINT8 CONV KERNEL"); + UWORD8* __restrict__ p_out = (UWORD8* __restrict__ )out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = (UWORD8* __restrict__ )input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = (UWORD8* __restrict__ )weight.const_data_ptr(); + WORD32* __restrict__ p_bias = (WORD32* __restrict__ )bias.const_data_ptr(); + + WORD32 input_height = conv1d ? 1 : input.size(2); + WORD32 input_width = conv1d ? 
input.size(2) : input.size(3); + WORD32 input_channels = input.size(1); + WORD32 kernel_height = conv1d ? 1 : weight.size(2); + WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3); + WORD32 kernel_channels = weight.size(1); + WORD32 out_channels = weight.size(0); + WORD32 out_height = conv1d ? 1 : out.size(2); + WORD32 out_width = conv1d ? out.size(2) : out.size(3); + WORD32 batches = input.size(0); + + WORD32 x_stride = stride[1]; + WORD32 y_stride = stride[0]; + WORD32 x_padding = padding[1]; + WORD32 y_padding = padding[0]; + WORD32 dilation_width = 1; + WORD32 dilation_height = 1; + + WORD32 * kernel_bias_ptr = (WORD32 *)weight_zero_point.const_data_ptr(); + + WORD32 input_zero_bias = -in_zero_point; + WORD32 kernel_zero_bias = -kernel_bias_ptr[0]; + + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + + float out_scale = 1. / output_scale; + + for(int i = 0; i < out_channels; i++) + { + out_multiplier32[i] = bias_scale.const_data_ptr()[0] * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 inp_precision = 8; + WORD32 kernel_precision = 8; + pVOID p_scratch = nullptr; + WORD8 *ptr_scratch; + + WORD32 scratch_size = 0; + + WORD32 out_data_format = 1; + WORD32 inp_data_format = 0; + + WORD8 *ptr1 = (WORD8 *)malloc(((input.size(0) * input_channels * input_height * input_width) + 8) * sizeof(WORD8)); + WORD8 *ptr2 = (WORD8 *)malloc(((out_channels * kernel_channels * kernel_height * kernel_width) + 8) * sizeof(WORD8)); + + WORD8 *pin = (WORD8 *)ALIGN_PTR(ptr1, 8); + WORD8 *pkernel = (WORD8 *)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[4]; + p_inp_shape[0] = input.size(0); + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_channels; + p_inp_shape[3] = input_width; + + WORD32 p_out_shape[4]; + p_out_shape[0] = input.size(0); + p_out_shape[1] = input_height; + p_out_shape[2] = input_width; + p_out_shape[3] = input_channels; + + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; + + WORD32 num_out_dims = 4; + WORD32 num_inp_dims = 4; + + WORD8 * p_tmp = (WORD8 *)p_inp; + + WORD32 t = xa_nn_transpose_8_8(pin + ,p_out_shape + ,p_tmp + ,p_inp_shape + ,p_permute_vec + ,num_out_dims + ,num_inp_dims); + + WORD32 p_inp_shape1[4]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_height; + p_inp_shape1[3] = kernel_width; + + WORD32 p_out_shape1[4]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_height; + p_out_shape1[2] = kernel_width; + p_out_shape1[3] = kernel_channels; + + WORD32 p_permute_vec1[4] = {0, 2, 3, 1}; + + WORD32 num_out_dims1 = 4; + WORD32 num_inp_dims1 = 4; + + WORD8 * p_tmp1 = (WORD8 *)p_kernel; + + WORD32 t1 = xa_nn_transpose_8_8(pkernel + ,p_out_shape1 + ,p_tmp1 + ,p_inp_shape1 + ,p_permute_vec1 + ,num_out_dims1 + ,num_inp_dims1); + + scratch_size = xa_nn_conv2d_getsize( + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + kernel_channels, + dilation_height, + dilation_width, + y_stride, + y_padding, + x_stride, + x_padding, + out_height, + out_width, + out_channels, + inp_precision, + kernel_precision, + out_data_format); + + scratch_size=scratch_size<0?0:(scratch_size); + + ptr_scratch = (WORD8 *)malloc(scratch_size + 16); + + p_scratch = (xa_codec_handle_t)ALIGN_PTR(ptr_scratch, 8); + + const UWORD8* __restrict__ p_inp1 = (const UWORD8* __restrict__ )pin; + const UWORD8* __restrict__ p_kernel1 = (const UWORD8* __restrict__ )pkernel; + + for (int _n = 0; _n < batches; _n++) { + const UWORD8* 
__restrict__ in_batch = p_inp1 + _n * input_channels * input_height * input_width; + UWORD8* __restrict__ out_batch = p_out + _n * out_channels * out_height * out_width; + + WORD32 val = xa_nn_conv2d_per_chan_asym8xasym8 + (out_batch + ,in_batch + ,p_kernel1 + ,p_bias + ,input_height + ,input_width + ,input_channels + ,kernel_height + ,kernel_width + ,kernel_channels + ,dilation_height + ,dilation_width + ,out_channels + ,x_stride + ,y_stride + ,x_padding + ,y_padding + ,out_height + ,out_width + ,input_zero_bias + ,out_multiplier32 + ,out_shift32 + ,out_zero_bias + ,out_data_format + ,p_scratch + ); + } + + free(ptr1); + free(ptr2); + free(ptr_scratch); + } + else + { + ET_CHECK_MSG(false, "Unhandled input dtype %hhd", out.scalar_type()); + } + +#else // input = [n, c, h, w] const int n = input.size(0); const int c = input.size(1); @@ -192,36 +567,76 @@ void quantized_conv_out( // per-channel bool per_tensor_quantized = bias_scale.numel() == 1; - conv2d_nchw_core_generic( - input.const_data_ptr(), - weight.const_data_ptr(), - bias.const_data_ptr(), - out.mutable_data_ptr(), - n, - c, - h, - w, - oc, - wc, - wh, - ww, - oh, - ow, - stride[0], - stride[1], - padding[0], - padding[1], - dilation[0], - dilation[1], - groups, - in_zero_point, - weight_zero_point.const_data_ptr(), - bias_scale.const_data_ptr(), - output_scale, - (uint8_t)output_zero_point, - per_tensor_quantized); + if(input.scalar_type() == ScalarType::Char) + { + conv2d_nchw_core_generic( + input.const_data_ptr(), + weight.const_data_ptr(), + bias.const_data_ptr(), + out.mutable_data_ptr(), + n, + c, + h, + w, + oc, + wc, + wh, + ww, + oh, + ow, + stride[0], + stride[1], + padding[0], + padding[1], + 1,//dilation[0], + 1,//dilation[1], + groups, + in_zero_point, + weight_zero_point.const_data_ptr(), + bias_scale.const_data_ptr(), + output_scale, + (int8_t)output_zero_point, + per_tensor_quantized); + + } + else if(input.scalar_type() == ScalarType::Byte) + { + conv2d_nchw_core_generic( + input.const_data_ptr(), + weight.const_data_ptr(), + bias.const_data_ptr(), + out.mutable_data_ptr(), + n, + c, + h, + w, + oc, + wc, + wh, + ww, + oh, + ow, + stride[0], + stride[1], + padding[0], + padding[1], + 1,//dilation[0], + 1,//dilation[1], + groups, + in_zero_point, + weight_zero_point.const_data_ptr(), + bias_scale.const_data_ptr(), + output_scale, + (uint8_t)output_zero_point, + per_tensor_quantized); + } + else + { + ET_CHECK_MSG(false, "Unhandled input dtype %hhd", out.scalar_type()); + } +#endif } }; // namespace native -}; // namespace reference +}; // namespace HiFi }; // namespace impl diff --git a/backends/cadence/hifi/operators/quantized_matmul_out.cpp b/backends/cadence/hifi/operators/quantized_matmul_out.cpp new file mode 100644 index 0000000000..a22cf700d7 --- /dev/null +++ b/backends/cadence/hifi/operators/quantized_matmul_out.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include "kernels.h" + +namespace impl { +namespace HiFi { +namespace native { + +using Tensor = exec_aten::Tensor; +using RuntimeContext = torch::executor::RuntimeContext; + +// The quantized matmul. The quantized matmul accumulates in a wider register, +// whose type is TA. 
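For reference, a minimal standalone sketch of the requantization step that the qmatmul template below applies to each wide accumulator; the helper name and the round-to-nearest/saturate behaviour are assumptions for illustration only, not part of this patch (the patch delegates the final conversion to kernels::quantize):

    #include <math.h>
    #include <stdint.h>

    /* Sketch: map a wide accumulator back to int8 using the same scale
       convention as qmatmul: Z_scale = -Z_multiplier / 2^31 * 2^Z_shift. */
    static int8_t requantize_acc_sketch(float acc,
                                        int32_t Z_multiplier,
                                        int32_t Z_shift,
                                        int32_t Z_zero_point) {
      float Z_scale =
          (-(float)Z_multiplier / 2147483648.0f) * powf(2.0f, (float)Z_shift);
      float v = roundf(acc * Z_scale) + (float)Z_zero_point; /* assumed rounding    */
      if (v < -128.0f) v = -128.0f;                          /* assumed saturation  */
      if (v > 127.0f) v = 127.0f;
      return (int8_t)v;
    }
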
+template < + typename TZ, + typename TA = float, + bool transposed = false, + typename TX = TZ, + typename TY = TZ> +__attribute__((noinline)) void qmatmul( + TZ* __restrict__ Z, + int32_t Z_multiplier, + int32_t Z_shift, + int32_t Z_zero_point, + const TX* __restrict__ X, + int32_t X_zero_point, + const TY* __restrict__ y, + int32_t Y_zero_point, + size_t m, + size_t n, + size_t p) { + // Compute the Z_scale from Z_multiplier and Z_shift + const float Z_scale = -Z_multiplier * 1.0 / (1 << 31) * pow(2, Z_shift); + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < p; ++j) { + TA sum = 0; + for (size_t k = 0; k < n; ++k) { + if (transposed) { + sum += (X[i * n + k] - X_zero_point) * (y[j * n + k] - Y_zero_point); + } else { + sum += (X[i * n + k] - X_zero_point) * (y[k * p + j] - Y_zero_point); + } + } + Z[i * p + j] = kernels::quantize(sum, Z_scale, Z_zero_point); + } + } +} + +template +void inline _typed_quantized_matmul( + const Tensor& X, + int64_t X_zero_point, + const Tensor& Y, + int64_t Y_zero_point, + const exec_aten::optional& bias, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + bool transposed, + Tensor& out) { + T* __restrict__ out_data = out.mutable_data_ptr(); + const T* __restrict__ X_data = X.const_data_ptr(); + const T* __restrict__ Y_data = Y.const_data_ptr(); + size_t batch_size = getLeadingDims(X, X.dim() - 2); + size_t leading_dim = X.size(X.dim() - 2); + size_t out_dim = Y.size(Y.dim() - 1 - transposed); + size_t in_dim = X.size(X.dim() - 1); + for (size_t i = 0; i < batch_size; ++i) { + const T* x = X_data + i * leading_dim * in_dim; + const T* y = Y_data + i * in_dim * out_dim; + T* z = out_data + i * leading_dim * out_dim; + if (transposed) { + qmatmul( + z, + static_cast(out_multiplier), + static_cast(out_shift), + static_cast(out_zero_point), + x, + static_cast(X_zero_point), + y, + static_cast(Y_zero_point), + leading_dim, + in_dim, + out_dim); + } else { + qmatmul( + z, + static_cast(out_multiplier), + static_cast(out_shift), + static_cast(out_zero_point), + x, + static_cast(X_zero_point), + y, + static_cast(Y_zero_point), + leading_dim, + in_dim, + out_dim); + } + } +} + +void quantized_matmul_out( + RuntimeContext& ctx, + const Tensor& X, + int64_t X_zero_point, + const Tensor& Y, + int64_t Y_zero_point, + const exec_aten::optional& bias, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + bool transposed, + Tensor& out) { + size_t batch_size = getLeadingDims(X, X.dim() - 2); + size_t leading_dim = X.size(X.dim() - 2); + size_t out_dim = Y.size(Y.dim() - 1 - transposed); + size_t in_dim = X.size(X.dim() - 1); + + /*printf("transposed = %d\t", transposed); + printf("m = %d\t", leading_dim); + printf("n = %d\t", in_dim); + printf("p = %d\t", out_dim);*/ + + if (out.scalar_type() == exec_aten::ScalarType::Byte) { + //printf("Byte\n"); + _typed_quantized_matmul( + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + out); + + /*uint8_t* __restrict__ out_data = out.mutable_data_ptr(); + const uint8_t* __restrict__ X_data = X.const_data_ptr(); + const uint8_t* __restrict__ Y_data = Y.const_data_ptr(); + int bias_tmp[64] = {0}; + const int32_t* __restrict__ bias_data = bias_tmp;//bias.value().const_data_ptr(); + + xa_nn_matmul_asym8uxasym8u_asym8u( + out_data, // p_out + Y_data, // p_mat1, + X_data, // p_mat2, + bias_data, // p_bias + out_dim, // rows of p_mat1 + in_dim, // cols of p_mat1 + in_dim, // row_stride of p_mat1 + leading_dim, // 
vec_count, i.e., rows of p_mat2 + in_dim, // vec_offset of p_mat2. + out_dim, // out_offset, i.e., offset of next output element written + 1, // out_stride, i.e., stride to go to next output row + -(int32_t)Y_zero_point, // mat1_zero_bias + -(int32_t)X_zero_point, // mat2_zero_bias + (int32_t)out_multiplier, // out_multiplier + (int32_t)out_shift, // out_shift + (int32_t)out_zero_point); // out_zero_bias*/ + + } else if (out.scalar_type() == exec_aten::ScalarType::Char) { + _typed_quantized_matmul( + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + out); + } +} + +}; // namespace native +}; // namespace HiFi +}; // namespace impl diff --git a/backends/cadence/hifi/operators/quantized_relu_out.cpp b/backends/cadence/hifi/operators/quantized_relu_out.cpp index 67567104fa..1643747bae 100644 --- a/backends/cadence/hifi/operators/quantized_relu_out.cpp +++ b/backends/cadence/hifi/operators/quantized_relu_out.cpp @@ -47,5 +47,5 @@ void quantized_relu_out( } }; // namespace native -}; // namespace reference +}; // namespace HiFi }; // namespace impl diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c new file mode 100644 index 0000000000..cad3f1a25b --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c @@ -0,0 +1,313 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +/* + * xa_nn_broadcast_8_8.c + */ + +#include "xa_nnlib_common.h" +//#include "xa_nn_basic_state.h" + +#include +#include + +#include "stdio.h" + +/* + * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c + */ + +#define NUMDIMS_MAX 8 + +typedef struct bcast_expansion_struct_{ + size_t load_num_elem; + int replicate_loadedElm_times; + int repeat_operation; +} bcast_expansion_rule ; + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src); + +void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) +{ + char *dest = (char *)dest1; + char *src = (char *)src1; + int n = (int)n1; + ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; + int i; + void *orig_dest = dest; + + if (n < 32) { + return memcpy(dest, src, n); + } + + if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned + s_align_addr = (ae_int16x4 *) src; + d_align_addr = (ae_int16x4 *) dest; + for (i=0; i>3; i++) { + d_align_addr[i] = s_align_addr[i]; + } + + for (i=(n&~7); i>3; i++) { + AE_LA16X4_IP(t, s_align, s_align_addr); + AE_LA16X4_IP(t2, s_align, s_align_addr); + AE_SA16X4_IP(t, d_align, d_align_addr); + AE_SA16X4_IP(t2, d_align, d_align_addr); + } + AE_SA64POS_FP(d_align, d_align_addr); + ae_int16 *s_src = (ae_int16 *) src; + ae_int16 *s_dest = (ae_int16 *) dest; + for (i=8*i; i8, -1); + + int i = 0; + + /* Check for valid IO shapes */ + for(i=0; i=0){ + + /* Find the sub-matrix size */ + while(in_shape[dim] != 1 && dim>=0){ + num_elem_load *= out_shape[dim]; + dim--; + } + + /* Find the number of times this sub-matrix needs to be copied */ + num_copy_times = 1; + while(in_shape[dim] == 1 && dim>=0){ + num_copy_times *= out_shape[dim]; + dim--; + } + + /* Find the number of times the above copy needs to be repeated */ + num_repeat = 1; + while(in_shape[dim] != 1 && dim>=0){ + num_repeat *= 1 * out_shape[dim]; + dim--; + } + + bcast_expansion_steps[k].load_num_elem = num_elem_load; + bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; + bcast_expansion_steps[k].repeat_operation = num_repeat; + k++; + + num_elem_load = num_elem_load * num_copy_times * num_repeat; + } + + res = broadcast_node_32(bcast_expansion_steps, num_dims-1, + p_out, p_in); + (void)res; /* Unused return value */ + + return 0; +} + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src) { + int step_itr=0, rep_itr=0; + int i=0, j=0, k=0; + bcast_expansion_rule *step = NULL; + + // ignore steps that are null + while(steps[step_id].repeat_operation == 0 && step_id>0){ + step_id--; + } + + // step is now the parent node for this iteration + step = &steps[step_id]; + size_t numLoadedElm = step->load_num_elem; + + WORD32 *cp_dst = dst; + WORD32 *cp_src = src; + WORD32 *cp_src_temp=NULL; + WORD32 *cp_dst_temp=NULL; + + if(numLoadedElm>32){ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += 
step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; jrepeat_operation; j++){ + for(i=0; ireplicate_loadedElm_times; i++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } + else{ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + for(k=0; k<(int)numLoadedElm; k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + } + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; j < step->repeat_operation; j++){ + for(i=0; i < step->replicate_loadedElm_times; i++){ + for(k=0; k<(int)(numLoadedElm); k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + + } + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c new file mode 100644 index 0000000000..244f404d2e --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c @@ -0,0 +1,172 @@ +#include "xa_type_def.h" +#include "xa_nn_common.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_macros.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_common.h" + +WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out + ,const WORD32 *const p_out_shape + ,const WORD32 **pp_inps + ,const WORD32 *const *pp_inps_shape + ,WORD32 num_out_dims + ,WORD32 num_inp + ,WORD32 num_inp_dims + ,WORD32 axis) +{ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); + XA_NNLIB_ARG_CHK_PTR(pp_inps, -1); + XA_NNLIB_ARG_CHK_PTR(pp_inps_shape, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(pp_inps, sizeof(WORD32 *), -1); + XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape, sizeof(WORD32 *), -1); + //Validate Arguments + XA_NNLIB_ARG_CHK_COND((num_out_dims <= 0 || num_out_dims > 6), -1); + XA_NNLIB_ARG_CHK_COND((num_inp <= 0 || num_inp > 10), -1); + XA_NNLIB_ARG_CHK_COND((num_inp_dims != num_out_dims), -1); + XA_NNLIB_ARG_CHK_COND((axis < -num_out_dims || axis >= num_out_dims), -1); + + int i = 0, j = 0; + for(i = 0; i < num_out_dims; i++) + { + XA_NNLIB_ARG_CHK_COND((p_out_shape[i] <= 0), -1); + } + + if(axis < 0) + axis = num_out_dims + axis; + + WORD32 concat_size = 0; + for (i = 0; i < num_inp; i++) + { + XA_NNLIB_ARG_CHK_PTR(pp_inps[i], -1); + XA_NNLIB_ARG_CHK_PTR(pp_inps_shape[i], -1); + XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape[i], sizeof(WORD32), -1); +#pragma loop_count min=1 + for(j = 0; j < num_out_dims; j++) + { + XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][j] != p_out_shape[j] && j != axis), -1); + } + + XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][axis] <= 0), -1); + concat_size += pp_inps_shape[i][axis]; + } + + XA_NNLIB_ARG_CHK_COND((p_out_shape[axis] != concat_size), -1); + + //Calculate outer and inner size for axis + WORD32 outer_size = 1; +#pragma no_simd + for(int i = 0; i < axis; i++) + { + outer_size *= 
p_out_shape[i]; + } + + WORD32 base_inner_size = 1; +#pragma no_simd + for(int i = axis + 1; i < num_out_dims; i++) + { + base_inner_size *= p_out_shape[i]; + } + + WORD32 *ptmp_out = p_out; + for(int i = 0; i < num_inp; i++) + { + const WORD32 copy_size = pp_inps_shape[i][axis] * base_inner_size; + WORD32 *output_ptr = ptmp_out; + const WORD32* input_ptr = pp_inps[i]; + + if(((copy_size & 1) == 0) && (((concat_size * base_inner_size) & 1) == 0) + && (((unsigned)input_ptr & 1) == 0) && (((unsigned)output_ptr & 1) == 0)) + { + if(copy_size <= 8) + { + const ae_f32 *pae_inp = (const ae_f32 *)input_ptr; + for(int k = 0; k < outer_size; k++) + { + ae_f32 *pae_out = (ae_f32 *)output_ptr; +#pragma concurrent +#pragma no_simd + for(int ic = 0; ic < copy_size; ic++) + { + *pae_out++ = *pae_inp++; + } + output_ptr += concat_size * base_inner_size; + } + } + else + { + for(int k = 0; k < outer_size; k++) + { + const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr; + ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr; + ae_valign inp_a, out_a; + inp_a = AE_LA64_PP(pae_inp); + out_a = AE_ZALIGN64(); + for(int ic = 0; ic < (copy_size >> 1); ic++) + { + ae_int32x2 d0; + AE_LA32X2_IP(d0, inp_a, pae_inp); + AE_SA32X2_IP(d0, out_a, pae_out); + } + AE_SA64POS_FP(out_a, pae_out); + const ae_f32 *puae_inp = (const ae_f32 *)pae_inp; + ae_f32 *puae_out = (ae_f32 *)pae_out; +#pragma concurrent + for(int ic = 0; ic < (copy_size & 1); ic++) + { + puae_out[copy_size - 1] = puae_inp[copy_size - 1]; + } + input_ptr += copy_size; + output_ptr += concat_size * base_inner_size; + } + } + } + else + { + if(copy_size <= 6) + { + for(int k = 0; k < outer_size; k++) + { +#pragma concurrent +#pragma no_unroll + for(int ic = 0; ic < copy_size; ic++) + { + output_ptr[ic] = *input_ptr++; + } + output_ptr += concat_size * base_inner_size; + } + } + else + { + for(int k = 0; k < outer_size; k++) + { + const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr; + ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr; + ae_valign inp_a, out_a; + inp_a = AE_LA64_PP(pae_inp); + out_a = AE_ZALIGN64(); + +#pragma concurrent + for(int ic = 0; ic < copy_size >> 1; ic++) + { + ae_int32x2 d0; + AE_LA32X2_IP(d0, inp_a, pae_inp); + AE_SA32X2_IP(d0, out_a, pae_out); + } + AE_SA64POS_FP(out_a, pae_out); + + for(int ic = 0; ic < (copy_size & 1); ic++) + { + output_ptr[copy_size - 1] = input_ptr[copy_size - 1]; + } + input_ptr += copy_size; + output_ptr += concat_size * base_inner_size; + } + } + } + ptmp_out += copy_size; + } + return 0; +} \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c new file mode 100644 index 0000000000..139a97ec3f --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c @@ -0,0 +1,525 @@ +#include "xa_type_def.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +//#include "xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_fmod_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_fmod_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* 
Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_DIV_SX2(x1, x2); + y = FITRUNC_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_DIV_S(a1, a2); + a = FITRUNC_S(a); + a = XT_MUL_S(a, a2); + a = XT_SUB_S(a1, a); + XT_SSI(a, (xtfloat *)out, 0); + } + + return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_fmod_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + /* For computing inp2 - inp1 */ + if(sign_flag){ + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x2, x1); + y = FITRUNC_SX2(y); + y = XT_MUL_SX2(y, x1); + y = XT_SUB_SX2(x2, y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x2, x1); + y = FITRUNC_SX2(y); + y = XT_MUL_SX2(y, x1); + y = XT_SUB_SX2(x2, y); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_DIV_S(b0, a0); + c0 = FITRUNC_S(c0); + c0 = XT_MUL_S(c0, a0); + c0 = XT_SUB_S(b0, c0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } + /* For computing inp1 - inp2 */ + else + { + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x1, x2); + y = FITRUNC_SX2(y); + y = 
XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x1, x2); + y = FITRUNC_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_DIV_S(a0, b0); + c0 = FITRUNC_S(c0); + c0 = XT_MUL_S(c0, b0); + c0 = XT_SUB_S(a0, c0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } +} + +static void internal_elm_fmod_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + /* For computing inp2 - inp1 */ + if(sign_flag){ + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_fmod_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_fmod_broadcast_2D_f32xf32_f32( + p_out_tmp, + 
p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_fmod_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_fmod_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif + diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c new file mode 100644 index 0000000000..752a25b682 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c @@ -0,0 +1,52 @@ +#include "xa_nnlib_common.h" + +WORD32 xa_nn_elm_logicalxor_boolxbool_bool(WORD8 * __restrict__ p_out, + const WORD8 * __restrict__ p_inp1, + const WORD8 * __restrict__ p_inp2, + WORD32 num_elm) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(WORD8), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(WORD8), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + ae_int24x2 *pin1 = (ae_int24x2 *)p_inp1; + ae_int24x2 *pin2 = (ae_int24x2 *)p_inp2; + ae_int24x2 *pout = (ae_int24x2 *)p_out; + int i; + int N = num_elm; + /* Following line divides N by 6. Much faster than compiler implementation. Works for N<32768. 
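(10923 is ceil(2^16/6), so the shift-by-16 form is exact only while N stays below 32768. The 32-bit form used below multiplies by 0x2AAAAAAB = ceil(2^32/6) and keeps the high 32 bits of the 64-bit product, which equals floor(N/6) for any non-negative int32 N; e.g. N = 100: 100 * 715827883 = 71582788300, and 71582788300 >> 32 = 16.)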
*/ + /* unsigned int Nby6 = (N*10923)>>16;*/ + /* Following works for all int32 N */ + int Nby6 = AE_MOVAD32_H(AE_MOVINT32X2_FROMINT64(AE_MUL32_LL(N, 0x2AAAAAAB))); + int remainder_start = 6*Nby6; + + ae_valign align_src_in1, align_src_in2, align_dst; + align_src_in1 = AE_LA64_PP(pin1); + align_src_in2 = AE_LA64_PP(pin2); + align_dst = AE_ZALIGN64(); + +/* Loop is unrolled by 6, to use LA24X2/SA24X2 */ + for(i=0; i < Nby6; i++){ + ae_int24x2 vi1, vi2, vo; + AE_LA24X2_IP(vi1, align_src_in1, pin1); + AE_LA24X2_IP(vi2, align_src_in2, pin2); + vo = AE_XOR24(vi1, vi2); + AE_SA24X2_IP(vo, align_dst, pout); + } + AE_SA64POS_FP(align_dst, pout); + + /* Remainder loop */ + #pragma no_unroll + for(i=remainder_start; i < N; i++){ + p_out[i] = p_inp1[i] & p_inp2[i]; + } + + return 0; +} \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c new file mode 100644 index 0000000000..3b40752211 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c @@ -0,0 +1,525 @@ +#include "xa_type_def.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +//#include "xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_remainder_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_remainder_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_DIV_SX2(x1, x2); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_DIV_S(a1, a2); + a = FIFLOOR_S(a); + a = XT_MUL_S(a, a2); + a = XT_SUB_S(a1, a); + XT_SSI(a, (xtfloat *)out, 0); + } + + return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_remainder_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops 
= in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + /* For computing inp2 - inp1 */ + if(sign_flag){ + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x2, x1); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x1); + y = XT_SUB_SX2(x2, y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x2, x1); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x1); + y = XT_SUB_SX2(x2, y); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_DIV_S(b0, a0); + c0 = FIFLOOR_S(c0); + c0 = XT_MUL_S(c0, a0); + c0 = XT_SUB_S(b0, c0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } + /* For computing inp1 - inp2 */ + else + { + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x1, x2); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x1, x2); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_DIV_S(a0, b0); + c0 = FIFLOOR_S(c0); + c0 = XT_MUL_S(c0, b0); + c0 = XT_SUB_S(a0, c0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } +} + +static void internal_elm_remainder_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + /* For computing inp2 - inp1 */ + if(sign_flag){ + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? 
p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_remainder_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_remainder_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_remainder_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ 
p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_remainder_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif + diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c new file mode 100644 index 0000000000..2372fcadcd --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c @@ -0,0 +1,2029 @@ +#include "xa_type_def.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +//#include "xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_greater_lesser_equal_f32xf32_f32, + ( + WORD8 *y, + const FLOAT32 *x1, + const FLOAT32 *x2, + WORD32 N, + WORD32 kernel_type + ) + ) +#else +WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + WORD32 kernel_type) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(UWORD8), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + //xtfloatx2 *out = (xtfloatx2 *)p_out; + UWORD8 *out = p_out; + xtfloatx2 x1, x2, y; + xtbool check; + + xtfloatx2 float_0 = XT_MOV_SX2(AE_ZERO32()); + + if(kernel_type == 0) + { + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + + a = XT_SUB_S(a2, a1); + + check = 0; + if(a <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *out++ = store; + } + } + else if(kernel_type == 1) + { + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 
2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + + a = XT_SUB_S(a2, a1); + + check = 0; + if(a < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *out++ = store; + } + } + else if(kernel_type == 2) + { + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + + a = XT_SUB_S(a1, a2); + + check = 0; + if(a <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *out++ = store; + } + } + else if(kernel_type == 3) + { + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + + a = 
XT_SUB_S(a1, a2); + + check = 0; + if(a < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *out++ = store; + } + } + else if(kernel_type == 4) + { + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + + //a = XT_SUB_S(a2, a1); + + check = 0; + if(a1 == a2) + check = 1; + + uint8_t store = AE_MOVAB(check); + *out++ = store; + } + } + else if(kernel_type == 5) + { + ae_int32x2 ones = AE_MOVDA32(1); + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + ae_int32x2 store = AE_ZERO32(); + AE_MOVF32X2(store, ones, check); + + *out++ = AE_MOVAD32_H(store); + *out++ = AE_MOVAD32_L(store); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + ae_int32x2 store = AE_ZERO32(); + AE_MOVF32X2(store, ones, check); + + *out++ = AE_MOVAD32_H(store); + *out++ = AE_MOVAD32_L(store); + } + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + + a = XT_SUB_S(a2, a1); + + check = 0; + if(a != 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *out++ = store; + } + } + + return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag, + WORD32 kernel_type) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + + xtbool check; + + xtfloatx2 float_0 = XT_MOV_SX2(AE_ZERO32()); + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + /* For computing inp2 - inp1 */ + if(sign_flag){ + for(i = 0; i < out_lc; i++) + { + 
p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + UWORD8 *p_c = (UWORD8 *)&p_out[i * in_lc]; + + if(kernel_type == 0) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(a0, b0); + + check = 0; + + if(c0 <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 1) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(a0, b0); + + check = 0; + + if(c0 < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 2) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(b0, a0); + + check = 0; + + if(c0 <= 0) + check = 1; + + 
uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 3) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(b0, a0); + + check = 0; + + if(c0 < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 4) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + //c0 = XT_SUB_S(a0, b0); + + check = 0; + + if(a0 == b0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 5) + { + ae_int32x2 ones = AE_MOVDA32(1); + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + ae_int32x2 store = AE_ZERO32(); + AE_MOVF32X2(store, ones, check); + + *p_c++ = AE_MOVAD32_H(store); + *p_c++ = AE_MOVAD32_L(store); + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + ae_int32x2 store = AE_ZERO32(); + AE_MOVF32X2(store, ones, check); + + *p_c++ = AE_MOVAD32_H(store); + *p_c++ = AE_MOVAD32_L(store); + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(a0, b0); + + check = 0; + + if(c0 != 0) + check = 1; + + uint8_t store = AE_MOVAB(check); 
+ *p_c++ = store; + } + } + } + } + /* For computing inp1 - inp2 */ + else + { + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + UWORD8 *p_c = (UWORD8 *)&p_out[i * in_lc]; + + if(kernel_type == 0) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(b0, a0); + + check = 0; + + if(c0 <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if (kernel_type == 1) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(b0, a0); + + check = 0; + + if(c0 < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 2) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + 
XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(a0, b0); + + check = 0; + + if(c0 <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 3) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(a0, b0); + + check = 0; + + if(c0 < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 4) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + //c0 = XT_SUB_S(b0, a0); + + check = 0; + + if(a0 == b0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 5) + { + ae_int32x2 ones = AE_MOVDA32(1); + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + ae_int32x2 store = AE_ZERO32(); + AE_MOVF32X2(store, ones, check); + + *p_c++ = AE_MOVAD32_H(store); + *p_c++ = AE_MOVAD32_L(store); + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + ae_int32x2 store = AE_ZERO32(); + AE_MOVF32X2(store, ones, check); + + *p_c++ = AE_MOVAD32_H(store); + *p_c++ = AE_MOVAD32_L(store); + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat 
*)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(b0, a0); + + check = 0; + + if(c0 != 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + } + } +} + +static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag, + WORD32 kernel_type) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + + xtbool check; + + UWORD8 * p_c = p_out; + xtfloatx2 float_0 = XT_MOV_SX2(AE_ZERO32()); + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + /* For computing inp2 - inp1 */ + if(sign_flag){ + if(kernel_type == 0) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(a0_7, x2); + + check = 0; + + if(out <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 1) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(a0_7, x2); + + check = 0; + + if(out < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 2) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(x2, a0_7); + + check = 0; + + if(out <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 3) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(x2, a0_7); + + check = 0; + + if(out < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 4) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops 
!=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(a0_7, x2); + + check = 0; + + if(out == 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 5) + { + ae_int32x2 ones = AE_MOVDA32(1); + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(x2, a0_7); + + check = 0; + + if(out <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 1) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(x2, a0_7); + + check = 0; + + if(out < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 2) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(a0_7, x2); + + check = 0; + + if(out <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 3) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(a0_7, x2); + + check = 0; + + if(out < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 4) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(x2, a0_7); + + check = 0; + + if(out == 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + } +} +#endif + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_greaterequal_broadcast_4D_f32xf32_f32, + ( + WORD8 * p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 * p_inp2, + const WORD32 *const p_inp2_shape, + WORD32 kernel_type + ) + ) +#else +WORD32 
xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(WORD8 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 * __restrict__ p_inp2, + const WORD32 *const p_inp2_shape, + WORD32 kernel_type) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(UWORD8), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1); + + /* Check shapes */ + int i; + xtbool sign_flag; + for(i = 0; i < 4; i++) + { + if((p_inp1_shape[i] != p_inp2_shape[i] && p_inp1_shape[i] != 1 && p_inp2_shape[i] != 1) || + (p_out_shape[i] != (p_inp1_shape[i] > p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + UWORD8 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag, + kernel_type); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag, + kernel_type); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += 
inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_greater_lesser_equal_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag, + kernel_type); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_greater_lesser_equal_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag, + kernel_type); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif
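
Note on the kernel_type dispatch: the comparison kernels in xa_nn_greater_lesser_equal_f32.c select their predicate through the last argument, with values 0 through 5 covering greater_equal, greater, less_equal, less, equal and not_equal of p_inp1 against p_inp2, writing one 0/1 byte per element. The plain-C model below is only an illustrative sketch of what the HiFi branches compute (it is not part of the patch; int8_t/float/int32_t are assumed here as host stand-ins for WORD8/FLOAT32/WORD32), intended as a scalar reference when checking the SIMD paths.

/* Host-side reference model of the kernel_type dispatch used by
 * xa_nn_elm_greater_lesser_equal_f32xf32_f32 (sketch only; the real kernel
 * runs on HiFi with xtfloatx2 SIMD loads and a scalar remainder loop). */
#include <stdint.h>
#include <stddef.h>

typedef int8_t  WORD8;    /* assumed host stand-ins for xa_type_def.h types */
typedef int32_t WORD32;
typedef float   FLOAT32;

/* kernel_type -> predicate, as realised by the branches above:
 * 0: inp1 >= inp2, 1: inp1 > inp2, 2: inp1 <= inp2,
 * 3: inp1 <  inp2, 4: inp1 == inp2, 5: inp1 != inp2 */
static WORD32 ref_elm_greater_lesser_equal_f32xf32_f32(
    WORD8 *p_out, const FLOAT32 *p_inp1, const FLOAT32 *p_inp2,
    WORD32 num_elm, WORD32 kernel_type)
{
  if (p_out == NULL || p_inp1 == NULL || p_inp2 == NULL || num_elm <= 0)
    return -1;

  for (WORD32 i = 0; i < num_elm; i++) {
    FLOAT32 a = p_inp1[i], b = p_inp2[i];
    int r;
    switch (kernel_type) {
      case 0: r = (a >= b); break;   /* greater_equal */
      case 1: r = (a >  b); break;   /* greater       */
      case 2: r = (a <= b); break;   /* less_equal    */
      case 3: r = (a <  b); break;   /* less          */
      case 4: r = (a == b); break;   /* equal         */
      case 5: r = (a != b); break;   /* not_equal     */
      default: return -1;
    }
    p_out[i] = (WORD8)r;             /* one boolean byte per element */
  }
  return 0;
}

A greater-or-equal result, for instance, is kernel_type 0, which the vector loops realise as xtfloatx2_LE_xtfloatx2(x2, x1) and the remainder loop as checking x2 - x1 <= 0.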
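
Note on the 4D broadcast entry point: xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32 first rejects incompatible shapes, then derives per-dimension strides for both inputs and zeroes the stride of whichever input is broadcast along a dimension; sign_flag records when the two inputs were swapped so the inner 2D/1D kernels can mirror the comparison direction. The helper below is a host-side sketch of that shape-check and stride setup only (an assumed illustration, not an nnlib API).

#include <stdint.h>

typedef int32_t WORD32;  /* assumed host stand-in for the nnlib WORD32 */

/* Sketch of the shape check and stride derivation done at the top of the 4D
 * broadcast kernel: strides are built from the innermost dimension outward,
 * and a dimension of size 1 gets stride 0 so the same element is reused while
 * the other operand advances. Returns -1 on incompatible shapes, otherwise
 * the number of dimensions that actually broadcast. */
static WORD32 setup_broadcast_strides_4d(
    const WORD32 *out_shape, const WORD32 *in1_shape, const WORD32 *in2_shape,
    WORD32 *in1_strides, WORD32 *in2_strides)
{
  for (int i = 0; i < 4; i++) {
    WORD32 mx = in1_shape[i] > in2_shape[i] ? in1_shape[i] : in2_shape[i];
    if ((in1_shape[i] != in2_shape[i] && in1_shape[i] != 1 && in2_shape[i] != 1) ||
        out_shape[i] != mx)
      return -1;
  }
  in1_strides[3] = 1;
  in2_strides[3] = 1;
  for (int i = 2; i >= 0; i--) {
    in1_strides[i] = in1_strides[i + 1] * in1_shape[i + 1];
    in2_strides[i] = in2_strides[i + 1] * in2_shape[i + 1];
  }
  WORD32 broadcast_dims = 0;
  for (int i = 0; i < 4; i++) {
    if (in1_shape[i] != in2_shape[i]) {
      if (in1_shape[i] == 1) in1_strides[i] = 0;
      else                   in2_strides[i] = 0;
      broadcast_dims++;
    }
  }
  return broadcast_dims;
}

With p_inp1_shape = {2,3,1,8} and p_inp2_shape = {1,3,4,8}, for example, this yields inp1 strides {24,8,0,1} and inp2 strides {0,32,8,1}, which is the reuse pattern the internal 2D and 1D broadcast kernels above are written against.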