Rushikesh main (#15)
* Adding mean operator kernel optimization

* Merge from main

* Adding compare operators kernel optimization

* Code cleanup

---------

Co-authored-by: dijopaul <[email protected]>
Rushi-cad and dijopaul authored Oct 24, 2024
1 parent 71a6da9 commit fe91c10
Showing 11 changed files with 3,205 additions and 13 deletions.
66 changes: 63 additions & 3 deletions backends/cadence/aot/functions_hifi.yaml
@@ -66,6 +66,56 @@
kernels:
- arg_meta: null
kernel_name: torch::executor::embedding_out

- op: eq.Scalar_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::eq_scalar_out

- op: eq.Tensor_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::eq_tensor_out

- op: ge.Scalar_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::ge_scalar_out

- op: ge.Tensor_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::ge_tensor_out

- op: gt.Scalar_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::gt_scalar_out

- op: gt.Tensor_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::gt_tensor_out

- op: le.Scalar_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::le_scalar_out

- op: le.Tensor_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::le_tensor_out

- op: lt.Scalar_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::lt_scalar_out

- op: lt.Tensor_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::lt_tensor_out

- op: full.out
kernels:
@@ -80,7 +130,17 @@
- op: mean.out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::mean_dim_out
kernel_name: impl::HiFi::mean_dim_out

- op: ne.Scalar_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::ne_scalar_out

- op: ne.Tensor_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::ne_tensor_out

- op: minimum.out
kernels:
@@ -90,7 +150,7 @@
- op: mm.out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::mm_out
kernel_name: impl::HiFi::mm_out

- op: mul.out
kernels:
@@ -136,7 +196,7 @@
kernels:
- arg_meta: null
kernel_name: impl::HiFi::where_out

# custom ops
- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
variants: function
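For reference, each kernel_name registered above is expected to resolve to an out-variant function implemented in the HiFi operator sources added later in this diff (op_eq.cpp shows the full implementation for eq). A minimal sketch of the signatures these registrations assume, illustrative only and mirroring the eq kernels defined below:

// Sketch of the out-variant signatures implied by the YAML entries above.
// RuntimeContext, Tensor, and Scalar come from the ExecuTorch runtime headers
// included in op_eq.cpp later in this diff.
namespace impl {
namespace HiFi {
namespace native {

// eq.Tensor_out: elementwise tensor/tensor comparison, result written to out.
Tensor& eq_tensor_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out);

// eq.Scalar_out: compare every element of a against the scalar b.
Tensor& eq_scalar_out(RuntimeContext& ctx, const Tensor& a, const Scalar& b, Tensor& out);

} // namespace native
} // namespace HiFi
} // namespace impl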
1 change: 1 addition & 0 deletions backends/cadence/hifi/kernels/CMakeLists.txt
@@ -18,6 +18,7 @@ add_library(
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
)
35 changes: 25 additions & 10 deletions backends/cadence/hifi/kernels/kernels.h
@@ -32,10 +32,11 @@ extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape);

extern "C" void xa_nn_elm_atan2_f32(FLOAT32 * z,
const FLOAT32 * y,
const FLOAT32 * x,
WORD32 N );
extern "C" void xa_nn_elm_atan2_f32(
FLOAT32 * z,
const FLOAT32 * y,
const FLOAT32 * x,
WORD32 N );

extern "C" WORD32 xa_nn_elm_clamp_f32xf32xf32_f32(
FLOAT32* __restrict__ p_out,
@@ -70,13 +71,27 @@ extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32(
WORD32 mode);

extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp1,
const WORD32* const p_inp1_shape,
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape,
FLOAT32 * __restrict__ p_out,
const WORD32 *const p_out_shape,
const FLOAT32 * __restrict__ p_inp1,
const WORD32 *const p_inp1_shape,
const FLOAT32 * __restrict__ p_inp2,
const WORD32 *const p_inp2_shape,
WORD32 mode);

extern "C" WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out,
const FLOAT32 * __restrict__ p_inp1,
const FLOAT32 * __restrict__ p_inp2,
WORD32 num_elm,
WORD32 kernel_type);

extern "C" WORD32 xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(WORD8 * __restrict__ p_out,
const WORD32 *const p_out_shape,
const FLOAT32 * __restrict__ p_inp1,
const WORD32 *const p_inp1_shape,
const FLOAT32 * __restrict__ p_inp2,
const WORD32 *const p_inp2_shape,
WORD32 kernel_type);

extern "C" WORD32 xa_nn_elm_maximum_f32xf32_f32(
FLOAT32* __restrict__ p_out,
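As a usage note for the two new nnlib entry points above: both write one byte per element into p_out and take a kernel_type selector. A minimal sketch of the non-broadcast call, mirroring how op_eq.cpp below invokes it (kernel_type 4 is the value that file passes for equality; the mapping of the other selector values to the remaining comparisons is not shown in this diff):

// Sketch only. Assumes a, b, and out are float tensors with out.numel()
// elements; the WORD32 return value is treated here as a status code, as an
// assumption, since op_eq.cpp ignores it.
WORD8* out_mask = reinterpret_cast<WORD8*>(out.mutable_data_ptr<int8_t>());
WORD32 status = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
    out_mask,
    a.const_data_ptr<float>(),
    b.const_data_ptr<float>(),
    out.numel(),
    4 /* kernel_type: equality, per op_eq.cpp */);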
6 changes: 6 additions & 0 deletions backends/cadence/hifi/operators/CMakeLists.txt
@@ -25,12 +25,18 @@ set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bmm.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_clamp.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_eq.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_full.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_ge.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_gt.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_le.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_lt.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_maximum.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mm.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_ne.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
180 changes: 180 additions & 0 deletions backends/cadence/hifi/operators/op_eq.cpp
@@ -0,0 +1,180 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/kernels/portable/cpu/util/functional_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

#include <executorch/backends/cadence/hifi/kernels/kernels.h>

using exec_aten::Scalar;
using exec_aten::ScalarType;
using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
using executorch::runtime::CppTypeToScalarType;
using torch::executor::Error;

namespace impl {
namespace HiFi {
namespace native {

Tensor& eq_tensor_out(
RuntimeContext& ctx,
const Tensor& a,
const Tensor& b,
Tensor& out) {
ET_KERNEL_CHECK(
ctx,
resize_to_broadcast_target_size(a, b, out) == Error::Ok,
InvalidArgument,
out);

ScalarType a_type = a.scalar_type();
ScalarType b_type = b.scalar_type();
ScalarType out_type = out.scalar_type();

constexpr auto name = "eq.Tensor_out";
constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */

int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
bool optimized = 1;
/*find broadcast*/
const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
max_dim = out.dim() > max_dim ? out.dim() : max_dim;

if (out_type != ScalarType::Float)
optimized = 0;

if ((a_dim == 0) || (b_dim == 0))
optimized = 0;

if ((broadcast == 1) && (max_dim > kNnlibMaxDim))
optimized = 0;

if (optimized) {
int8_t* __restrict__ p_out =
(int8_t* __restrict__)out.mutable_data_ptr<int8_t>();
const float* __restrict__ p_inp1 =
(const float* __restrict__)a.const_data_ptr<float>();
const float* __restrict__ p_inp2 =
(const float* __restrict__)b.const_data_ptr<float>();

if (broadcast) {
int out_shape[kNnlibMaxDim];
int inp1_shape[kNnlibMaxDim];
int inp2_shape[kNnlibMaxDim];

for (int i = 0; i < kNnlibMaxDim; i++) {
inp1_shape[i] = 1;
inp2_shape[i] = 1;
out_shape[i] = 1;
}

int off_o = kNnlibMaxDim - out.dim();
int off_a = kNnlibMaxDim - a.dim();
int off_b = kNnlibMaxDim - b.dim();

for (int i = 0; i < out.dim(); i++)
out_shape[i + off_o] = out.size(i);
for (int i = 0; i < a.dim(); i++)
inp1_shape[i + off_a] = a.size(i);
for (int i = 0; i < b.dim(); i++)
inp2_shape[i + off_b] = b.size(i);

xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 4);
} else {
int num_elm = out.numel();

xa_nn_elm_greater_lesser_equal_f32xf32_f32(
p_out, p_inp1, p_inp2, num_elm, 4);
}

return out;
}

ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, name, CTYPE_A, [&]() {
ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, name, CTYPE_B, [&]() {
using CTYPE_IN =
typename torch::executor::promote_types<CTYPE_A, CTYPE_B>::type;
ET_DCHECK(
CppTypeToScalarType<CTYPE_IN>::value == promoteTypes(a_type, b_type));
ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, name, CTYPE_OUT, [&]() {
torch::executor::
apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
[](const CTYPE_A val_a, const CTYPE_B val_b) {
CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
bool value = a_casted == b_casted;
return static_cast<CTYPE_OUT>(value);
},
a,
b,
out);
});
});
});

return out;
}

Tensor& eq_scalar_out(
RuntimeContext& ctx,
const Tensor& a,
const Scalar& b,
Tensor& out) {
(void)ctx;

// Resize for dynamic shape
ET_KERNEL_CHECK_MSG(
ctx,
resize_tensor(out, a.sizes()) == Error::Ok,
InvalidArgument,
out,
"Failed to resize output tensor.");

constexpr auto name = "eq.Scalar_out";

ScalarType a_type = a.scalar_type();
ScalarType b_type = torch::executor::native::utils::get_scalar_dtype(b);
ScalarType out_type = out.scalar_type();

ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, name, CTYPE_A, [&]() {
ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
using CTYPE_IN =
typename torch::executor::promote_types<CTYPE_A, CTYPE_B>::type;
ET_DCHECK(
CppTypeToScalarType<CTYPE_IN>::value == promoteTypes(a_type, b_type));
ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, name, CTYPE_OUT, [&]() {
CTYPE_B val_b = 0;
torch::executor::native::utils::extract_scalar(b, &val_b);
torch::executor::apply_unary_map_fn(
[val_b](const CTYPE_A val_a) {
CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
bool value = a_casted == b_casted;
return static_cast<CTYPE_OUT>(value);
},
a.const_data_ptr<CTYPE_A>(),
out.mutable_data_ptr<CTYPE_OUT>(),
out.numel());
});
});
});

return out;
}

} // namespace native
} // namespace HiFi
} // namespace impl
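A quick worked example of the shape right-alignment in the broadcast branch of eq_tensor_out above, using illustrative sizes only (out = {2, 8, 16}, a = {8, 16}):

// Standalone illustration of the off_o / off_a padding used above: every
// shape is left-padded with 1s to kNnlibMaxDim entries so the 4-D nnlib
// broadcast kernel sees equivalent 4-dimensional tensors.
#include <cstdio>

int main() {
  constexpr int kNnlibMaxDim = 4;
  const int out_sizes[] = {2, 8, 16};  // stand-in for out.sizes()
  const int a_sizes[] = {8, 16};       // stand-in for a.sizes()
  const int out_dim = 3, a_dim = 2;

  int out_shape[kNnlibMaxDim], inp1_shape[kNnlibMaxDim];
  for (int i = 0; i < kNnlibMaxDim; i++)
    out_shape[i] = inp1_shape[i] = 1;

  const int off_o = kNnlibMaxDim - out_dim;  // 1
  const int off_a = kNnlibMaxDim - a_dim;    // 2
  for (int i = 0; i < out_dim; i++)
    out_shape[i + off_o] = out_sizes[i];
  for (int i = 0; i < a_dim; i++)
    inp1_shape[i + off_a] = a_sizes[i];

  // Prints: out_shape = 1 2 8 16 and inp1_shape = 1 1 8 16
  printf("out_shape =");
  for (int i = 0; i < kNnlibMaxDim; i++) printf(" %d", out_shape[i]);
  printf("\ninp1_shape =");
  for (int i = 0; i < kNnlibMaxDim; i++) printf(" %d", inp1_shape[i]);
  printf("\n");
  return 0;
}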
