Adding mean and where ops optimized on HiFi
dijopaul committed Oct 23, 2024
1 parent cb0f53e commit 216389c
Showing 8 changed files with 1,870 additions and 10 deletions.
7 changes: 6 additions & 1 deletion backends/cadence/aot/functions_hifi.yaml
@@ -62,6 +62,11 @@
    - arg_meta: null
      kernel_name: torch::executor::full_out

- op: mean.out
  kernels:
    - arg_meta: null
      kernel_name: cadence::impl::HiFi::mean_dim_out

- op: mul.out
  kernels:
    - arg_meta: null
@@ -105,7 +110,7 @@
- op: where.self_out
  kernels:
    - arg_meta: null
-     kernel_name: torch::executor::where_out
+     kernel_name: cadence::impl::HiFi::where_out

# custom ops
- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
2 changes: 2 additions & 0 deletions backends/cadence/hifi/kernels/CMakeLists.txt
@@ -13,6 +13,8 @@ add_library(
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
)
# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
28 changes: 28 additions & 0 deletions backends/cadence/hifi/kernels/kernels.h
@@ -55,6 +55,34 @@ extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape);

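// NNLib elementwise where kernels backing cadence::impl::HiFi::where_out:
// out[i] = p_condition[i] ? p_inp1[i] : p_inp2[i], in a same-shape variant
// and a 4-D broadcasting variant.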
extern "C" WORD32 xa_nn_elm_where_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const FLOAT32* __restrict__ p_inp1,
const FLOAT32* __restrict__ p_inp2,
const unsigned char* __restrict__ p_condition,
WORD32 num_elm);

extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp1,
const WORD32* const p_inp1_shape,
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape,
const unsigned char* __restrict__ p_condition,
const WORD32* const p_condition_shape);

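// NNLib mean reduction over up to four dimensions; p_scratch_in must be sized
// with xa_nn_reduce_getsize_nhwc (see op_mean.cpp).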
extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp,
const WORD32* const p_inp_shape,
const WORD32* __restrict__ p_axis,
WORD32 num_out_dims,
WORD32 num_inp_dims,
WORD32 num_axis_dims,
void* __restrict__ p_scratch_in);

namespace cadence {
namespace impl {
namespace HiFi {
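op_where.cpp is among the eight changed files but is not reproduced on this page. As a rough illustration of how the flat (same-shape) kernel declared above can be driven, here is a minimal standalone sketch over raw buffers; the values, the shapes, and the assumption that a zero return code signals success are illustrative and are not taken from the commit.

#include <executorch/backends/cadence/hifi/kernels/kernels.h>

int main() {
  // where(cond, a, b): pick a[i] where cond[i] is non-zero, otherwise b[i].
  const WORD32 num_elm = 4;
  FLOAT32 a[4] = {1.f, 2.f, 3.f, 4.f};
  FLOAT32 b[4] = {10.f, 20.f, 30.f, 40.f};
  unsigned char cond[4] = {1, 0, 1, 0};
  FLOAT32 out[4] = {0.f, 0.f, 0.f, 0.f};

  // Assumed NNLib convention: zero return code on success.
  WORD32 status = xa_nn_elm_where_f32xf32_f32(out, a, b, cond, num_elm);

  // Expected contents of out if the call succeeds: {1, 20, 3, 40}.
  return status == 0 ? 0 : 1;
}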
12 changes: 3 additions & 9 deletions backends/cadence/hifi/operators/CMakeLists.txt
@@ -22,19 +22,12 @@ endif()
set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
@@ -57,6 +50,7 @@ set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
)
add_library(aten_ops_cadence ${_aten_ops__srcs})
target_link_libraries(aten_ops_cadence PUBLIC executorch)
170 changes: 170 additions & 0 deletions backends/cadence/hifi/operators/op_mean.cpp
@@ -0,0 +1,170 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <cstdlib>

#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

#include <executorch/backends/cadence/hifi/kernels/kernels.h>

using exec_aten::ScalarType;
using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
using executorch::runtime::ArrayRef;
using torch::executor::Error;
using torch::executor::optional;

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

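// Copies the input and output shapes into fixed-size arrays and folds negative
// entries of dim_list into non-negative axis indices; returns the number of
// reduction axes.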
int prepare_data(
const Tensor& in,
Tensor& out,
optional<ArrayRef<int64_t>> dim_list,
int* inp_shape,
int* out_shape,
int* p_axis,
int num_inp_dims,
int num_out_dims) {
for (int i = 0; i < num_inp_dims; i++) {
inp_shape[i] = in.size(i);
}

for (int i = 0; i < num_out_dims; i++) {
out_shape[i] = out.size(i);
}

int num_axis_dims = 0;
for (const auto& d : dim_list.value()) {
if (d < 0) {
p_axis[num_axis_dims] = num_inp_dims + d;
num_axis_dims++;
} else {
p_axis[num_axis_dims] = d;
num_axis_dims++;
}
}

return num_axis_dims;
}

Tensor& mean_dim_out(
RuntimeContext& ctx,
const Tensor& in,
optional<ArrayRef<int64_t>> dim_list,
bool keepdim,
optional<ScalarType> dtype,
Tensor& out) {
ET_KERNEL_CHECK(
ctx,
torch::executor::check_mean_dim_args(in, dim_list, keepdim, dtype, out),
InvalidArgument,
out);

ET_KERNEL_CHECK(
ctx,
torch::executor::resize_reduction_out(in, dim_list, keepdim, out) ==
Error::Ok,
InvalidArgument,
out);

constexpr auto name = "mean.out";
constexpr int kNnlibMaxDim = 4;

bool optimized = 1;

if (out.scalar_type() != ScalarType::Float)
optimized = 0;

if (in.dim() > kNnlibMaxDim)
optimized = 0;

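// Fast path: float output and input rank <= kNnlibMaxDim go through the NNLib
// reduce-mean kernel.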
if (optimized) {
float* __restrict__ p_out = out.mutable_data_ptr<float>();
const float* __restrict__ p_inp =
(const float* __restrict__)in.const_data_ptr<float>();

int num_elm = in.numel();

int num_inp_dims = in.dim();
int num_out_dims = out.dim();

int inp_shape[kNnlibMaxDim];
int out_shape[kNnlibMaxDim];
int p_axis[kNnlibMaxDim];

for (int i = 0; i < kNnlibMaxDim; i++) {
out_shape[i] = 1;
inp_shape[i] = 1;
p_axis[i] = 1;
}

int num_axis_dims = prepare_data(
in,
out,
dim_list,
inp_shape,
out_shape,
p_axis,
num_inp_dims,
num_out_dims);

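// Reducing over every input dimension collapses the output to a single element.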
if (num_axis_dims == num_inp_dims) {
num_out_dims = 1;
out_shape[0] = 1;
}

int scratch_size = xa_nn_reduce_getsize_nhwc(
-3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1);

void* __restrict__ p_scratch_in = (void* __restrict__)malloc(scratch_size);

xa_nn_reduce_mean_4D_f32_f32(
p_out,
out_shape,
p_inp,
inp_shape,
p_axis,
num_out_dims,
num_inp_dims,
num_axis_dims,
p_scratch_in);

// Release the scratch buffer allocated for the NNLib reduction.
free(p_scratch_in);

return out;
}

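// Portable fallback: generic reduction over dim_list for all other dtypes and ranks.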
ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] {
CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
const size_t num = torch::executor::get_reduced_dim_product(in, dim_list);

for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
CTYPE_OUT sum = 0;
if (in.numel() > 0) {
sum = torch::executor::map_reduce_over_dim_list<CTYPE_IN, CTYPE_OUT>(
[](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
[](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
in,
dim_list,
out_ix);
}
out_data[out_ix] = sum / static_cast<float>(num);
}
});
});

return out;
}

} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
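
To see the NNLib reduce-mean call sequence from the fast path above in isolation, here is a minimal standalone sketch over raw buffers: a 2x3 float input reduced over axis 1, keeping the output at rank 2. The shapes, the scratch sizing, and the -3 first argument to xa_nn_reduce_getsize_nhwc mirror the usage in op_mean.cpp; they are illustrative assumptions rather than values taken from NNLib documentation.

#include <cstdlib>

#include <executorch/backends/cadence/hifi/kernels/kernels.h>

int main() {
  // Mean over axis 1 of a 2x3 input; unused trailing dims stay at 1, matching
  // how op_mean.cpp pads shapes to kNnlibMaxDim.
  FLOAT32 inp[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  FLOAT32 out[2] = {0.f, 0.f};
  WORD32 inp_shape[4] = {2, 3, 1, 1};
  WORD32 out_shape[4] = {2, 1, 1, 1};
  WORD32 axis[1] = {1};

  // Scratch sizing copied from the op_mean.cpp fast path.
  int scratch_size = xa_nn_reduce_getsize_nhwc(-3, inp_shape, 2, axis, 1, 1);
  void* scratch = malloc(scratch_size);

  WORD32 status = xa_nn_reduce_mean_4D_f32_f32(
      out, out_shape, inp, inp_shape, axis,
      /*num_out_dims=*/2, /*num_inp_dims=*/2, /*num_axis_dims=*/1, scratch);

  free(scratch);
  // Expected contents of out if the call succeeds: {2.0f, 5.0f}.
  return status == 0 ? 0 : 1;
}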
