Adding permute_copy operator kernel optimization (#21)
* Adding permute_copy operator kernel optimization

* Adding permute_copy operator kernel optimization

* Code cleanup

---------

Co-authored-by: dijopaul <[email protected]>
Rushi-cad and dijopaul authored Oct 24, 2024
1 parent fe91c10 commit 6eff57b
Showing 3 changed files with 191 additions and 2 deletions.
2 changes: 1 addition & 1 deletion backends/cadence/aot/functions_hifi.yaml
@@ -160,7 +160,7 @@
 - op: permute_copy.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::permute_copy_out
+      kernel_name: impl::HiFi::permute_copy_out

 - op: sigmoid.out
   kernels:
2 changes: 1 addition & 1 deletion backends/cadence/hifi/operators/CMakeLists.txt
@@ -36,6 +36,7 @@ set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mm.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_permute_copy.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_ne.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
@@ -45,7 +46,6 @@ set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
189 changes: 189 additions & 0 deletions backends/cadence/hifi/operators/op_permute_copy.cpp
@@ -0,0 +1,189 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

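// permute_copy.out for the Cadence HiFi backend: routes float/char/byte
// tensors through NNLib transpose kernels and falls back to a generic
// coordinate walk for all other dtypes.
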
#include <executorch/backends/cadence/hifi/kernels/kernels.h>
#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

using exec_aten::ScalarType;
using exec_aten::SizesType;
using exec_aten::Tensor;
using executorch::runtime::IntArrayRef;
using executorch::runtime::KernelRuntimeContext;
using executorch::runtime::kTensorDimensionLimit;
using torch::executor::Error;

namespace impl {
namespace HiFi {
namespace native {

namespace {

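// Advance `coordinate` (an index tuple into `tensor`) in the order that a
// linear walk of the *output* implies: like an odometer, the axis named by
// dims.back() moves fastest and carries into the axes named earlier in dims.
// Illustrative trace for tensor shape (2, 3) and dims = {1, 0}:
//   (0,0) -> (1,0) -> (0,1) -> (1,1) -> (0,2) -> (1,2)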
void increment_coordinate_permuted(
const Tensor& tensor,
size_t* const coordinate,
IntArrayRef dims) {
for (int i = dims.size() - 1; i >= 0; i--) {
size_t d = dims[i] >= 0 ? dims[i] : dims[i] + tensor.dim();
coordinate[d]++;
if (coordinate[d] == tensor.size(d)) {
coordinate[d] = 0;
} else {
return;
}
}
}

} // namespace

Tensor& permute_copy_out(
KernelRuntimeContext& ctx,
const Tensor& in,
IntArrayRef dims,
Tensor& out) {

ET_KERNEL_CHECK(
ctx, check_permute_copy_args(in, dims, out), InvalidArgument, out);

ET_KERNEL_CHECK(
ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);

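  // Compute the permuted target shape and resize `out` to it up front.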
Tensor::SizesType expected_out_size[kTensorDimensionLimit];
size_t expected_out_dim = 0;
get_permute_copy_out_target_size(
in, dims, expected_out_size, &expected_out_dim);
ET_KERNEL_CHECK(
ctx,
resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok,
InvalidArgument,
out);

  // check_permute_copy_args has verified that in and out share a dtype.
  const auto in_type = in.scalar_type();

  constexpr auto name = "permute_copy.out";
  // The NNLib transpose fast path handles at most kNnlibMaxDim dimensions.
  constexpr int kNnlibMaxDim = 16;

  const bool optimized =
      (in_type == ScalarType::Float || in_type == ScalarType::Char ||
       in_type == ScalarType::Byte) &&
      in.dim() <= kNnlibMaxDim;

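  // Fast path: hand the whole permutation to a single NNLib transpose call.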
if (optimized) {
if (in_type == ScalarType::Float) {
WORD32* p_inp = (WORD32*)in.const_data_ptr<float>();
WORD32* p_out = (WORD32*)out.mutable_data_ptr<float>();

WORD32 num_inp_dims = in.dim();
WORD32 num_out_dims = num_inp_dims;

WORD32 p_inp_shape[kNnlibMaxDim];
WORD32 p_out_shape[kNnlibMaxDim];
WORD32 p_permute_vec[kNnlibMaxDim];

      for (int i = 0; i < num_inp_dims; i++) {
        // Normalize possibly-negative dims (as the fallback path does);
        // output dim i takes the size of input dim dims[i].
        const WORD32 d = dims[i] >= 0 ? dims[i] : dims[i] + in.dim();
        p_inp_shape[i] = in.size(i);
        p_out_shape[i] = in.size(d);
        p_permute_vec[i] = d;
      }

xa_nn_transpose_32_32(
p_out,
p_out_shape,
p_inp,
p_inp_shape,
p_permute_vec,
num_out_dims,
num_inp_dims);

return out;
} else if (in_type == ScalarType::Char) {
WORD8* p_inp = (WORD8*)in.const_data_ptr<char>();
WORD8* p_out = (WORD8*)out.mutable_data_ptr<char>();

WORD32 num_inp_dims = in.dim();
WORD32 num_out_dims = num_inp_dims;

WORD32 p_inp_shape[kNnlibMaxDim];
WORD32 p_out_shape[kNnlibMaxDim];
WORD32 p_permute_vec[kNnlibMaxDim];

      for (int i = 0; i < num_inp_dims; i++) {
        // Same dim normalization as in the float branch.
        const WORD32 d = dims[i] >= 0 ? dims[i] : dims[i] + in.dim();
        p_inp_shape[i] = in.size(i);
        p_out_shape[i] = in.size(d);
        p_permute_vec[i] = d;
      }

xa_nn_transpose_8_8(
p_out,
p_out_shape,
p_inp,
p_inp_shape,
p_permute_vec,
num_out_dims,
num_inp_dims);

} else if (in_type == ScalarType::Byte) {
WORD8* p_inp = (WORD8*)in.const_data_ptr<uint8_t>();
WORD8* p_out = (WORD8*)out.mutable_data_ptr<uint8_t>();

WORD32 num_inp_dims = in.dim();
WORD32 num_out_dims = num_inp_dims;

WORD32 p_inp_shape[kNnlibMaxDim];
WORD32 p_out_shape[kNnlibMaxDim];
WORD32 p_permute_vec[kNnlibMaxDim];

      for (int i = 0; i < num_inp_dims; i++) {
        // Same dim normalization as in the float branch.
        const WORD32 d = dims[i] >= 0 ? dims[i] : dims[i] + in.dim();
        p_inp_shape[i] = in.size(i);
        p_out_shape[i] = in.size(d);
        p_permute_vec[i] = d;
      }

xa_nn_transpose_8_8(
p_out,
p_out_shape,
p_inp,
p_inp_shape,
p_permute_vec,
num_out_dims,
num_inp_dims);
}
return out;
}

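  // Generic fallback: walk the output linearly and gather each element from
  // the corresponding permuted input coordinate; memoized trailing dims make
  // the coordinate-to-index conversion cheaper.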
size_t in_coord[kTensorDimensionLimit] = {0};
size_t trailing_dims_memo[kTensorDimensionLimit];
executorch::runtime::memoizeTrailingDims(in, trailing_dims_memo);

// in and out must be the same dtype
ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&] {
const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();

for (size_t i = 0; i < out.numel(); ++i) {
out_data[i] =
in_data[executorch::runtime::coordinateToIndexWithTrailingDimsMemo(
in, in_coord, trailing_dims_memo)];
increment_coordinate_permuted(in, in_coord, dims);
}
});

return out;
}

} // namespace native
} // namespace HiFi
} // namespace impl
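
For intuition, the following standalone sketch mirrors the permute semantics that both the NNLib fast path and the generic fallback implement. It is not part of this commit: permute_copy_ref and the main() harness are hypothetical names, and it uses plain C++ instead of the ExecuTorch runtime types.

#include <cstdio>
#include <vector>

// Reference permute: out dim i takes input dim dims[i] (row-major storage).
std::vector<float> permute_copy_ref(
    const std::vector<float>& in,
    const std::vector<size_t>& in_shape,
    const std::vector<size_t>& dims) {
  const size_t rank = in_shape.size();

  // Row-major strides of the input.
  std::vector<size_t> in_strides(rank, 1);
  for (size_t i = rank - 1; i-- > 0;)
    in_strides[i] = in_strides[i + 1] * in_shape[i + 1];

  std::vector<float> out(in.size()); // permutation preserves numel
  std::vector<size_t> coord(rank, 0); // coordinate in *input* space
  for (size_t i = 0; i < out.size(); ++i) {
    size_t idx = 0;
    for (size_t d = 0; d < rank; ++d)
      idx += coord[d] * in_strides[d];
    out[i] = in[idx];
    // Odometer increment along the permuted axes, fastest on dims.back(),
    // exactly like increment_coordinate_permuted above.
    for (size_t k = rank; k-- > 0;) {
      const size_t d = dims[k];
      if (++coord[d] < in_shape[d]) break;
      coord[d] = 0;
    }
  }
  return out;
}

int main() {
  // Transpose a 2x3 tensor [[1,2,3],[4,5,6]]; expected output: 1 4 2 5 3 6.
  const std::vector<float> in = {1, 2, 3, 4, 5, 6};
  for (float v : permute_copy_ref(in, {2, 3}, {1, 0}))
    std::printf("%g ", v);
  std::printf("\n");
  return 0;
}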
