From 6eff57b98a0952bc98cd280cb5b05cd96d8ff13b Mon Sep 17 00:00:00 2001
From: Rushi-cad
Date: Thu, 24 Oct 2024 13:17:12 +0530
Subject: [PATCH] Adding permute_copy operator kernel optimization (#21)

* Adding permute_copy operator kernel optimization

* Adding permute_copy operator kernel optimization

* Code cleanup

---------

Co-authored-by: dijopaul <87994875+dijopaul@users.noreply.github.com>
---
 backends/cadence/aot/functions_hifi.yaml       |   2 +-
 .../cadence/hifi/operators/CMakeLists.txt      |   2 +-
 .../hifi/operators/op_permute_copy.cpp         | 189 ++++++++++++++++++
 3 files changed, 191 insertions(+), 2 deletions(-)
 create mode 100644 backends/cadence/hifi/operators/op_permute_copy.cpp

diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
index 9b62b491fa..c32d17f84b 100644
--- a/backends/cadence/aot/functions_hifi.yaml
+++ b/backends/cadence/aot/functions_hifi.yaml
@@ -160,7 +160,7 @@
 - op: permute_copy.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::permute_copy_out
+      kernel_name: impl::HiFi::permute_copy_out
 
 - op: sigmoid.out
   kernels:
diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt
index ec01dfe559..5175c52dba 100644
--- a/backends/cadence/hifi/operators/CMakeLists.txt
+++ b/backends/cadence/hifi/operators/CMakeLists.txt
@@ -36,6 +36,7 @@ set(_aten_ops__srcs
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mm.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_permute_copy.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_ne.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
@@ -45,7 +46,6 @@ set(_aten_ops__srcs
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
diff --git a/backends/cadence/hifi/operators/op_permute_copy.cpp b/backends/cadence/hifi/operators/op_permute_copy.cpp
new file mode 100644
index 0000000000..5348309867
--- /dev/null
+++ b/backends/cadence/hifi/operators/op_permute_copy.cpp
@@ -0,0 +1,189 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/cadence/hifi/kernels/kernels.h>
#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

using exec_aten::ScalarType;
using exec_aten::SizesType;
using exec_aten::Tensor;
using executorch::runtime::IntArrayRef;
using executorch::runtime::KernelRuntimeContext;
using executorch::runtime::kTensorDimensionLimit;
using torch::executor::Error;

namespace impl {
namespace HiFi {
namespace native {

namespace {

// Advance `coordinate` by one step, walking the dimensions in the order given
// by `dims` with the last entry of `dims` varying fastest.
void increment_coordinate_permuted(
    const Tensor& tensor,
    size_t* const coordinate,
    IntArrayRef dims) {
  for (int i = dims.size() - 1; i >= 0; i--) {
    size_t d = dims[i] >= 0 ? dims[i] : dims[i] + tensor.dim();
    coordinate[d]++;
    if (coordinate[d] == tensor.size(d)) {
      coordinate[d] = 0;
    } else {
      return;
    }
  }
}

} // namespace

Tensor& permute_copy_out(
    KernelRuntimeContext& ctx,
    const Tensor& in,
    IntArrayRef dims,
    Tensor& out) {
  (void)ctx;

  ET_KERNEL_CHECK(
      ctx, check_permute_copy_args(in, dims, out), InvalidArgument, out);

  ET_KERNEL_CHECK(
      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);

  Tensor::SizesType expected_out_size[kTensorDimensionLimit];
  size_t expected_out_dim = 0;
  get_permute_copy_out_target_size(
      in, dims, expected_out_size, &expected_out_dim);
  ET_KERNEL_CHECK(
      ctx,
      resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok,
      InvalidArgument,
      out);

  const auto in_type = out.scalar_type();

  constexpr auto name = "permute_copy.out";
  constexpr int kNnlibMaxDim = 16;

  // The NNLib transpose kernels handle float, char and byte tensors with at
  // most kNnlibMaxDim dimensions; everything else falls back to the portable
  // element-by-element path below.
  bool optimized = false;

  if (out.scalar_type() == ScalarType::Float)
    optimized = true;
  else if (out.scalar_type() == ScalarType::Char)
    optimized = true;
  else if (out.scalar_type() == ScalarType::Byte)
    optimized = true;

  if (in.dim() > kNnlibMaxDim)
    optimized = false;

  if (optimized) {
    if (in_type == ScalarType::Float) {
      WORD32* p_inp = (WORD32*)in.const_data_ptr<float>();
      WORD32* p_out = (WORD32*)out.mutable_data_ptr<float>();

      WORD32 num_inp_dims = in.dim();
      WORD32 num_out_dims = num_inp_dims;

      WORD32 p_inp_shape[kNnlibMaxDim];
      WORD32 p_out_shape[kNnlibMaxDim];
      WORD32 p_permute_vec[kNnlibMaxDim];

      // The output shape is the input shape permuted by dims.
      for (int i = 0; i < num_inp_dims; i++) {
        p_inp_shape[i] = in.size(i);
        p_out_shape[i] = in.size(dims[i]);
        p_permute_vec[i] = dims[i];
      }

      xa_nn_transpose_32_32(
          p_out,
          p_out_shape,
          p_inp,
          p_inp_shape,
          p_permute_vec,
          num_out_dims,
          num_inp_dims);

      return out;
    } else if (in_type == ScalarType::Char) {
      WORD8* p_inp = (WORD8*)in.const_data_ptr<char>();
      WORD8* p_out = (WORD8*)out.mutable_data_ptr<char>();

      WORD32 num_inp_dims = in.dim();
      WORD32 num_out_dims = num_inp_dims;

      WORD32 p_inp_shape[kNnlibMaxDim];
      WORD32 p_out_shape[kNnlibMaxDim];
      WORD32 p_permute_vec[kNnlibMaxDim];

      for (int i = 0; i < num_inp_dims; i++) {
        p_inp_shape[i] = in.size(i);
        p_out_shape[i] = in.size(dims[i]);
        p_permute_vec[i] = dims[i];
      }

      xa_nn_transpose_8_8(
          p_out,
          p_out_shape,
          p_inp,
          p_inp_shape,
          p_permute_vec,
          num_out_dims,
          num_inp_dims);

    } else if (in_type == ScalarType::Byte) {
      WORD8* p_inp = (WORD8*)in.const_data_ptr<uint8_t>();
      WORD8* p_out = (WORD8*)out.mutable_data_ptr<uint8_t>();

      WORD32 num_inp_dims = in.dim();
      WORD32 num_out_dims = num_inp_dims;

      WORD32 p_inp_shape[kNnlibMaxDim];
      WORD32 p_out_shape[kNnlibMaxDim];
      WORD32 p_permute_vec[kNnlibMaxDim];

      for (int i = 0; i < num_inp_dims; i++) {
        p_inp_shape[i] = in.size(i);
        p_out_shape[i] = in.size(dims[i]);
        p_permute_vec[i] = dims[i];
      }

      xa_nn_transpose_8_8(
          p_out,
          p_out_shape,
          p_inp,
          p_inp_shape,
          p_permute_vec,
          num_out_dims,
          num_inp_dims);
    }
    return out;
  }

  // Portable fallback: write the output contiguously while walking the input
  // with a permuted coordinate.
  size_t in_coord[kTensorDimensionLimit] = {0};
  size_t trailing_dims_memo[kTensorDimensionLimit];
  executorch::runtime::memoizeTrailingDims(in, trailing_dims_memo);

  // in and out must be the same dtype
  ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&] {
    const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
    CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();

    for (size_t i = 0; i < out.numel(); ++i) {
      out_data[i] =
          in_data[executorch::runtime::coordinateToIndexWithTrailingDimsMemo(
              in, in_coord, trailing_dims_memo)];
      increment_coordinate_permuted(in, in_coord, dims);
    }
  });

  return out;
}

} // namespace native
} // namespace HiFi
} // namespace impl
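
Note on the kernel above: the optimized path hands the permutation to NNLib's xa_nn_transpose_* routines after building p_out_shape and p_permute_vec from dims, while the scalar fallback writes the output contiguously and advances an input coordinate via increment_coordinate_permuted. The standalone sketch below is illustrative only and is not part of the patch; the shape, the permutation and every name in it are invented for the example. It shows the two ideas in plain C++: the output shape is the input shape permuted by dims, and the permuted coordinate maps back to a row-major index into the input buffer.

    // Illustrative sketch only (not part of the patch); shape, permutation and
    // names are made up for this example.
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
      const std::vector<std::size_t> in_shape = {2, 3, 4}; // example input sizes
      const std::vector<std::size_t> dims = {2, 0, 1};     // example permutation

      // Optimized-path idea: out_shape[i] = in_shape[dims[i]], which is how
      // p_out_shape is filled before calling the transpose routine.
      std::vector<std::size_t> out_shape(in_shape.size());
      for (std::size_t i = 0; i < in_shape.size(); ++i)
        out_shape[i] = in_shape[dims[i]];
      std::printf("out shape: %zu %zu %zu\n", out_shape[0], out_shape[1], out_shape[2]);

      // Fallback idea: write the output contiguously and advance an input
      // coordinate in the order given by dims, last permuted dim fastest,
      // mirroring increment_coordinate_permuted().
      std::vector<std::size_t> coord(in_shape.size(), 0);
      std::size_t total = 1;
      for (std::size_t s : in_shape)
        total *= s;

      for (std::size_t linear = 0; linear < total; ++linear) {
        // Row-major index of coord in the input buffer; the real kernel gets
        // this from coordinateToIndexWithTrailingDimsMemo().
        std::size_t in_index = 0;
        for (std::size_t d = 0; d < in_shape.size(); ++d)
          in_index = in_index * in_shape[d] + coord[d];
        std::printf("out[%zu] = in[%zu]\n", linear, in_index);

        // Carry-style increment along the permuted dimensions.
        for (int i = static_cast<int>(dims.size()) - 1; i >= 0; --i) {
          std::size_t d = dims[i];
          if (++coord[d] == in_shape[d]) {
            coord[d] = 0;
          } else {
            break;
          }
        }
      }
      return 0;
    }

Running the sketch prints out shape: 4 2 3 and the element mapping out[0] = in[0], out[1] = in[4], and so on, which matches what the fallback loop in the kernel computes for the same permutation.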