Adding cat, full, permute_copy and relu ops (#34)
* Adding cat, full, permute_copy
nishpoonia authored Nov 18, 2024
1 parent 07743ab commit d730ed8
Showing 10 changed files with 1,025 additions and 9 deletions.
14 changes: 9 additions & 5 deletions backends/cadence/aot/functions_hifi.yaml
@@ -35,7 +35,7 @@
- op: cat.out
kernels:
- arg_meta: null
kernel_name: torch::executor::cat_out
kernel_name: cadence::impl::HiFi::cat_out

- op: clone.out
kernels:
@@ -60,7 +60,7 @@
- op: full.out
kernels:
- arg_meta: null
kernel_name: torch::executor::full_out
kernel_name: cadence::impl::HiFi::full_out

- op: maximum.out
kernels:
@@ -70,7 +70,7 @@
- op: mean.out
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::mean_dim_out
kernel_name: cadence::impl::HiFi::mean_dim_out

- op: minimum.out
kernels:
@@ -85,7 +85,7 @@
- op: permute_copy.out
kernels:
- arg_meta: null
kernel_name: torch::executor::permute_copy_out
kernel_name: cadence::impl::HiFi::permute_copy_out

- op: pow.Scalar_out
kernels:
@@ -155,7 +155,6 @@
- arg_meta: null
kernel_name: cadence::impl::HiFi::dequantize_per_tensor_out


- func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
@@ -165,3 +164,8 @@
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::quantized_linear_out

- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::quantized_relu_out
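
Each kernel_name entry above binds an ATen-compliant op schema to a C++ entry point in the cadence::impl::HiFi::native namespace. For example, the cat.out registration resolves to the cat_out function defined in op_cat.cpp later in this commit; its signature, as added below, is:

    using exec_aten::Tensor;
    using executorch::aten::RuntimeContext;

    namespace cadence {
    namespace impl {
    namespace HiFi {
    namespace native {

    // Entry point selected by `kernel_name: cadence::impl::HiFi::cat_out`.
    Tensor& cat_out(
        RuntimeContext& ctx,
        exec_aten::ArrayRef<Tensor> tensors,
        int64_t dim,
        Tensor& out);

    } // namespace native
    } // namespace HiFi
    } // namespace impl
    } // namespace cadence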
2 changes: 2 additions & 0 deletions backends/cadence/hifi/kernels/CMakeLists.txt
@@ -10,6 +10,7 @@ add_library(
kernels.cpp
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
@@ -18,6 +19,7 @@
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
)
# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
19 changes: 19 additions & 0 deletions backends/cadence/hifi/kernels/kernels.h
@@ -23,6 +23,16 @@ extern "C" WORD32 xa_nn_broadcast_32_32(
const int* const in_shape,
int num_dims);

extern "C" WORD32 xa_nn_concat_32_32(
WORD32* __restrict__ p_out,
const WORD32* const p_out_shape,
const WORD32** pp_inps,
const WORD32* const* pp_inps_shape,
WORD32 num_out_dims,
WORD32 num_inp,
WORD32 num_inp_dims,
WORD32 axis);

extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
@@ -125,6 +135,15 @@ extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32(
WORD32 num_axis_dims,
void* __restrict__ p_scratch_in);

extern "C" WORD32 xa_nn_transpose_32_32(
WORD32* __restrict__ p_out,
const WORD32* const p_out_shape,
const WORD32* __restrict__ p_inp,
const WORD32* const p_inp_shape,
const WORD32* __restrict__ p_permute_vec,
WORD32 num_out_dims,
WORD32 num_inp_dims);

namespace cadence {
namespace impl {
namespace HiFi {
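
The new xa_nn_transpose_32_32 declaration backs the HiFi permute_copy_out kernel (op_permute_copy.cpp is not rendered on this page). The snippet below is a hypothetical sketch, not code from this commit, showing how a 3-D float permute could invoke it by reinterpreting 32-bit float data as WORD32; the shapes and permutation are made-up example values:

    #include <executorch/backends/cadence/hifi/kernels/kernels.h>

    // Hypothetical example: permute a 2x3x4 float buffer to 3x2x4 (dims {1, 0, 2}).
    void permute_example(const float* in_data, float* out_data) {
      WORD32 inp_shape[3] = {2, 3, 4};
      WORD32 out_shape[3] = {3, 2, 4};
      WORD32 permute_vec[3] = {1, 0, 2};

      WORD32 ret = xa_nn_transpose_32_32(
          reinterpret_cast<WORD32*>(out_data),       // p_out
          out_shape,                                 // p_out_shape
          reinterpret_cast<const WORD32*>(in_data),  // p_inp
          inp_shape,                                 // p_inp_shape
          permute_vec,                               // p_permute_vec
          3,                                         // num_out_dims
          3);                                        // num_inp_dims
      (void)ret; // 0 indicates success, as checked for xa_nn_concat_32_32 in op_cat.cpp
    }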
8 changes: 4 additions & 4 deletions backends/cadence/hifi/operators/CMakeLists.txt
@@ -21,23 +21,23 @@ endif()
# ATen compliant ops that are needed to run this model.
set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_cat.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_full.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_maximum.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_permute_copy.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
@@ -71,7 +71,7 @@ target_include_directories(
# Custom ops that are needed to run the test model.
add_library(
custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp"
"quantize_per_tensor.cpp" "dequantize_per_tensor.cpp"
"quantize_per_tensor.cpp" "quantized_relu_out.cpp" "dequantize_per_tensor.cpp"
)
target_include_directories(
custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}
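
quantized_relu_out.cpp joins the custom_ops target here, but its diff is not loaded on this page. Going only by the cadence::quantized_relu.out schema registered in functions_hifi.yaml above, the entry point would be declared roughly as follows; the parameter names, const-ness, and void return are assumptions, not code from this commit:

    using exec_aten::Tensor;
    using executorch::aten::RuntimeContext;

    namespace cadence {
    namespace impl {
    namespace HiFi {
    namespace native {

    // Assumed declaration matching the registered schema:
    // quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point,
    //                    Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out)
    void quantized_relu_out(
        RuntimeContext& ctx,
        const Tensor& X,
        const Tensor& X_zero_point,
        int64_t out_zero_point,
        const Tensor& out_multiplier,
        const Tensor& out_shift,
        Tensor& out);

    } // namespace native
    } // namespace HiFi
    } // namespace impl
    } // namespace cadence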
158 changes: 158 additions & 0 deletions backends/cadence/hifi/operators/op_cat.cpp
@@ -0,0 +1,158 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <cstring>

#include <executorch/backends/cadence/hifi/kernels/kernels.h>

using exec_aten::ScalarType;
using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
using executorch::runtime::getLeadingDims;
using executorch::runtime::getTrailingDims;
using executorch::runtime::resize_tensor;
using executorch::runtime::tensors_have_same_dim_order;
using torch::executor::check_cat_args;
using torch::executor::Error;
using torch::executor::get_cat_out_target_size;

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

Tensor& cat_out(
RuntimeContext& ctx,
exec_aten::ArrayRef<Tensor> tensors,
int64_t dim,
Tensor& out) {
constexpr auto name = "cat.out";
constexpr int kNnlibMaxDim = 16;

bool optimized = true;

if (out.scalar_type() != ScalarType::Float)
optimized = false;

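  // Fast path: for float outputs, hand the concatenation to the NNLib concat
  // kernel, treating the 32-bit float data as WORD32 words.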
if (optimized) {
WORD32 num_inp = tensors.size();
WORD32 num_inp_dims = out.dim();
WORD32 num_out_dims = num_inp_dims;
WORD32 axis = dim;

WORD32 inp_shape[kNnlibMaxDim][kNnlibMaxDim];
WORD32 p_out_shape[kNnlibMaxDim];

WORD32* ptr_shape[kNnlibMaxDim];
const WORD32* ptr[kNnlibMaxDim];

int k = 0;
for (int i = 0; i < num_inp; i++) {
if (tensors[i].numel() == 0)
continue;
ptr[k] = (const WORD32*)tensors[i].const_data_ptr<float>();
for (int j = 0; j < num_inp_dims; j++) {
inp_shape[k][j] = tensors[i].size(j);
}
ptr_shape[k] = inp_shape[k];
k++;
}

num_inp = k;

for (int i = 0; i < num_out_dims; i++) {
p_out_shape[i] = out.size(i);
}

const WORD32** pp_inps = &ptr[0];

WORD32* p_out = (WORD32*)out.mutable_data_ptr<float>();

const WORD32* const* pp_inps_shape = (const WORD32* const*)&ptr_shape[0];

WORD32 ret_val = xa_nn_concat_32_32(
p_out,
p_out_shape,
pp_inps,
pp_inps_shape,
num_out_dims,
num_inp,
num_inp_dims,
axis);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);

return out;
}

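  // Portable fallback for other dtypes: validate the inputs, resize the
  // output, then copy each input slice-by-slice along `dim`.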
if (dim < 0) {
dim += out.dim();
}

ET_KERNEL_CHECK(ctx, check_cat_args(tensors, dim, out), Internal, out);

Tensor::SizesType
expected_out_size[executorch::runtime::kTensorDimensionLimit];
size_t expected_out_dim = 0;
get_cat_out_target_size(tensors, dim, expected_out_size, &expected_out_dim);

ET_KERNEL_CHECK(
ctx,
resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok,
InvalidArgument,
out);

  // Special handling when all inputs are 1D empty tensors, for ATen consistency.
  // In that case, just return a 1D empty tensor without checking dim.
bool all_1d_empty = true;
for (size_t i = 0; i < tensors.size(); ++i) {
if (tensors[i].numel() != 0 || tensors[i].dim() != 1) {
all_1d_empty = false;
break;
}
}
if (all_1d_empty) {
return out;
}

const size_t outer = getLeadingDims(out, dim);
const size_t dim_stride = getTrailingDims(out, dim);
const size_t ninputs = tensors.size();

const auto out_type = out.scalar_type();
ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&] {
CTYPE_OUT* out_ptr = out.mutable_data_ptr<CTYPE_OUT>();
for (size_t i = 0; i < outer; ++i) {
for (size_t j = 0; j < ninputs; ++j) {
const auto in_type = tensors[j].scalar_type();
ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&] {
if (tensors[j].numel() == 0) {
return;
}
size_t inner = tensors[j].size(dim) * dim_stride;
const CTYPE_IN* const in_ptr =
tensors[j].const_data_ptr<CTYPE_IN>() + i * inner;

for (size_t k = 0; k < inner; ++k) {
out_ptr[k] = static_cast<CTYPE_OUT>(in_ptr[k]);
}
out_ptr += inner;
});
}
}
});

return out;
}

} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
(Diffs for the remaining five changed files are not loaded on this page.)
