Adding cat, full, permute_copy and relu ops (#34)
* Adding cat, full, permute_copy
nishpoonia authored Nov 18, 2024
1 parent 07743ab commit d730ed8
Showing 10 changed files with 1,025 additions and 9 deletions.
14 changes: 9 additions & 5 deletions backends/cadence/aot/functions_hifi.yaml
@@ -35,7 +35,7 @@
- op: cat.out
kernels:
- arg_meta: null
kernel_name: torch::executor::cat_out
kernel_name: cadence::impl::HiFi::cat_out

- op: clone.out
kernels:
@@ -60,7 +60,7 @@
- op: full.out
kernels:
- arg_meta: null
kernel_name: torch::executor::full_out
kernel_name: cadence::impl::HiFi::full_out

- op: maximum.out
kernels:
@@ -70,7 +70,7 @@
- op: mean.out
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::mean_dim_out
kernel_name: cadence::impl::HiFi::mean_dim_out

- op: minimum.out
kernels:
@@ -85,7 +85,7 @@
- op: permute_copy.out
kernels:
- arg_meta: null
kernel_name: torch::executor::permute_copy_out
kernel_name: cadence::impl::HiFi::permute_copy_out

- op: pow.Scalar_out
kernels:
@@ -155,7 +155,6 @@
- arg_meta: null
kernel_name: cadence::impl::HiFi::dequantize_per_tensor_out


- func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
@@ -165,3 +164,8 @@
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::quantized_linear_out

- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::quantized_relu_out
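
Each kernel_name entry above binds an ATen-compliant op schema to a C++ entry point in the cadence::impl::HiFi::native namespace. For example, the cat.out registration resolves to the cat_out function defined in op_cat.cpp later in this commit; its signature, as added below, is:

    using exec_aten::Tensor;
    using executorch::aten::RuntimeContext;

    namespace cadence {
    namespace impl {
    namespace HiFi {
    namespace native {

    // Entry point selected by `kernel_name: cadence::impl::HiFi::cat_out`.
    Tensor& cat_out(
        RuntimeContext& ctx,
        exec_aten::ArrayRef<Tensor> tensors,
        int64_t dim,
        Tensor& out);

    } // namespace native
    } // namespace HiFi
    } // namespace impl
    } // namespace cadence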
2 changes: 2 additions & 0 deletions backends/cadence/hifi/kernels/CMakeLists.txt
@@ -10,6 +10,7 @@ add_library(
kernels.cpp
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
@@ -18,6 +19,7 @@
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
)
# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
19 changes: 19 additions & 0 deletions backends/cadence/hifi/kernels/kernels.h
@@ -23,6 +23,16 @@ extern "C" WORD32 xa_nn_broadcast_32_32(
const int* const in_shape,
int num_dims);

extern "C" WORD32 xa_nn_concat_32_32(
WORD32* __restrict__ p_out,
const WORD32* const p_out_shape,
const WORD32** pp_inps,
const WORD32* const* pp_inps_shape,
WORD32 num_out_dims,
WORD32 num_inp,
WORD32 num_inp_dims,
WORD32 axis);

extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
@@ -125,6 +135,15 @@ extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32(
WORD32 num_axis_dims,
void* __restrict__ p_scratch_in);

extern "C" WORD32 xa_nn_transpose_32_32(
WORD32* __restrict__ p_out,
const WORD32* const p_out_shape,
const WORD32* __restrict__ p_inp,
const WORD32* const p_inp_shape,
const WORD32* __restrict__ p_permute_vec,
WORD32 num_out_dims,
WORD32 num_inp_dims);

namespace cadence {
namespace impl {
namespace HiFi {
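
The new xa_nn_transpose_32_32 declaration backs the HiFi permute_copy_out kernel (op_permute_copy.cpp is not rendered on this page). The snippet below is a hypothetical sketch, not code from this commit, showing how a 3-D float permute could invoke it by reinterpreting 32-bit float data as WORD32; the shapes and permutation are made-up example values:

    #include <executorch/backends/cadence/hifi/kernels/kernels.h>

    // Hypothetical example: permute a 2x3x4 float buffer to 3x2x4 (dims {1, 0, 2}).
    void permute_example(const float* in_data, float* out_data) {
      WORD32 inp_shape[3] = {2, 3, 4};
      WORD32 out_shape[3] = {3, 2, 4};
      WORD32 permute_vec[3] = {1, 0, 2};

      WORD32 ret = xa_nn_transpose_32_32(
          reinterpret_cast<WORD32*>(out_data),       // p_out
          out_shape,                                 // p_out_shape
          reinterpret_cast<const WORD32*>(in_data),  // p_inp
          inp_shape,                                 // p_inp_shape
          permute_vec,                               // p_permute_vec
          3,                                         // num_out_dims
          3);                                        // num_inp_dims
      (void)ret; // 0 indicates success, as checked for xa_nn_concat_32_32 in op_cat.cpp
    }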
8 changes: 4 additions & 4 deletions backends/cadence/hifi/operators/CMakeLists.txt
@@ -21,23 +21,23 @@ endif()
# ATen compliant ops that are needed to run this model.
set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_cat.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_full.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_maximum.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_permute_copy.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
@@ -71,7 +71,7 @@ target_include_directories(
# Custom ops that are needed to run the test model.
add_library(
custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp"
"quantize_per_tensor.cpp" "dequantize_per_tensor.cpp"
"quantize_per_tensor.cpp" "quantized_relu_out.cpp" "dequantize_per_tensor.cpp"
)
target_include_directories(
custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}
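
quantized_relu_out.cpp joins the custom_ops target here, but its diff is not loaded on this page. Going only by the cadence::quantized_relu.out schema registered in functions_hifi.yaml above, the entry point would be declared roughly as follows; the parameter names, const-ness, and void return are assumptions, not code from this commit:

    using exec_aten::Tensor;
    using executorch::aten::RuntimeContext;

    namespace cadence {
    namespace impl {
    namespace HiFi {
    namespace native {

    // Assumed declaration matching the registered schema:
    // quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point,
    //                    Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out)
    void quantized_relu_out(
        RuntimeContext& ctx,
        const Tensor& X,
        const Tensor& X_zero_point,
        int64_t out_zero_point,
        const Tensor& out_multiplier,
        const Tensor& out_shift,
        Tensor& out);

    } // namespace native
    } // namespace HiFi
    } // namespace impl
    } // namespace cadence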
158 changes: 158 additions & 0 deletions backends/cadence/hifi/operators/op_cat.cpp
@@ -0,0 +1,158 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <cstring>

#include <executorch/backends/cadence/hifi/kernels/kernels.h>

using exec_aten::ScalarType;
using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
using executorch::runtime::getLeadingDims;
using executorch::runtime::getTrailingDims;
using executorch::runtime::resize_tensor;
using executorch::runtime::tensors_have_same_dim_order;
using torch::executor::check_cat_args;
using torch::executor::Error;
using torch::executor::get_cat_out_target_size;

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

Tensor& cat_out(
RuntimeContext& ctx,
exec_aten::ArrayRef<Tensor> tensors,
int64_t dim,
Tensor& out) {
constexpr auto name = "cat.out";
constexpr int kNnlibMaxDim = 16;

bool optimized = true;

if (out.scalar_type() != ScalarType::Float)
optimized = false;

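  // Fast path: for float outputs, hand the concatenation to the NNLib concat
  // kernel, treating the 32-bit float data as WORD32 words.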
if (optimized) {
WORD32 num_inp = tensors.size();
WORD32 num_inp_dims = out.dim();
WORD32 num_out_dims = num_inp_dims;
WORD32 axis = dim;

WORD32 inp_shape[kNnlibMaxDim][kNnlibMaxDim];
WORD32 p_out_shape[kNnlibMaxDim];

WORD32* ptr_shape[kNnlibMaxDim];
const WORD32* ptr[kNnlibMaxDim];

int k = 0;
for (int i = 0; i < num_inp; i++) {
if (tensors[i].numel() == 0)
continue;
ptr[k] = (const WORD32*)tensors[i].const_data_ptr<float>();
for (int j = 0; j < num_inp_dims; j++) {
inp_shape[k][j] = tensors[i].size(j);
}
ptr_shape[k] = inp_shape[k];
k++;
}

num_inp = k;

for (int i = 0; i < num_out_dims; i++) {
p_out_shape[i] = out.size(i);
}

const WORD32** pp_inps = &ptr[0];

WORD32* p_out = (WORD32*)out.mutable_data_ptr<float>();

const WORD32* const* pp_inps_shape = (const WORD32* const*)&ptr_shape[0];

WORD32 ret_val = xa_nn_concat_32_32(
p_out,
p_out_shape,
pp_inps,
pp_inps_shape,
num_out_dims,
num_inp,
num_inp_dims,
axis);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);

return out;
}

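  // Portable fallback for other dtypes: validate the inputs, resize the
  // output, then copy each input slice-by-slice along `dim`.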
if (dim < 0) {
dim += out.dim();
}

ET_KERNEL_CHECK(ctx, check_cat_args(tensors, dim, out), Internal, out);

Tensor::SizesType
expected_out_size[executorch::runtime::kTensorDimensionLimit];
size_t expected_out_dim = 0;
get_cat_out_target_size(tensors, dim, expected_out_size, &expected_out_dim);

ET_KERNEL_CHECK(
ctx,
resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok,
InvalidArgument,
out);

  // Special handling when all inputs are 1D empty tensors, for ATen consistency.
  // In that case, just return a 1D empty tensor without checking dim.
bool all_1d_empty = true;
for (size_t i = 0; i < tensors.size(); ++i) {
if (tensors[i].numel() != 0 || tensors[i].dim() != 1) {
all_1d_empty = false;
break;
}
}
if (all_1d_empty) {
return out;
}

const size_t outer = getLeadingDims(out, dim);
const size_t dim_stride = getTrailingDims(out, dim);
const size_t ninputs = tensors.size();

const auto out_type = out.scalar_type();
ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&] {
CTYPE_OUT* out_ptr = out.mutable_data_ptr<CTYPE_OUT>();
for (size_t i = 0; i < outer; ++i) {
for (size_t j = 0; j < ninputs; ++j) {
const auto in_type = tensors[j].scalar_type();
ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&] {
if (tensors[j].numel() == 0) {
return;
}
size_t inner = tensors[j].size(dim) * dim_stride;
const CTYPE_IN* const in_ptr =
tensors[j].const_data_ptr<CTYPE_IN>() + i * inner;

for (size_t k = 0; k < inner; ++k) {
out_ptr[k] = static_cast<CTYPE_OUT>(in_ptr[k]);
}
out_ptr += inner;
});
}
}
});

return out;
}

} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
(Diffs for the remaining five changed files are not loaded on this page.)
