Adding sub, sigmoid, permute, view_copy ops (#6)
dijopaul authored Aug 1, 2024
1 parent a445497 commit f4cf6c8
Showing 8 changed files with 777 additions and 2 deletions.
3 changes: 2 additions & 1 deletion backends/cadence/hifi/kernels/CMakeLists.txt
@@ -11,6 +11,7 @@ add_library(
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_broadcast_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_floor_div_broadcast_f32.c
@@ -25,4 +26,4 @@ target_include_directories(
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/
)

target_link_libraries(cadence_kernels PRIVATE xa_nnlib)
8 changes: 8 additions & 0 deletions backends/cadence/hifi/kernels/kernels.h
@@ -63,6 +63,14 @@ extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__
const WORD32 *const p_inp1_shape,
const FLOAT32 * __restrict__ p_inp2,
const WORD32 *const p_inp2_shape);

extern "C" WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out
,const WORD32 *const p_out_shape
,const WORD32 * __restrict__ p_inp
,const WORD32 *const p_inp_shape
,const WORD32 * __restrict__ p_permute_vec
,WORD32 num_out_dims
,WORD32 num_inp_dims);

namespace impl {
namespace HiFi {
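For reference, a minimal sketch of how this new transpose entry point is driven, mirroring the call made in op_permute_copy.cpp below. The 2x3 input, the {1, 0} permute vector, and the expected output are illustrative assumptions about the kernel's permute semantics (p_out_shape[i] == p_inp_shape[p_permute_vec[i]]), not part of the header:

// transpose_sketch.cpp -- standalone sketch; assumes WORD32 is a 32-bit int
// and that the kernel permutes axes the same way permute_copy_out does.
#include <cstdio>
typedef int WORD32;
extern "C" WORD32 xa_nn_transpose_32_32(WORD32* p_out, const WORD32* p_out_shape,
    const WORD32* p_inp, const WORD32* p_inp_shape,
    const WORD32* p_permute_vec, WORD32 num_out_dims, WORD32 num_inp_dims);

int main() {
  WORD32 inp[6] = {1, 2, 3, 4, 5, 6}; // 2x3, row-major
  WORD32 out[6];
  WORD32 inp_shape[2] = {2, 3};
  WORD32 out_shape[2] = {3, 2};
  WORD32 permute_vec[2] = {1, 0}; // swap the two axes, i.e. a transpose
  xa_nn_transpose_32_32(out, out_shape, inp, inp_shape, permute_vec, 2, 2);
  for (int i = 0; i < 6; i++)
    printf("%d ", out[i]); // expected: 1 4 2 5 3 6
  return 0;
}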
5 changes: 4 additions & 1 deletion backends/cadence/hifi/operators/CMakeLists.txt
@@ -23,6 +23,7 @@ set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_view_copy.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
@@ -38,6 +39,8 @@ set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_permute_copy.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp"
@@ -46,7 +49,7 @@
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_softmax.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp"
144 changes: 144 additions & 0 deletions backends/cadence/hifi/operators/op_permute_copy.cpp
@@ -0,0 +1,144 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include "kernels.h"

namespace torch {
namespace executor {
namespace native {

using SizesType = exec_aten::SizesType;
using Tensor = exec_aten::Tensor;
using IntArrayRef = exec_aten::ArrayRef<int64_t>;

namespace {

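// Advances `coordinate` (indexing into `tensor`) to the element that the next
// output position reads from: the input axis named last in `dims` varies
// fastest, and a wrap at any axis carries into the axis named before it.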
void increment_coordinate_permuted(
const Tensor& tensor,
size_t* const coordinate,
IntArrayRef dims) {
for (int i = dims.size() - 1; i >= 0; i--) {
size_t d = dims[i] >= 0 ? dims[i] : dims[i] + tensor.dim();
coordinate[d]++;
if (coordinate[d] == tensor.size(d)) {
coordinate[d] = 0;
} else {
return;
}
}
}

} // namespace

Tensor& permute_copy_out(
RuntimeContext& ctx,
const Tensor& in,
IntArrayRef dims,
Tensor& out) {
(void)ctx;

ET_KERNEL_CHECK(
ctx, check_permute_copy_args(in, dims, out), InvalidArgument, out);

Tensor::SizesType expected_out_size[kTensorDimensionLimit];
size_t expected_out_dim = 0;
get_permute_copy_out_target_size(
in, dims, expected_out_size, &expected_out_dim);
ET_KERNEL_CHECK(
ctx,
resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok,
InvalidArgument,
out);

  // check_permute_copy_args guarantees in and out share a dtype.
  const auto in_type = in.scalar_type();

  if (in_type == ScalarType::Float) {
    // The NNLib kernel transposes 32-bit words without interpreting them,
    // so float data can be reinterpreted as WORD32 for the copy.
    const WORD32* p_inp = (const WORD32*)in.const_data_ptr<float>();
    WORD32* p_out = (WORD32*)out.mutable_data_ptr<float>();

    WORD32 num_inp_dims = in.dim();
    WORD32 num_out_dims = num_inp_dims;

    WORD32 p_inp_shape[5];
    WORD32 p_out_shape[5];
    WORD32 p_permute_vec[5];

    for (int i = 0; i < num_inp_dims; i++) {
      p_inp_shape[i] = in.size(i);
      p_out_shape[i] = in.size(dims[i]);
      p_permute_vec[i] = dims[i];
    }

    xa_nn_transpose_32_32(
        p_out,
        p_out_shape,
        p_inp,
        p_inp_shape,
        p_permute_vec,
        num_out_dims,
        num_inp_dims);
  } else if (in_type == ScalarType::Char) {
    const WORD8* p_inp = (const WORD8*)in.const_data_ptr<char>();
    WORD8* p_out = (WORD8*)out.mutable_data_ptr<char>();

    WORD32 num_inp_dims = in.dim();
    WORD32 num_out_dims = num_inp_dims;

    WORD32 p_inp_shape[5];
    WORD32 p_out_shape[5];
    WORD32 p_permute_vec[5];

    for (int i = 0; i < num_inp_dims; i++) {
      p_inp_shape[i] = in.size(i);
      p_out_shape[i] = in.size(dims[i]);
      p_permute_vec[i] = dims[i];
    }

    p_inp_shape[num_inp_dims] = 4;
    p_out_shape[num_inp_dims] = 4;

    xa_nn_transpose_8_8(
        p_out,
        p_out_shape,
        p_inp,
        p_inp_shape,
        p_permute_vec,
        num_out_dims,
        num_inp_dims);
  } else {
    // Generic fallback: in and out must be the same dtype.
    ET_SWITCH_ALL_TYPES(in_type, ctx, "permute_copy.out", CTYPE, [&] {
      const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
      CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();

      size_t in_coord[kTensorDimensionLimit] = {0};

      for (size_t i = 0; i < out.numel(); ++i) {
        out_data[i] = in_data[coordinateToIndex(in, in_coord)];
        increment_coordinate_permuted(in, in_coord, dims);
      }
    });
  }

return out;
}

} // namespace native
} // namespace executor
} // namespace torch
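The generic fallback above writes the output linearly while walking the input with a permuted coordinate. A self-contained sketch of that traversal on plain arrays follows; the 2x3 example and the helper names are illustrative, not from the source:

#include <cstdio>
#include <vector>

// Row-major offset of `coord` in a tensor with the given `sizes`
// (the role coordinateToIndex plays in the op above).
static size_t coordinate_to_index(const std::vector<size_t>& coord,
                                  const std::vector<size_t>& sizes) {
  size_t idx = 0;
  for (size_t d = 0; d < sizes.size(); d++)
    idx = idx * sizes[d] + coord[d];
  return idx;
}

int main() {
  // Permute a 2x3 input with dims = {1, 0}, i.e. a transpose.
  std::vector<size_t> in_sizes = {2, 3};
  std::vector<size_t> dims = {1, 0};
  int in_data[6] = {1, 2, 3, 4, 5, 6};
  int out_data[6];

  std::vector<size_t> coord(2, 0);
  for (size_t i = 0; i < 6; i++) {
    out_data[i] = in_data[coordinate_to_index(coord, in_sizes)];
    // Same carry logic as increment_coordinate_permuted: bump the input axis
    // that corresponds to the last output axis first, carrying on wrap.
    for (int j = (int)dims.size() - 1; j >= 0; j--) {
      size_t d = dims[j];
      if (++coord[d] < in_sizes[d])
        break;
      coord[d] = 0;
    }
  }
  for (int i = 0; i < 6; i++)
    printf("%d ", out_data[i]); // prints: 1 4 2 5 3 6
  return 0;
}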
68 changes: 68 additions & 0 deletions backends/cadence/hifi/operators/op_sigmoid.cpp
@@ -0,0 +1,68 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <cmath>

#include <executorch/kernels/portable/cpu/util/functional_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include "kernels.h"

namespace torch {
namespace executor {
namespace native {

using Tensor = exec_aten::Tensor;

Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
(void)ctx;

ET_KERNEL_CHECK(
ctx, in.scalar_type() != ScalarType::Bool, InvalidArgument, out);
ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out);

// Resize for dynamic shape
ET_KERNEL_CHECK_MSG(
ctx,
resize_tensor(out, in.sizes()) == Error::Ok,
InvalidArgument,
out,
"Failed to resize output tensor.");

ScalarType in_type = in.scalar_type();
ScalarType out_type = out.scalar_type();

  if (in_type == ScalarType::Float) {
    // Fast path: hand the whole tensor to the vectorized NNLib sigmoid.
    const float* data_in = in.const_data_ptr<float>();
    float* data_out = out.mutable_data_ptr<float>();
    xa_nn_vec_sigmoid_f32_f32(data_out, data_in, in.numel());
  } else {
    ET_SWITCH_REALHB_TYPES(in_type, ctx, "sigmoid.out", CTYPE_IN, [&]() {
      ET_SWITCH_FLOATH_TYPES(out_type, ctx, "sigmoid.out", CTYPE_OUT, [&]() {
        apply_unary_map_fn(
            [](const CTYPE_IN val_in) {
              // Perform the math in double to preserve precision.
              double in_casted = static_cast<double>(val_in);
              double out_val = 1.0 / (1.0 + exp(-in_casted));
              return static_cast<CTYPE_OUT>(out_val);
            },
            in.const_data_ptr<CTYPE_IN>(),
            out.mutable_data_ptr<CTYPE_OUT>(),
            in.numel());
      });
    });
  }

return out;
}

} // namespace native
} // namespace executor
} // namespace torch
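On the non-float path the sigmoid is evaluated in double before narrowing to the output type. A standalone sketch of that pattern follows; the function name and sample values are illustrative:

#include <cmath>
#include <cstdio>

// Evaluate in double so large-magnitude inputs keep precision, then narrow,
// matching the lambda in the fallback above.
template <typename CTYPE_IN, typename CTYPE_OUT>
CTYPE_OUT sigmoid_scalar(CTYPE_IN val_in) {
  double x = static_cast<double>(val_in);
  return static_cast<CTYPE_OUT>(1.0 / (1.0 + exp(-x)));
}

int main() {
  printf("%f\n", sigmoid_scalar<int, float>(0));      // 0.500000
  printf("%f\n", sigmoid_scalar<float, float>(4.0f)); // ~0.982014
  return 0;
}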