Adding pow operator kernel optimization
Rushi-cad committed Oct 22, 2024
1 parent 7c8f1e3 commit c3dd9aa
Showing 4 changed files with 1,167 additions and 20 deletions.
1 change: 1 addition & 0 deletions backends/cadence/hifi/kernels/CMakeLists.txt
@@ -14,6 +14,7 @@ add_library(
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
 )
5 changes: 5 additions & 0 deletions backends/cadence/hifi/kernels/kernels.h
@@ -82,6 +82,11 @@ extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict_
                                                const WORD32 *const p_inp2_shape,
                                                const unsigned char *__restrict__ p_condition,
                                                const WORD32 *const p_condition_shape);
+
+extern "C" void xa_nn_elm_pow_f32(FLOAT32 * restrict z,
+                                  const FLOAT32 * restrict x,
+                                  const FLOAT32 * restrict y,
+                                  WORD32 N );
 
 namespace impl {
 namespace HiFi {
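For reference, the contract of the newly declared entry point is elementwise power: z[i] = x[i]^y[i] over N contiguous floats (FLOAT32 is float, WORD32 a 32-bit integer). Below is a minimal plain-C++ stand-in; it is an assumption drawn from the signature and the call sites in op_pow.cpp, not the optimized HiFi NNLib implementation this commit adds in xa_nn_elm_pow_f32.c.

#include <cmath>

// Hypothetical reference stand-in, NOT the NNLib kernel: computes
// z[i] = pow(x[i], y[i]) for N contiguous floats, mirroring the
// declared signature.
static void xa_nn_elm_pow_f32_ref(
    float* z, const float* x, const float* y, int N) {
  for (int i = 0; i < N; i++) {
    z[i] = std::pow(x[i], y[i]);
  }
}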
30 changes: 10 additions & 20 deletions backends/cadence/hifi/operators/op_pow.cpp
@@ -125,26 +125,16 @@ Tensor& pow_Tensor_Tensor_out(
     WORD32* __restrict__ pin2 =
         (WORD32* __restrict__)b.const_data_ptr<float>();
 
-    WORD32 p_out_shape[max_dim];
-    WORD32 p_inp1_shape[max_dim];
-    WORD32 p_inp2_shape[max_dim];
-
-    for (int i = 0; i < max_dim; i++) {
-      p_inp1_shape[i] = 1;
-      p_inp2_shape[i] = 1;
-      p_out_shape[i] = 1;
-    }
-
-    int off_o = max_dim - out_dim;
-    int off_a = max_dim - a_dim;
-    int off_b = max_dim - b_dim;
+    WORD32 p_out_shape[kNnlibMaxDim];
+    WORD32 p_inp1_shape[kNnlibMaxDim];
+    WORD32 p_inp2_shape[kNnlibMaxDim];
 
     for (int i = 0; i < out_dim; i++)
-      p_out_shape[i + off_o] = out.size(i);
+      p_out_shape[i] = out.size(i);
     for (int i = 0; i < a_dim; i++)
-      p_inp1_shape[i + off_a] = a.size(i);
+      p_inp1_shape[i] = a.size(i);
     for (int i = 0; i < b_dim; i++)
-      p_inp2_shape[i + off_b] = b.size(i);
+      p_inp2_shape[i] = b.size(i);
 
     xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);
 
@@ -155,7 +145,7 @@ Tensor& pow_Tensor_Tensor_out(
     const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1;
     const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr2;
 
-    vecpowf(p_out, p_inp1, p_inp2, num_elm);
+    xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm);
 
     free(ptr1);
     free(ptr2);
@@ -191,7 +181,7 @@ Tensor& pow_Tensor_Tensor_out(
     const FLOAT32* __restrict__ p_inp2 =
         (const FLOAT32* __restrict__)b.const_data_ptr<float>();
 
-    vecpowf(p_out, p_inp1, p_inp2, num_elm);
+    xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm);
 
     free(ptr1);
   } else if (b_is_broadcasted && (!a_is_broadcasted)) {
@@ -225,7 +215,7 @@ Tensor& pow_Tensor_Tensor_out(
         (const FLOAT32* __restrict__)a.const_data_ptr<float>();
     const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr1;
 
-    vecpowf(p_out, p_inp1, p_inp2, num_elm);
+    xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm);
 
     free(ptr1);
   } else {
@@ -236,7 +226,7 @@ Tensor& pow_Tensor_Tensor_out(
     const FLOAT32* __restrict__ p_inp2 =
         (const FLOAT32* __restrict__)b.const_data_ptr<float>();
 
-    vecpowf(p_out, p_inp1, p_inp2, num_elm);
+    xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm);
   }
   return out;
 }
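Taken together, the float fast path now follows a broadcast-then-pow pattern: any input whose shape differs from the output is first expanded to the output shape with xa_nn_broadcast_32_32 (sizes packed into kNnlibMaxDim-sized WORD32 arrays starting at index 0), and the flat kernel then runs once over num_elm elements. The sketch below shows that sequence for a scalar exponent; it is a self-contained illustration with stand-in helpers, not the NNLib routines themselves.

#include <cmath>
#include <vector>

// Stand-in for the elementwise kernel: z[i] = pow(x[i], y[i]) over n floats.
static void elm_pow_f32_ref(float* z, const float* x, const float* y, int n) {
  for (int i = 0; i < n; i++)
    z[i] = std::pow(x[i], y[i]);
}

// Simplest possible broadcast: replicate one scalar across n elements. The
// real xa_nn_broadcast_32_32 expands arbitrary shapes up to kNnlibMaxDim dims.
static void broadcast_scalar(float* dst, float value, int n) {
  for (int i = 0; i < n; i++)
    dst[i] = value;
}

int main() {
  const int num_elm = 8;
  std::vector<float> base(num_elm, 2.0f);   // input already output-shaped
  std::vector<float> exp_buf(num_elm);      // scratch, like ptr1 in the diff
  std::vector<float> out(num_elm);

  broadcast_scalar(exp_buf.data(), 3.0f, num_elm); // exponent was a scalar
  elm_pow_f32_ref(out.data(), base.data(), exp_buf.data(), num_elm);
  // Every element of out is now 8.0f (2 raised to 3).
  return 0;
}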
1,151 changes: 1,151 additions & 0 deletions backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c
Large diffs are not rendered by default.
