Adding pow operator kernel optimization
Rushi-cad committed Oct 22, 2024
1 parent 7c8f1e3 commit c3dd9aa
Showing 4 changed files with 1,167 additions and 20 deletions.
1 change: 1 addition & 0 deletions backends/cadence/hifi/kernels/CMakeLists.txt
@@ -14,6 +14,7 @@ add_library(
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
 )
5 changes: 5 additions & 0 deletions backends/cadence/hifi/kernels/kernels.h
@@ -82,6 +82,11 @@ extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict_
                                                const WORD32 *const p_inp2_shape,
                                                const unsigned char *__restrict__ p_condition,
                                                const WORD32 *const p_condition_shape);
+
+extern "C" void xa_nn_elm_pow_f32(FLOAT32 * restrict z,
+                                  const FLOAT32 * restrict x,
+                                  const FLOAT32 * restrict y,
+                                  WORD32 N );
 
 namespace impl {
 namespace HiFi {
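For reference, the contract of the newly declared entry point is elementwise power: z[i] = x[i]^y[i] over N contiguous floats (FLOAT32 is float, WORD32 a 32-bit integer). Below is a minimal plain-C++ stand-in; it is an assumption drawn from the signature and the call sites in op_pow.cpp, not the optimized HiFi NNLib implementation this commit adds in xa_nn_elm_pow_f32.c.

#include <cmath>

// Hypothetical reference stand-in, NOT the NNLib kernel: computes
// z[i] = pow(x[i], y[i]) for N contiguous floats, mirroring the
// declared signature.
static void xa_nn_elm_pow_f32_ref(
    float* z, const float* x, const float* y, int N) {
  for (int i = 0; i < N; i++) {
    z[i] = std::pow(x[i], y[i]);
  }
}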
30 changes: 10 additions & 20 deletions backends/cadence/hifi/operators/op_pow.cpp
@@ -125,26 +125,16 @@ Tensor& pow_Tensor_Tensor_out(
     WORD32* __restrict__ pin2 =
         (WORD32* __restrict__)b.const_data_ptr<float>();
 
-    WORD32 p_out_shape[max_dim];
-    WORD32 p_inp1_shape[max_dim];
-    WORD32 p_inp2_shape[max_dim];
-
-    for (int i = 0; i < max_dim; i++) {
-      p_inp1_shape[i] = 1;
-      p_inp2_shape[i] = 1;
-      p_out_shape[i] = 1;
-    }
-
-    int off_o = max_dim - out_dim;
-    int off_a = max_dim - a_dim;
-    int off_b = max_dim - b_dim;
+    WORD32 p_out_shape[kNnlibMaxDim];
+    WORD32 p_inp1_shape[kNnlibMaxDim];
+    WORD32 p_inp2_shape[kNnlibMaxDim];
 
     for (int i = 0; i < out_dim; i++)
-      p_out_shape[i + off_o] = out.size(i);
+      p_out_shape[i] = out.size(i);
     for (int i = 0; i < a_dim; i++)
-      p_inp1_shape[i + off_a] = a.size(i);
+      p_inp1_shape[i] = a.size(i);
     for (int i = 0; i < b_dim; i++)
-      p_inp2_shape[i + off_b] = b.size(i);
+      p_inp2_shape[i] = b.size(i);
 
     xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);
 
@@ -155,7 +145,7 @@ Tensor& pow_Tensor_Tensor_out(
     const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1;
     const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr2;
 
-    vecpowf(p_out, p_inp1, p_inp2, num_elm);
+    xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm);
 
     free(ptr1);
     free(ptr2);
@@ -191,7 +181,7 @@ Tensor& pow_Tensor_Tensor_out(
     const FLOAT32* __restrict__ p_inp2 =
         (const FLOAT32* __restrict__)b.const_data_ptr<float>();
 
-    vecpowf(p_out, p_inp1, p_inp2, num_elm);
+    xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm);
 
     free(ptr1);
   } else if (b_is_broadcasted && (!a_is_broadcasted)) {
@@ -225,7 +215,7 @@ Tensor& pow_Tensor_Tensor_out(
         (const FLOAT32* __restrict__)a.const_data_ptr<float>();
     const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr1;
 
-    vecpowf(p_out, p_inp1, p_inp2, num_elm);
+    xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm);
 
     free(ptr1);
   } else {
@@ -236,7 +226,7 @@ Tensor& pow_Tensor_Tensor_out(
     const FLOAT32* __restrict__ p_inp2 =
         (const FLOAT32* __restrict__)b.const_data_ptr<float>();
 
-    vecpowf(p_out, p_inp1, p_inp2, num_elm);
+    xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm);
   }
   return out;
 }
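Taken together, the float fast path now follows a broadcast-then-pow pattern: any input whose shape differs from the output is first expanded to the output shape with xa_nn_broadcast_32_32 (sizes packed into kNnlibMaxDim-sized WORD32 arrays starting at index 0), and the flat kernel then runs once over num_elm elements. The sketch below shows that sequence for a scalar exponent; it is a self-contained illustration with stand-in helpers, not the NNLib routines themselves.

#include <cmath>
#include <vector>

// Stand-in for the elementwise kernel: z[i] = pow(x[i], y[i]) over n floats.
static void elm_pow_f32_ref(float* z, const float* x, const float* y, int n) {
  for (int i = 0; i < n; i++)
    z[i] = std::pow(x[i], y[i]);
}

// Simplest possible broadcast: replicate one scalar across n elements. The
// real xa_nn_broadcast_32_32 expands arbitrary shapes up to kNnlibMaxDim dims.
static void broadcast_scalar(float* dst, float value, int n) {
  for (int i = 0; i < n; i++)
    dst[i] = value;
}

int main() {
  const int num_elm = 8;
  std::vector<float> base(num_elm, 2.0f);   // input already output-shaped
  std::vector<float> exp_buf(num_elm);      // scratch, like ptr1 in the diff
  std::vector<float> out(num_elm);

  broadcast_scalar(exp_buf.data(), 3.0f, num_elm); // exponent was a scalar
  elm_pow_f32_ref(out.data(), base.data(), exp_buf.data(), num_elm);
  // Every element of out is now 8.0f (2 raised to 3).
  return 0;
}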
1,151 changes: 1,151 additions & 0 deletions backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c
Large diffs are not rendered by default.
