Skip to content

Commit

Permalink
Adding atan2 operator kernel optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
Rushi-cad committed Oct 22, 2024
1 parent 81befff commit 491cad7
Show file tree
Hide file tree
Showing 4 changed files with 907 additions and 45 deletions.
1 change: 1 addition & 0 deletions backends/cadence/hifi/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ add_library(
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
Expand Down
5 changes: 5 additions & 0 deletions backends/cadence/hifi/kernels/kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__
const WORD32 *const p_inp1_shape,
const FLOAT32 * __restrict__ p_inp2,
const WORD32 *const p_inp2_shape);

// Elementwise two-argument arctangent over N float elements:
// z[i] = atan2(y[i], x[i]), using the signs of both inputs to select the
// quadrant. NOTE(review): presumably matches libm atan2f semantics for
// edge cases (x == 0, signed zeros, infinities) — confirm against the
// nnlib implementation in xa_nn_elm_atan2_f32.c.
//   z : output buffer, N floats (must not alias y or x — TODO confirm)
//   y : numerator input, N floats
//   x : denominator input, N floats
//   N : number of elements to process
extern "C" void xa_nn_elm_atan2_f32(FLOAT32 * z,
const FLOAT32 * y,
const FLOAT32 * x,
WORD32 N );

extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
const WORD32 *const p_out_shape,
Expand Down
64 changes: 19 additions & 45 deletions backends/cadence/hifi/operators/op_atan2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ Tensor& atan2_out(

WORD32 num_elm = out.numel();

if (!optimized) {
if (optimized) {
if (broadcast) {
WORD32* __restrict__ ptr1 =
(WORD32* __restrict__)malloc(num_elm * sizeof(WORD32));
Expand All @@ -70,26 +70,16 @@ Tensor& atan2_out(
WORD32* __restrict__ pin2 =
(WORD32* __restrict__)b.const_data_ptr<float>();

WORD32 p_out_shape[max_dim];
WORD32 p_inp1_shape[max_dim];
WORD32 p_inp2_shape[max_dim];

for (int i = 0; i < kNnlibMaxDim; i++) {
p_inp1_shape[i] = 1;
p_inp2_shape[i] = 1;
p_out_shape[i] = 1;
}

int off_o = max_dim - out_dim;
int off_a = max_dim - a_dim;
int off_b = max_dim - b_dim;
WORD32 p_out_shape[kNnlibMaxDim];
WORD32 p_inp1_shape[kNnlibMaxDim];
WORD32 p_inp2_shape[kNnlibMaxDim];

for (int i = 0; i < out_dim; i++)
p_out_shape[i + off_o] = out.size(i);
p_out_shape[i] = out.size(i);
for (int i = 0; i < a_dim; i++)
p_inp1_shape[i + off_a] = a.size(i);
p_inp1_shape[i] = a.size(i);
for (int i = 0; i < b_dim; i++)
p_inp2_shape[i + off_b] = b.size(i);
p_inp2_shape[i] = b.size(i);

xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);

Expand All @@ -100,7 +90,7 @@ Tensor& atan2_out(
const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1;
const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr2;

vecatan2f(p_out, p_inp1, p_inp2, num_elm);
xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm);

free(ptr1);
free(ptr2);
Expand All @@ -111,21 +101,13 @@ Tensor& atan2_out(
FLOAT32* __restrict__ pin1 =
(FLOAT32* __restrict__)a.const_data_ptr<float>();

WORD32 p_out_shape[max_dim];
WORD32 p_inp1_shape[max_dim];

for (int i = 0; i < max_dim; i++) {
p_inp1_shape[i] = 1;
p_out_shape[i] = 1;
}

int off_o = max_dim - out_dim;
int off_a = max_dim - a_dim;
WORD32 p_out_shape[kNnlibMaxDim];
WORD32 p_inp1_shape[kNnlibMaxDim];

for (int i = 0; i < out_dim; i++)
p_out_shape[i + off_o] = out.size(i);
p_out_shape[i] = out.size(i);
for (int i = 0; i < a_dim; i++)
p_inp1_shape[i + off_a] = a.size(i);
p_inp1_shape[i] = a.size(i);

xa_nn_broadcast_32_32(
(WORD32*)ptr1, p_out_shape, (WORD32*)pin1, p_inp1_shape, out_dim);
Expand All @@ -136,7 +118,7 @@ Tensor& atan2_out(
const FLOAT32* __restrict__ p_inp2 =
(const FLOAT32* __restrict__)b.const_data_ptr<float>();

vecatan2f(p_out, p_inp1, p_inp2, num_elm);
xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm);

free(ptr1);
} else if (b_is_broadcasted && (!a_is_broadcasted)) {
Expand All @@ -146,21 +128,13 @@ Tensor& atan2_out(
WORD32* __restrict__ pin1 =
(WORD32* __restrict__)b.const_data_ptr<float>();

WORD32 p_out_shape[max_dim];
WORD32 p_inp1_shape[max_dim];

for (int i = 0; i < max_dim; i++) {
p_inp1_shape[i] = 1;
p_out_shape[i] = 1;
}

int off_o = max_dim - out_dim;
int off_b = max_dim - b_dim;
WORD32 p_out_shape[kNnlibMaxDim];
WORD32 p_inp1_shape[kNnlibMaxDim];

for (int i = 0; i < out_dim; i++)
p_out_shape[i + off_o] = out.size(i);
p_out_shape[i] = out.size(i);
for (int i = 0; i < b_dim; i++)
p_inp1_shape[i + off_b] = b.size(i);
p_inp1_shape[i] = b.size(i);

xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);

Expand All @@ -170,7 +144,7 @@ Tensor& atan2_out(
(const FLOAT32* __restrict__)a.const_data_ptr<float>();
const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr1;

vecatan2f(p_out, p_inp1, p_inp2, num_elm);
xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm);

free(ptr1);
} else {
Expand All @@ -181,7 +155,7 @@ Tensor& atan2_out(
const FLOAT32* __restrict__ p_inp2 =
(const FLOAT32* __restrict__)b.const_data_ptr<float>();

vecatan2f(p_out, p_inp1, p_inp2, num_elm);
xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm);
}
return out;
}
Expand Down
Loading

0 comments on commit 491cad7

Please sign in to comment.