From 71a6da94076f428292add9f8f18836be924078a3 Mon Sep 17 00:00:00 2001 From: Rushi-cad Date: Wed, 23 Oct 2024 13:52:12 +0530 Subject: [PATCH] Adding atan2 operator kernel optimization (#26) * Adding atan2 operator kernel optimization * Adding atan2 operator kernel optimization --------- Co-authored-by: dijopaul <87994875+dijopaul@users.noreply.github.com> --- backends/cadence/aot/functions_hifi.yaml | 5 + backends/cadence/hifi/kernels/CMakeLists.txt | 3 +- backends/cadence/hifi/kernels/kernels.h | 6 + .../cadence/hifi/operators/CMakeLists.txt | 1 + backends/cadence/hifi/operators/op_atan2.cpp | 185 ++++ .../third-party/nnlib/xa_nn_broadcast_32.c | 313 +++++++ .../third-party/nnlib/xa_nn_elm_atan2_f32.c | 882 ++++++++++++++++++ 7 files changed, 1394 insertions(+), 1 deletion(-) create mode 100644 backends/cadence/hifi/operators/op_atan2.cpp create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 47006430a3..d46550094f 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -26,6 +26,11 @@ kernels: - arg_meta: null kernel_name: impl::HiFi::add_out + +- op: atan2.out + kernels: + - arg_meta: null + kernel_name: impl::HiFi::atan2_out - op: bmm.out kernels: diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 3811380559..27789135a7 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -9,8 +9,9 @@ add_library( cadence_kernels kernels.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index d728545f25..ad1da5c98e 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -16,6 +16,7 @@ #include "xa_nnlib_kernels_api.h" /* Potential NNLIB function/APIs */ + extern "C" WORD32 xa_nn_broadcast_32_32( WORD32* __restrict__ p_out, const int* const out_shape, @@ -31,6 +32,11 @@ extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( const FLOAT32* __restrict__ p_inp2, const WORD32* const p_inp2_shape); +extern "C" void xa_nn_elm_atan2_f32(FLOAT32 * z, + const FLOAT32 * y, + const FLOAT32 * x, + WORD32 N ); + extern "C" WORD32 xa_nn_elm_clamp_f32xf32xf32_f32( FLOAT32* __restrict__ p_out, const FLOAT32* __restrict__ p_inp, diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index da29ae4a0b..7c37239153 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -21,6 +21,7 @@ endif() # ATen compliant ops that are needed to run this model. 
 set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
+    "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_atan2.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bmm.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_clamp.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
diff --git a/backends/cadence/hifi/operators/op_atan2.cpp b/backends/cadence/hifi/operators/op_atan2.cpp
new file mode 100644
index 0000000000..5d6f7c360a
--- /dev/null
+++ b/backends/cadence/hifi/operators/op_atan2.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cmath>
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
+#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+using exec_aten::ScalarType;
+using exec_aten::Tensor;
+using executorch::runtime::KernelRuntimeContext;
+using torch::executor::Error;
+
+namespace impl {
+namespace HiFi {
+namespace native {
+
+Tensor& atan2_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    const Tensor& b,
+    Tensor& out) {
+  // Determine output size and resize for dynamic shapes.
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_to_broadcast_target_size(a, b, out) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out);
+
+  ScalarType a_type = a.scalar_type();
+  ScalarType b_type = b.scalar_type();
+  ScalarType out_type = out.scalar_type();
+
+  constexpr auto name = "atan2.out";
+  constexpr int kNnlibMaxDim = 16;
+  int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
+  bool optimized = true;
+
+  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
+  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
+  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
+  int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
+  max_dim = out.dim() > max_dim ? out.dim() : max_dim;
+
+  if (out_type != ScalarType::Float)
+    optimized = false;
+
+  if (broadcast && (max_dim > kNnlibMaxDim))
+    optimized = false;
+
+  WORD32 num_elm = out.numel();
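+
+  // Fast path: for float tensors whose rank fits within kNnlibMaxDim, any
+  // operand that needs broadcasting is first materialized at the output
+  // shape with xa_nn_broadcast_32_32 (32-bit element copies, one float per
+  // element), and the flat xa_nn_elm_atan2_f32 kernel then runs over all
+  // num_elm output elements. For example, atan2 between a {5, 1, 4} tensor
+  // and a {5, 3, 1} tensor expands both operands to the {5, 3, 4} output
+  // shape before the single kernel call. All other dtypes fall through to
+  // the portable element-wise loop further below.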
+  if (optimized) {
+    if (a_is_broadcasted && b_is_broadcasted) {
+      WORD32* __restrict__ ptr1 =
+          (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32));
+      WORD32* __restrict__ ptr2 =
+          (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32));
+
+      WORD32* __restrict__ pin1 =
+          (WORD32* __restrict__)a.const_data_ptr<float>();
+      WORD32* __restrict__ pin2 =
+          (WORD32* __restrict__)b.const_data_ptr<float>();
+
+      WORD32 p_out_shape[kNnlibMaxDim];
+      WORD32 p_inp1_shape[kNnlibMaxDim];
+      WORD32 p_inp2_shape[kNnlibMaxDim];
+
+      for (int i = 0; i < out_dim; i++)
+        p_out_shape[i] = out.size(i);
+      for (int i = 0; i < a_dim; i++)
+        p_inp1_shape[i] = a.size(i);
+      for (int i = 0; i < b_dim; i++)
+        p_inp2_shape[i] = b.size(i);
+
+      xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);
+
+      xa_nn_broadcast_32_32(ptr2, p_out_shape, pin2, p_inp2_shape, out_dim);
+
+      FLOAT32* __restrict__ p_out =
+          (FLOAT32* __restrict__)out.mutable_data_ptr<float>();
+      const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1;
+      const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr2;
+
+      xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm);
+
+      free(ptr1);
+      free(ptr2);
+    } else if (a_is_broadcasted && (!b_is_broadcasted)) {
+      FLOAT32* __restrict__ ptr1 =
+          (FLOAT32* __restrict__)malloc(num_elm * sizeof(WORD32));
+
+      FLOAT32* __restrict__ pin1 =
+          (FLOAT32* __restrict__)a.const_data_ptr<float>();
+
+      WORD32 p_out_shape[kNnlibMaxDim];
+      WORD32 p_inp1_shape[kNnlibMaxDim];
+
+      for (int i = 0; i < out_dim; i++)
+        p_out_shape[i] = out.size(i);
+      for (int i = 0; i < a_dim; i++)
+        p_inp1_shape[i] = a.size(i);
+
+      xa_nn_broadcast_32_32(
+          (WORD32*)ptr1, p_out_shape, (WORD32*)pin1, p_inp1_shape, out_dim);
+
+      FLOAT32* __restrict__ p_out =
+          (FLOAT32* __restrict__)out.mutable_data_ptr<float>();
+      const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1;
+      const FLOAT32* __restrict__ p_inp2 =
+          (const FLOAT32* __restrict__)b.const_data_ptr<float>();
+
+      xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm);
+
+      free(ptr1);
+    } else if (b_is_broadcasted && (!a_is_broadcasted)) {
+      WORD32* __restrict__ ptr1 =
+          (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32));
+
+      WORD32* __restrict__ pin1 =
+          (WORD32* __restrict__)b.const_data_ptr<float>();
+
+      WORD32 p_out_shape[kNnlibMaxDim];
+      WORD32 p_inp1_shape[kNnlibMaxDim];
+
+      for (int i = 0; i < out_dim; i++)
+        p_out_shape[i] = out.size(i);
+      for (int i = 0; i < b_dim; i++)
+        p_inp1_shape[i] = b.size(i);
+
+      xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);
+
+      FLOAT32* __restrict__ p_out =
+          (FLOAT32* __restrict__)out.mutable_data_ptr<float>();
+      const FLOAT32* __restrict__ p_inp1 =
+          (const FLOAT32* __restrict__)a.const_data_ptr<float>();
+      const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr1;
+
+      xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm);
+
+      free(ptr1);
+    } else {
+      FLOAT32* __restrict__ p_out =
+          (FLOAT32* __restrict__)out.mutable_data_ptr<float>();
+      const FLOAT32* __restrict__ p_inp1 =
+          (const FLOAT32* __restrict__)a.const_data_ptr<float>();
+      const FLOAT32* __restrict__ p_inp2 =
+          (const FLOAT32* __restrict__)b.const_data_ptr<float>();
+
+      xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm);
+    }
+    return out;
+  }
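+
+  // Portable fallback: promote each pair of input elements to the output
+  // dtype and evaluate std::atan2 element by element;
+  // apply_binary_elementwise_fn handles the index math for broadcasting.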
+  ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
+    ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
+      ET_SWITCH_FLOATH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
+        torch::executor::
+            apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+                [](const CTYPE_A val_a, const CTYPE_B val_b) {
+                  CTYPE_OUT casted_a = static_cast<CTYPE_OUT>(val_a);
+                  CTYPE_OUT casted_b = static_cast<CTYPE_OUT>(val_b);
+                  return static_cast<CTYPE_OUT>(
+                      std::atan2(casted_a, casted_b));
+                },
+                a,
+                b,
+                out);
+      });
+    });
+  });
+
+  return out;
+}
+
+} // namespace native
+} // namespace HiFi
+} // namespace impl
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
new file mode 100644
index 0000000000..cad3f1a25b
--- /dev/null
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
@@ -0,0 +1,313 @@
+/*******************************************************************************
+* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files (the
+* "Software"), to use this Software with Cadence processor cores only and
+* not with any other processors and platforms, subject to
+* the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+******************************************************************************/
+/*
+ * xa_nn_broadcast_32.c
+ */
+
+#include "xa_nnlib_common.h"
+//#include "xa_nn_basic_state.h"
+
+#include <string.h>
+#include <stddef.h>
+
+#include "stdio.h"
+
+/*
+ * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c
+ */
+
+#define NUMDIMS_MAX 8
+
+typedef struct bcast_expansion_struct_{
+    size_t load_num_elem;
+    int    replicate_loadedElm_times;
+    int    repeat_operation;
+} bcast_expansion_rule ;
+
+WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id,
+        WORD32 *dst, WORD32 *src);
+
+void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1)
+{
+    char *dest = (char *)dest1;
+    char *src = (char *)src1;
+    int n = (int)n1;
+    ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr;
+    int i;
+    void *orig_dest = dest;
+
+    if (n < 32) {
+        return memcpy(dest, src, n);
+    }
+
+    if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned
+        s_align_addr = (ae_int16x4 *) src;
+        d_align_addr = (ae_int16x4 *) dest;
+        for (i=0; i<n>>3; i++) {
+            d_align_addr[i] = s_align_addr[i];
+        }
+
+        for (i=(n&~7); i<n; i++) {
+            dest[i] = src[i];
+        }
+        return orig_dest;
+    }
+
+    // Unaligned case: move 16 bytes per iteration through the alignment
+    // registers, then mop up the remainder element-wise.
+    s_align_addr = (ae_int16x4 *) src;
+    d_align_addr = (ae_int16x4 *) dest;
+    ae_valign s_align = AE_LA64_PP(s_align_addr);
+    ae_valign d_align = AE_ZALIGN64();
+    ae_int16x4 t, t2;
+    for (i=0; i<n>>4; i++) {
+        AE_LA16X4_IP(t, s_align, s_align_addr);
+        AE_LA16X4_IP(t2, s_align, s_align_addr);
+        AE_SA16X4_IP(t, d_align, d_align_addr);
+        AE_SA16X4_IP(t2, d_align, d_align_addr);
+    }
+    AE_SA64POS_FP(d_align, d_align_addr);
+    ae_int16 *s_src = (ae_int16 *) src;
+    ae_int16 *s_dest = (ae_int16 *) dest;
+    for (i=8*i; i<(n>>1); i++) {
+        s_dest[i] = s_src[i];
+    }
+    if (n&1) {
+        dest[n-1] = src[n-1];
+    }
+    return orig_dest;
+}
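+
+/*
+ * xa_nn_memcpy is only called from broadcast_node_32 below, always with a
+ * byte count of 4 * numLoadedElm, i.e. a whole number of WORD32 elements.
+ * Small copies (n < 32) fall back to the libc memcpy; larger copies use
+ * 64-bit element moves when both pointers are 8-byte aligned and the
+ * alignment-register (AE_LA/AE_SA) path otherwise.
+ */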
+WORD32 xa_nn_broadcast_32_32(
+            WORD32* __restrict__ p_out,
+            const int *const out_shape,
+            WORD32* __restrict__ p_in,
+            const int *const in_shape,
+            int num_dims)
+{
+    /* NULL pointer checks */
+    XA_NNLIB_ARG_CHK_PTR(p_out, -1);
+    XA_NNLIB_ARG_CHK_PTR(p_in, -1);
+    XA_NNLIB_ARG_CHK_PTR(out_shape, -1);
+    XA_NNLIB_ARG_CHK_PTR(in_shape, -1);
+
+    /* Check if number of dims is valid */
+    XA_NNLIB_ARG_CHK_COND(num_dims<=0 || num_dims>8, -1);
+
+    int i = 0;
+
+    /* Check for valid IO shapes: each input dim must be positive and
+       either match the output dim or be 1 (broadcastable) */
+    for(i=0; i<num_dims; i++){
+        XA_NNLIB_ARG_CHK_COND(in_shape[i]<=0, -1);
+        XA_NNLIB_ARG_CHK_COND(out_shape[i]<=0, -1);
+        XA_NNLIB_ARG_CHK_COND(
+            in_shape[i]!=out_shape[i] && in_shape[i]!=1, -1);
+    }
+
+    /* Derive the expansion steps, walking dims from innermost to outermost */
+    bcast_expansion_rule bcast_expansion_steps[NUMDIMS_MAX] = {{0}};
+    int num_elem_load = 1;
+    int num_copy_times = 1;
+    int num_repeat = 1;
+    int dim = num_dims-1;
+    int k = 0;
+    WORD32 *res = NULL;
+
+    while(dim>=0){
+
+        /* Find the sub-matrix size */
+        while(dim>=0 && in_shape[dim] != 1){
+            num_elem_load *= out_shape[dim];
+            dim--;
+        }
+
+        /* Find the number of times this sub-matrix needs to be copied */
+        num_copy_times = 1;
+        while(dim>=0 && in_shape[dim] == 1){
+            num_copy_times *= out_shape[dim];
+            dim--;
+        }
+
+        /* Find the number of times the above copy needs to be repeated */
+        num_repeat = 1;
+        while(dim>=0 && in_shape[dim] != 1){
+            num_repeat *= out_shape[dim];
+            dim--;
+        }
+
+        bcast_expansion_steps[k].load_num_elem = num_elem_load;
+        bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times;
+        bcast_expansion_steps[k].repeat_operation = num_repeat;
+        k++;
+
+        num_elem_load = num_elem_load * num_copy_times * num_repeat;
+    }
+
+    res = broadcast_node_32(bcast_expansion_steps, num_dims-1,
+            p_out, p_in);
+    (void)res; /* Unused return value */
+
+    return 0;
+}
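+
+/*
+ * broadcast_node_32 expands one step of the expansion recipe, recursing
+ * from the outermost step down to step 0. Each step loads a contiguous
+ * block of load_num_elem elements, replicates it
+ * replicate_loadedElm_times, and repeats that whole operation
+ * repeat_operation times, advancing the read pointer between repetitions.
+ *
+ * Example: broadcasting in_shape {1, 3, 1, 4} to out_shape {2, 3, 5, 4}
+ * first replicates each 4-element row 5 times, repeats that for each of
+ * the 3 rows, and finally replicates the resulting 60-element block twice.
+ */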
+WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id,
+        WORD32 *dst, WORD32 *src) {
+    int step_itr=0, rep_itr=0;
+    int i=0, j=0, k=0;
+    bcast_expansion_rule *step = NULL;
+
+    // ignore steps that are null
+    while(steps[step_id].repeat_operation == 0 && step_id>0){
+        step_id--;
+    }
+
+    // step is now the parent node for this iteration
+    step = &steps[step_id];
+    size_t numLoadedElm = step->load_num_elem;
+
+    WORD32 *cp_dst = dst;
+    WORD32 *cp_src = src;
+    WORD32 *cp_src_temp=NULL;
+    WORD32 *cp_dst_temp=NULL;
+
+    if(numLoadedElm>32){
+        if(step_id > 0){
+            for(step_itr=0; step_itr<step->repeat_operation; step_itr++){
+                src = broadcast_node_32(steps, step_id-1, dst, src);
+                cp_src = dst;
+                cp_dst = dst + numLoadedElm;
+                for(rep_itr=1; rep_itr<step->replicate_loadedElm_times; rep_itr++){
+                    xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm);
+                    cp_dst += numLoadedElm;
+                }
+                dst = cp_dst;
+            }
+            return src;
+        } else {
+            if(numLoadedElm == 1){
+                for(j=0; j<step->repeat_operation; j++){
+//                  memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times);
+                    for(i = 0; i < step->replicate_loadedElm_times; i++)
+                        cp_dst[i] = cp_src[0];
+                    cp_dst += step->replicate_loadedElm_times;
+                    cp_src++;
+                }
+            } else {
+                for(j=0; j<step->repeat_operation; j++){
+                    for(i=0; i<step->replicate_loadedElm_times; i++){
+                        xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm);
+                        cp_dst += numLoadedElm;
+                    }
+                    cp_src += numLoadedElm;
+                }
+            }
+            return cp_src;
+        }
+    }
+    else{
+        if(step_id > 0){
+            for(step_itr=0; step_itr<step->repeat_operation; step_itr++){
+                src = broadcast_node_32(steps, step_id-1, dst, src);
+                cp_src = dst;
+                cp_dst = dst + numLoadedElm;
+                for(rep_itr=1; rep_itr<step->replicate_loadedElm_times; rep_itr++){
+                    for(k=0; k<(int)numLoadedElm; k++){
+                        cp_src_temp = cp_src;
+                        cp_dst_temp = cp_dst;
+                        cp_dst_temp[k] = cp_src_temp[k];
+                    }
+                    cp_dst += numLoadedElm;
+                }
+                dst = cp_dst;
+            }
+            return src;
+        } else {
+            if(numLoadedElm == 1){
+                for(j=0; j<step->repeat_operation; j++){
+//                  memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times);
+                    for(i = 0; i < step->replicate_loadedElm_times; i++)
+                        cp_dst[i] = cp_src[0];
+                    cp_dst += step->replicate_loadedElm_times;
+                    cp_src++;
+                }
+            } else {
+                for(j=0; j < step->repeat_operation; j++){
+                    for(i=0; i < step->replicate_loadedElm_times; i++){
+                        for(k=0; k<(int)(numLoadedElm); k++){
+                            cp_src_temp = cp_src;
+                            cp_dst_temp = cp_dst;
+                            cp_dst_temp[k] = cp_src_temp[k];
+                        }
+                        cp_dst += numLoadedElm;
+                    }
+                    cp_src += numLoadedElm;
+                }
+            }
+            return cp_src;
+        }
+    }
+}
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c
new file mode 100644
index 0000000000..6f95360ed9
--- /dev/null
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c
@@ -0,0 +1,882 @@
+/* ------------------------------------------------------------------------ */
+/* Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.   */
+/* These coded instructions, statements, and computer programs ("Cadence     */
+/* Libraries") are the copyrighted works of Cadence Design Systems Inc.      */
+/* Cadence IP is licensed for use with Cadence processor cores only and      */
+/* must not be used for any other processors and platforms. Your use of the  */
+/* Cadence Libraries is subject to the terms of the license agreement you    */
+/* have entered into with Cadence Design Systems, or a sublicense granted    */
+/* to you by a direct Cadence licensee.                                      */
+/* ------------------------------------------------------------------------ */
+/*  IntegrIT, Ltd.   www.integrIT.com, info@integrIT.com                     */
+/*                                                                           */
+/* DSP Library                                                               */
+/*                                                                           */
+/* This library contains copyrighted materials, trade secrets and other      */
+/* proprietary information of IntegrIT, Ltd. This software is licensed for   */
+/* use with Cadence processor cores only and must not be used for any other  */
+/* processors and platforms. The license to use these sources was given to   */
+/* Cadence, Inc. under Terms and Condition of a Software License Agreement   */
+/* between Cadence, Inc. and IntegrIT, Ltd.                                  */
+/* ------------------------------------------------------------------------ */
+/* Copyright (C) 2015-2018 IntegrIT, Limited.                                */
+/* All Rights Reserved.                                                      */
+/* ------------------------------------------------------------------------ */
+#include <float.h>
+
+#include "../include/NatureDSP_Signal_math.h"
+#include "NatureDSP_types.h"
+#include "xa_nn_common.h"
+
+/* Common helper macros. */
+#include "xa_nnlib_common_fpu.h"
+
+#include "xa_nnlib_common.h"
+
+const union ufloat32uint32 xa_nnlib_plusInff ={0x7f800000};
+const union ufloat32uint32 xa_nnlib_qNaNf = { 0x7fc00000 };
+const union ufloat32uint32 pif ={0x40490fdb}; /* pi */
+const union ufloat32uint32 pi2f={0x3fc90fdb}; /* pi/2 */
+
+const union ufloat32uint32 ALIGN(8) xa_nnlib_atanftbl1[8] =
+{
+  {0x3dbc14c0},/* 9.183645248413086e-002 */
+  {0xbe30c39c},/*-1.726211905479431e-001 */
+  {0x3b2791e4},/* 2.556913532316685e-003 */
+  {0x3e4dac9d},/* 2.008537799119949e-001 */
+  {0xb97d9a57},/*-2.418545627733693e-004 */
+  {0xbeaaa7b5},/*-3.333107531070709e-001 */
+  {0xb54f34c8},/*-7.719031600572635e-007 */
+  {0x31cf3fa2} /* 6.031727117772334e-009 */
+};
+
+const union ufloat32uint32 ALIGN(8) xa_nnlib_atanftbl2[8]=
+{
+  {0xbcccc037},/*-2.499399892985821e-002 */
+  {0x3e217c35},/* 1.577003747224808e-001 */
+  {0xbecf4163},/*-4.047957360744476e-001 */
+  {0x3ef7b762},/* 4.838209748268127e-001 */
+  {0xbdf35059},/*-1.188055947422981e-001 */
+  {0xbe9b8b75},/*-3.037983477115631e-001 */
+  {0xbb80ed5c},/*-3.934545442461968e-003 */
+  {0x3956fc52} /* 2.050262701231986e-004 */
+};
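+
+/*
+ * Both tables hold degree-7 polynomial coefficients, highest order first,
+ * approximating atan(x)/x - 1: xa_nnlib_atanftbl1 for x below 0.5 and
+ * xa_nnlib_atanftbl2 for x in [0.5, 1]. They are evaluated in Horner form:
+ *
+ *   y = p[0].f;
+ *   for (i = 1; i < 8; i++) y = x*y + p[i].f;
+ *   atan(x) ~= x*y + x;   // undo the atan(x)/x - 1 scaling
+ */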
+#if !HAVE_VFPU && !HAVE_FPU
+DISCARD_FUN(void, xa_nn_elm_atan2_f32,( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, int N ))
+#elif HAVE_VFPU
+#define sz_f32    (int)sizeof(FLOAT32)
+
+/*===========================================================================
+  Vector mathematics:
+  vec_atan2          full quadrant Arctangent
+===========================================================================*/
+
+/*-------------------------------------------------------------------------
+  Full-Quadrant Arc Tangent
+  The functions compute the arc tangent of the ratios y[N]/x[N] and store the
+  result to output vector z[N]. Floating point functions' output is in
+  radians; fixed point functions scale their output by pi.
+
+  NOTE:
+  1. Scalar floating point function is compatible with standard ANSI C
+     routines and sets errno and exception flags accordingly.
+  2. Scalar floating point function assigns EDOM to errno whenever y==0
+     and x==0.
+
+  Accuracy:
+  24 bit version: 768 (3.57e-7)
+  floating point: 2 ULP
+
+  Special cases:
+       y    |    x   |  result   | extra conditions
+    --------|--------|-----------|---------------------
+     +/-0   |   -0   |  +/-pi    |
+     +/-0   |   +0   |  +/-0     |
+     +/-0   |    x   |  +/-pi    | x<0
+     +/-0   |    x   |  +/-0     | x>0
+      y     |  +/-0  |  -pi/2    | y<0
+      y     |  +/-0  |   pi/2    | y>0
+     +/-y   |  -inf  |  +/-pi    | finite y>0
+     +/-y   |  +inf  |  +/-0     | finite y>0
+     +/-inf |    x   |  +/-pi/2  | finite x
+     +/-inf |  -inf  |  +/-3*pi/4|
+     +/-inf |  +inf  |  +/-pi/4  |
+
+  Input:
+    y[N]   vector of numerator values, Q31 or floating point
+    x[N]   vector of denominator values, Q31 or floating point
+    N      length of vectors
+  Output:
+    z[N]   results, Q31 or floating point
+
+---------------------------------------------------------------------------*/
+
+void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WORD32 N )
+{
+  /*
+  const union ufloat32uint32* p;
+  int sx,sy,big;
+  sx=takesignf(x);
+  sy=takesignf(y);
+  x=fabs(x);
+  y=fabs(y);
+  if(x==0.f && y==0.f)
+  {
+    // The actual result depends on input signs.
+    x = 1.f;
+    y = 0.f;
+  }
+
+  big=x>y;
+  if(big)
+  {
+    x=y/x;
+  }
+  else
+  {
+    // compare x==y is necessary to support (+/-Inf, +/-Inf) cases
+    x = (x == y) ? 1.0f : x / y;
+  }
+  p = (x<0.5f) ? atanftbl1 : atanftbl2;
+  // approximate atan(x)/x-1
+  y = p[0].f;
+  y = x*y + p[1].f;
+  y = x*y + p[2].f;
+  y = x*y + p[3].f;
+  y = x*y + p[4].f;
+  y = x*y + p[5].f;
+  y = x*y + p[6].f;
+  y = x*y + p[7].f;
+  // convert result to true atan(x)
+  y = x*y + x;
+
+  if (!big) y = pi2f.f - y;
+  if (sx) y = pif.f - y;
+  if (sy) y = -y;
+  return y;
+  */
+
+  const xtfloatx2 *          X;
+  const xtfloatx2 *          Y;
+        xtfloatx2 * restrict Z;
+  const xtfloatx2 *          S_rd;
+        xtfloatx2 * restrict S_wr;
+
+  ae_valign X_va, Y_va, Z_va;
+
+  /* Current block index; overall number of blocks; number of values in the current block */
+  int blkIx, blkNum, blkLen;
+  /* Block size, blkLen <= blkSize */
+  const int blkSize = MAX_ALLOCA_SZ/sz_f32;
+  /* Allocate a fixed-size scratch area on the stack. */
+  FLOAT32 ALIGN(8) scr[blkSize];
+
+  int n;
+
+  if ( N<=0 ) return;
+
+  NASSERT_ALIGN8( scr );
+
+  /*
+   * Data are processed in blocks of scratch area size. Further, the algorithm
+   * implementation is split in order to feed the optimizing compiler with a
+   * few loops of manageable size.
+   */
+
+  blkNum = ( N + blkSize-1 )/blkSize;
+
+  for ( blkIx=0; blkIx<blkNum; blkIx++ )
+  {
+    blkLen = XT_MIN( N - blkIx*blkSize, blkSize );
+
+    /*
+     * Part I, reduction to the first octant. Reference C code:
+     *
+     *   {
+     *     float32_t x0, y0, p0;
+     *
+     *     for ( n=0; n<blkLen; n++ )
+     *     {
+     *       x0 = fabsf( x[blkIx*blkSize+n] );
+     *       y0 = fabsf( y[blkIx*blkSize+n] );
+     *
+     *       // The actual result depends on the input signs.
+     *       if ( x0==0.f && y0==0.f ) { x0 = 1.f; y0 = 0.f; };
+     *
+     *       if ( x0>y0 ) p0 = y0/x0;
+     *       // Special case of x==y is necessary to support (+/-Inf, +/-Inf) cases.
+     *       else p0 = ( x0==y0 ? 1.f : x0/y0 );
+     *
+     *       scr[n] = p0;
+     *     }
+     *   }
+     */
+
+    {
+      /* Input values */
+      xtfloatx2 x0, y0;
+      /* Numerator; denominator; reciprocal; quotient */
+      xtfloatx2 num, den, rcp, quo;
+      /* Scaling factor; error term */
+      xtfloatx2 scl, eps;
+      /* Is NaN; Inf/Inf; x/Inf; 0/0; x and y are subnormal */
+      xtbool2 b_nan, b_num_inf, b_den_inf, b_eqz, b_subn;
+
+      X = (xtfloatx2*)( (uintptr_t)x + blkIx*blkSize*sz_f32 );
+      Y = (xtfloatx2*)( (uintptr_t)y + blkIx*blkSize*sz_f32 );
+      S_wr = (xtfloatx2*)scr;
+
+      X_va = XT_LASX2PP( X );
+      Y_va = XT_LASX2PP( Y );
+
+      __Pragma( "loop_count min=1" );
+      for ( n=0; n<(blkLen+1)/2; n++ )
+      {
+        XT_LASX2IP( x0, X_va, X );
+        XT_LASX2IP( y0, Y_va, Y );
+
+        /* Replicate NaNs in both x and y to ensure NaN propagation.
*/ + b_nan = XT_UN_SX2( x0, y0 ); + XT_MOVT_SX2( x0, xa_nnlib_qNaNf.f, b_nan ); + XT_MOVT_SX2( y0, xa_nnlib_qNaNf.f, b_nan ); + + x0 = XT_ABS_SX2( x0 ); + y0 = XT_ABS_SX2( y0 ); + + /* num <= den */ + num = XT_MIN_SX2( x0, y0 ); + den = XT_MAX_SX2( y0, x0 ); + + /* Scale up numerator and denominator if BOTH are subnormal. */ + b_subn = XT_OLT_SX2( num, FLT_MIN ); + scl = (xtfloatx2)8388608.f; XT_MOVF_SX2( scl, (xtfloatx2)1.0f, b_subn ); + num = XT_MUL_SX2( num, scl ); + den = XT_MUL_SX2( den, scl ); + + /* Classify numerator and denominator. */ + b_num_inf = XT_OEQ_SX2( num, xa_nnlib_plusInff.f ); /* Inf/Inf */ + b_den_inf = XT_OEQ_SX2( den, xa_nnlib_plusInff.f ); /* x/Inf */ + b_eqz = XT_OEQ_SX2( den, (xtfloatx2)(xtfloatx2)(0.0f) ); /* 0/0 */ + + /* Initial appromimation for 1/den. */ + rcp = XT_RECIP0_SX2( den ); + /* Newton-Raphson iteration for 1/den. */ + eps = (xtfloatx2)1.0f; + XT_MSUB_SX2( eps, rcp, den ); + XT_MADD_SX2( rcp, rcp, eps ); + /* Approximation for the quotient num/den. */ + quo = XT_MUL_SX2( num, rcp ); + /* Refine the quotient by a modified Newton-Raphson iteration. */ + eps = num; + XT_MSUB_SX2( eps, quo, den ); + XT_MADD_SX2( quo, rcp, eps ); + + /* Force conventional results for special cases. */ + XT_MOVT_SX2( quo, (xtfloatx2)(0.0f), b_den_inf ); /* x/Inf -> 0 */ + XT_MOVT_SX2( quo, (xtfloatx2)1.0f, b_num_inf ); /* Inf/Inf -> 1 */ + XT_MOVT_SX2( quo, (xtfloatx2)(0.0f), b_eqz ); /* 0/0 -> 0 */ + + XT_SSX2IP( quo, S_wr, +2*sz_f32 ); + } + } + + __Pragma( "no_reorder" ); + + /* + * Part II, polynomial approximation and full quadrant restoration. + * Reference C code: + * + * { + * const union ufloat32uint32 * ptbl; + * float32_t x0, y0, z0, p0; + * int sx, sy; + * + * for ( n=0; n0 + y | +/-0 | -pi/2 | y<0 + y | +/-0 | pi/2 | y>0 + +/-y | -inf | +/-pi | finite y>0 + +/-y | +inf | +/-0 | finite y>0 + +/-inf | x | +/-pi/2 | finite x + +/-inf | -inf | +/-3*pi/4 | + +/-inf | +inf | +/-pi/4 | + +Input: + y[N] input data, Q15 or floating point + x[N] input data, Q15 or floating point + N length of vectors +Output: + z[N] result, Q15 or floating point + +Restrictions: +x, y, z should not overlap +---------------------------------------------------------------------------*/ + +// Taken from Fusion +void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WORD32 N ) +{ + /* + * const union ufloat32uint32* p; + * int sx,sy,big; + * sx=takesignf(x); + * sy=takesignf(y); + * x=fabs(x); + * y=fabs(y); + * if(x==0.f && y==0.f) + * { + * // The actual result depends on input signs. + * x = 1.f; + * y = 0.f; + * } + * + * big=x>y; + * if(big) + * { + * x=y/x; + * } + * else + * { + * // compare x==y is necessary to support (+/-Inf, +/-Inf) cases + * x = (x == y) ? 1.0f : x / y; + * } + * p = (x<0.5f) ? 
atanftbl1 : atanftbl2; + * // approximate atan(x)/x-1 + * y = p[0].f; + * y = x*y + p[1].f; + * y = x*y + p[2].f; + * y = x*y + p[3].f; + * y = x*y + p[4].f; + * y = x*y + p[5].f; + * y = x*y + p[6].f; + * y = x*y + p[7].f; + * // convert result to true atan(x) + * y = x*y + x; + * + * if (!big) y = pi2f.f - y; + * if (sx) y = pif.f - y; + * if (sy) y = -y; + * return y; + */ + const xtfloat * restrict X; + const xtfloat * restrict Y; + int32_t * restrict Z; + const xtfloat * restrict S_rd; + xtfloat * restrict S_wr; + const xtfloat * restrict POLY_TBL1; + const xtfloat * restrict POLY_TBL2; + + /* Current block index; overall number of blocks; number of values in the current block */ + int blkIx, blkNum, blkLen; + /* Block size, blkLen <= blkSize */ + const int blkSize = MAX_ALLOCA_SZ / sz_f32; + /* Allocate a fixed-size scratch area on the stack. */ + float32_t ALIGN(8) scr[blkSize]; + + int n; + + if (N <= 0) return; + + NASSERT_ALIGN8(scr); + + /* + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ + + blkNum = (N + blkSize - 1) / blkSize; + POLY_TBL1 = (xtfloat*)xa_nnlib_atanftbl1; + POLY_TBL2 = (xtfloat*)xa_nnlib_atanftbl2; + for (blkIx = 0; blkIxy0 ) p0 = y0/x0; + * // Special case of x==y is necessary to support (+/-Inf, +/-Inf) cases. + * else p0 = ( x0==y0 ? 1.f : x0/y0 ); + * + * scr[n] = p0; + * } + * } + */ + + { + /* Input values */ + xtfloat x0, y0, i0; + /* Numerator; denominator; reciprocal; quotient */ + xtfloat num, den, rcp, quo; + /* Auxiliary vars */ + xtfloat s, eps; + /* Is NaN; Inf/Inf; x/Inf; 0/0; x and y are subnormal */ + xtbool b_nan, b_num_inf, b_den_inf, b_eqz, b_subn; + const xtfloat * pT; + + X = (xtfloat*)((uintptr_t)x + blkIx*blkSize*sz_f32); + Y = (xtfloat*)((uintptr_t)y + blkIx*blkSize*sz_f32); + S_wr = (xtfloat*)scr; + + static const uint32_t TAB[4] = { 0x7fc00000, 0x00800000, + 0x4b000000, 0x7f800000 + }; + pT = (xtfloat *)TAB; + __Pragma("loop_count min=1"); + for (n = 0; n 0 or x/Inf -> 0*/ + XT_MOVT_S(quo, XT_CONST_S(1), b_num_inf); /* Inf/Inf -> 1 */ + + XT_SSIP(quo, S_wr, sz_f32); + } + } + __Pragma("no_reorder"); + + /* + * Part II, polynomial approximation and full quadrant restoration. + * Reference C code: + * + * { + * const union ufloat32uint32 * ptbl; + * float32_t x0, y0, z0, p0; + * int sx, sy; + * + * for ( n=0; n