diff --git a/ARM_SVE_README b/ARM_SVE_README new file mode 100644 index 00000000000..a4670d9a8df --- /dev/null +++ b/ARM_SVE_README @@ -0,0 +1,30 @@ +Configuration and installation details please check script arm_install.sh + +Run test: (Test example takes 4 args) +arg1 : elements count for operation +arg2 : elements type could be : i (integer), f (float), d (double) +arg3: type size in bits, only apply when you set arg2 to i. eg: i 8 will be converted to int8; i 16 to int16 +arg4: operation type. Could be : max, min, sum , mul, band , bor, bxor +If you want to use SVE module for MPI ops, you need to pass mca params as : -mca op sve -mca op_sve_hardware_available 1 +======= +Example for test +$PATH_To_BIN/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=256 --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 33 i 8 min + +If you don't need armie you can remove the ARMIE part in the command line as : +$PATH_To_BIN//mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 /ompi/test/datatype/Reduce_local_float 33 i 8 min + +How we evaluate the performance? +====== +Logical: + +Start_time; +MPI_reduce_local(...); +End_time; + +Reduce_time = Start_time - End_time; + +Possible issues (this happened on thunder2 machine): +====== +Reason for "-mca pml ob1" : on Arm machine the default pml module will cause a problem with armie (instruction not supported, I don't know why), but with ob1 it works. 
+ + diff --git a/arm_install.sh b/arm_install.sh new file mode 100644 index 00000000000..0b8afef93b0 --- /dev/null +++ b/arm_install.sh @@ -0,0 +1,11 @@ +mkdir build + +./autogen.pl >/dev/null + +./configure --prefix=$PWD/build --enable-mpirun-prefix-by-default --enable-debug CC=armclang CFLAGS="-march=armv8-a+sve" CXX=armclang++ FC=armflang >/dev/null + +./config.status >/dev/null +make -j 128 install >/dev/null + +## compile the test code, test code under ompi/test/datapyte/Reduce_local_float.c +./build/bin/mpicc -g -O3 -march=armv8-a+sve -o ./test/datatype/Reduce_local_float ./test/datatype/Reduce_local_float.c diff --git a/ompi/mca/op/Makefile.am b/ompi/mca/op/Makefile.am index 8c392f1dbec..e942d1a8c21 100644 --- a/ompi/mca/op/Makefile.am +++ b/ompi/mca/op/Makefile.am @@ -2,7 +2,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University +# Copyright (c) 2004-2020 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -17,6 +17,8 @@ # $HEADER$ # +AM_CPPFLAGS = $(LTDLINCL) + # main library setup noinst_LTLIBRARIES = libmca_op.la libmca_op_la_SOURCES = diff --git a/ompi/mca/op/sve/Makefile.am b/ompi/mca/op/sve/Makefile.am new file mode 100644 index 00000000000..1e70e2fead8 --- /dev/null +++ b/ompi/mca/op/sve/Makefile.am @@ -0,0 +1,70 @@ +# +# Copyright (c) 2019 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This is an sve op component. This Makefile.am is a typical +# sve of how to integrate into Open MPI's Automake-based build +# system. 
+# +# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent +# for more details on how to make Open MPI components. + +# First, list all .h and .c sources. It is necessary to list all .h +# files so that they will be picked up in the distribution tarball. + +sources = \ + op_sve.h \ + op_sve_component.c \ + op_sve_functions.h \ + op_sve_functions.c + +# Open MPI components can be compiled two ways: +# +# 1. As a standalone dynamic shared object (DSO), sometimes called a +# dynamically loadable library (DLL). +# +# 2. As a static library that is slurped up into the upper-level +# libmpi library (regardless of whether libmpi is a static or dynamic +# library). This is called a "Libtool convenience library". +# +# The component needs to create an output library in this top-level +# component directory, and named either mca__.la (for DSO +# builds) or libmca__.la (for static builds). The OMPI +# build system will have set the +# MCA_BUILD_ompi___DSO AM_CONDITIONAL to indicate +# which way this component should be built. + +if MCA_BUILD_ompi_op_sve_DSO +component_noinst = +component_install = mca_op_sve.la +else +component_install = +component_noinst = component_noinst +endif + +# Specific information for DSO builds. +# +# The DSO should install itself in $(ompilibdir) (by default, +# $prefix/lib/openmpi). + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_op_sve_la_SOURCES = $(sources) +mca_op_sve_la_LDFLAGS = -module -avoid-version +mca_op_sve_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la + +# Specific information for static builds. +# +# Note that we *must* "noinst"; the upper-layer Makefile.am's will +# slurp in the resulting .la library into libmpi. 
+ +noinst_LTLIBRARIES = $(component_noinst) +libmca_op_sve_la_SOURCES = $(sources) +libmca_op_sve_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/op/sve/configure.m4 b/ompi/mca/op/sve/configure.m4 new file mode 100644 index 00000000000..9f71d56c951 --- /dev/null +++ b/ompi/mca/op/sve/configure.m4 @@ -0,0 +1,21 @@ +# -*- shell-script -*- +# +# Copyright (c) 2019-2020 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ompi_op_sve_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +# We can always build, unless we were explicitly disabled. +AC_DEFUN([MCA_ompi_op_sve_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/op/sve/Makefile]) + [$1], +])dnl diff --git a/ompi/mca/op/sve/op_sve.h b/ompi/mca/op/sve/op_sve.h new file mode 100644 index 00000000000..21483d1fa56 --- /dev/null +++ b/ompi/mca/op/sve/op_sve.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Copyright (c) 2019 Arm Ltd. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OP_SVE_EXPORT_H +#define MCA_OP_SVE_EXPORT_H + +#include "ompi_config.h" + +#include "ompi/mca/mca.h" +#include "opal/class/opal_object.h" + +#include "ompi/mca/op/op.h" + +BEGIN_C_DECLS + +/** + * Derive a struct from the base op component struct, allowing us to + * cache some component-specific information on our well-known + * component struct. + */ +typedef struct { + /** The base op component struct */ + ompi_op_base_component_1_0_0_t super; + + /* What follows is sve-component-specific cached information. We + tend to use this scheme (caching information on the sve + component itself) instead of lots of individual global + variables for the component. 
The following data fields are + sves; replace them with whatever is relevant for your + component. */ + + /** A simple boolean indicating that the hardware is available. */ + bool hardware_available; + + /** A simple boolean indicating whether double precision is + supported. */ + bool double_supported; +} ompi_op_sve_component_t; + +/** + * Globally exported variable. Note that it is a *sve* component + * (defined above), which has the ompi_op_base_component_t as its + * first member. Hence, the MCA/op framework will find the data that + * it expects in the first memory locations, but then the component + * itself can cache additional information after that that can be used + * by both the component and modules. + */ +OMPI_DECLSPEC extern ompi_op_sve_component_t + mca_op_sve_component; + +END_C_DECLS + +#endif /* MCA_OP_SVE_EXPORT_H */ diff --git a/ompi/mca/op/sve/op_sve_component.c b/ompi/mca/op/sve/op_sve_component.c new file mode 100644 index 00000000000..16393aefe23 --- /dev/null +++ b/ompi/mca/op/sve/op_sve_component.c @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Copyright (c) 2019 ARM Ltd. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file + * + * This is the "sve" component source code. 
+ * + */ + +#include "ompi_config.h" + +#include "opal/util/printf.h" + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/sve/op_sve.h" +#include "ompi/mca/op/sve/op_sve_functions.h" + +static int sve_component_open(void); +static int sve_component_close(void); +static int sve_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple); +static struct ompi_op_base_module_1_0_0_t * + sve_component_op_query(struct ompi_op_t *op, int *priority); +static int sve_component_register(void); + +ompi_op_sve_component_t mca_op_sve_component = { + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + { + .opc_version = { + OMPI_OP_BASE_VERSION_1_0_0, + + .mca_component_name = "sve", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = sve_component_open, + .mca_close_component = sve_component_close, + .mca_register_component_params = sve_component_register, + }, + .opc_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .opc_init_query = sve_component_init_query, + .opc_op_query = sve_component_op_query, + }, +}; + +/* + * Component open + */ +static int sve_component_open(void) +{ + + /* A first level check to see if sve is even available in this + process. E.g., you may want to do a first-order check to see + if hardware is available. If so, return OMPI_SUCCESS. If not, + return anything other than OMPI_SUCCESS and the component will + silently be ignored. + + Note that if this function returns non-OMPI_SUCCESS, then this + component won't even be shown in ompi_info output (which is + probably not what you want). 
+ */ + + return OMPI_SUCCESS; +} + +/* + * Component close + */ +static int sve_component_close(void) +{ + + /* If sve was opened successfully, close it (i.e., release any + resources that may have been allocated on this component). + Note that _component_close() will always be called at the end + of the process, so it may have been after any/all of the other + component functions have been invoked (and possibly even after + modules have been created and/or destroyed). */ + + return OMPI_SUCCESS; +} + +/* + * Register MCA params. + */ +static int sve_component_register(void) +{ + + /* Additionally, since this component is simulating hardware, + let's make MCA params that determine whethere a) the hardware + is available, and b) whether double precision floating point + types are supported. This allows you to change the behavior of + this component at run-time (by setting these MCA params at + run-time), simulating different kinds of hardware. */ + mca_op_sve_component.hardware_available = false; + (void) mca_base_component_var_register(&mca_op_sve_component.super.opc_version, + "hardware_available", + "Whether the hardware is available or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_sve_component.hardware_available); + + mca_op_sve_component.double_supported = true; + (void) mca_base_component_var_register(&mca_op_sve_component.super.opc_version, + "double_supported", + "Whether the double precision data types are supported or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_sve_component.double_supported); + + return OMPI_SUCCESS; +} + +/* + * Query whether this component wants to be used in this process. 
+ */ +static int sve_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple) +{ + if (mca_op_sve_component.hardware_available) { + return OMPI_SUCCESS; + } + return OMPI_ERR_NOT_SUPPORTED; +} + + +/* + * Query whether this component can be used for a specific op + */ +static struct ompi_op_base_module_1_0_0_t * + sve_component_op_query(struct ompi_op_t *op, int *priority) +{ + ompi_op_base_module_t *module = OBJ_NEW(ompi_op_base_module_t); + /* Sanity check -- although the framework should never invoke the + _component_op_query() on non-intrinsic MPI_Op's, we'll put a + check here just to be sure. */ + if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) { + return NULL; + } + + int i=0; + switch (op->o_f_to_c_index) { + case OMPI_OP_BASE_FORTRAN_MAX: + case OMPI_OP_BASE_FORTRAN_MIN: + case OMPI_OP_BASE_FORTRAN_SUM: + case OMPI_OP_BASE_FORTRAN_PROD: + case OMPI_OP_BASE_FORTRAN_BOR: + case OMPI_OP_BASE_FORTRAN_BAND: + case OMPI_OP_BASE_FORTRAN_BXOR: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[op->o_f_to_c_index][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[op->o_f_to_c_index][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_LAND: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_LOR: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_LXOR: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_MAXLOC: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_MINLOC: + module= NULL; + break; + default: + module= NULL; + } + /* If we got a module from above, we'll return it. Otherwise, + we'll return NULL, indicating that this component does not want + to be considered for selection for this MPI_Op. Note that the + functions each returned a *sve* component pointer + (vs. 
a *base* component pointer -- where an *sve* component + is a base component plus some other module-specific cached + information), so we have to cast it to the right pointer type + before returning. */ + if (NULL != module) { + *priority = 50; + } + return (ompi_op_base_module_1_0_0_t *) module; +} diff --git a/ompi/mca/op/sve/op_sve_functions.c b/ompi/mca/op/sve/op_sve_functions.c new file mode 100644 index 00000000000..4e86f5b2471 --- /dev/null +++ b/ompi/mca/op/sve/op_sve_functions.c @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Copyright (c) 2019 Arm Ltd. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#include "opal/util/output.h" + +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/sve/op_sve.h" +#include "ompi/mca/op/sve/op_sve_functions.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation + * in all functions that use this macro is the same. + * + * This macro is for (out op in). 
+ * + */ +#define OP_SVE_FUNC(name, type_name, type_size, type, op) \ + static void ompi_op_sve_2buff_##name##_##type(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = svcnt##type_name(); \ + int left_over = *count; \ + type* in = (type*)_in; \ + type* out = (type*)_out; \ + svbool_t Pg = svptrue_b##type_size(); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + sv##type vsrc = svld1(Pg, in); \ + sv##type vdst = svld1(Pg, out); \ + in += types_per_step; \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + out += types_per_step; \ + } \ + \ + while( left_over > 0 ) { \ + int how_much = (left_over > 8) ? 8 : left_over; \ + switch(left_over) { \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + left_over -= how_much; \ + out += how_much; \ + in += how_much; \ + } \ +} + +/************************************************************************* + * Max + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) > (b) ? 
(a) : (b)) + OP_SVE_FUNC(max, b, 8, int8_t, max) + OP_SVE_FUNC(max, b, 8, uint8_t, max) + OP_SVE_FUNC(max, h, 16, int16_t, max) + OP_SVE_FUNC(max, h, 16, uint16_t, max) + OP_SVE_FUNC(max, w, 32, int32_t, max) + OP_SVE_FUNC(max, w, 32, uint32_t, max) + OP_SVE_FUNC(max, d, 64, int64_t, max) + OP_SVE_FUNC(max, d, 64, uint64_t, max) + + OP_SVE_FUNC(max, w, 32, float32_t, max) + OP_SVE_FUNC(max, d, 64, float64_t, max) + +/************************************************************************* + * Min + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) + OP_SVE_FUNC(min, b, 8, int8_t, min) + OP_SVE_FUNC(min, b, 8, uint8_t, min) + OP_SVE_FUNC(min, h, 16, int16_t, min) + OP_SVE_FUNC(min, h, 16, uint16_t, min) + OP_SVE_FUNC(min, w, 32, int32_t, min) + OP_SVE_FUNC(min, w, 32, uint32_t, min) + OP_SVE_FUNC(min, d, 64, int64_t, min) + OP_SVE_FUNC(min, d, 64, uint64_t, min) + + OP_SVE_FUNC(min, w, 32, float32_t, min) + OP_SVE_FUNC(min, d, 64, float64_t, min) + + /************************************************************************* + * Sum + ************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) + (b)) + OP_SVE_FUNC(sum, b, 8, int8_t, add) + OP_SVE_FUNC(sum, b, 8, uint8_t, add) + OP_SVE_FUNC(sum, h, 16, int16_t, add) + OP_SVE_FUNC(sum, h, 16, uint16_t, add) + OP_SVE_FUNC(sum, w, 32, int32_t, add) + OP_SVE_FUNC(sum, w, 32, uint32_t, add) + OP_SVE_FUNC(sum, d, 64, int64_t, add) + OP_SVE_FUNC(sum, d, 64, uint64_t, add) + + OP_SVE_FUNC(sum, w, 32, float32_t, add) + OP_SVE_FUNC(sum, d, 64, float64_t, add) + +/************************************************************************* + * Product + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) * (b)) + OP_SVE_FUNC(prod, b, 8, int8_t, mul) + OP_SVE_FUNC(prod, b, 8, uint8_t, mul) + 
OP_SVE_FUNC(prod, h, 16, int16_t, mul) + OP_SVE_FUNC(prod, h, 16, uint16_t, mul) + OP_SVE_FUNC(prod, w, 32, int32_t, mul) + OP_SVE_FUNC(prod, w, 32, uint32_t, mul) + OP_SVE_FUNC(prod, d, 64, int64_t, mul) + OP_SVE_FUNC(prod, d, 64, uint64_t, mul) + + OP_SVE_FUNC(prod, w, 32, float32_t, mul) + OP_SVE_FUNC(prod, d, 64, float64_t, mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) & (b)) + OP_SVE_FUNC(band, b, 8, int8_t, and) + OP_SVE_FUNC(band, b, 8, uint8_t, and) + OP_SVE_FUNC(band, h, 16, int16_t, and) + OP_SVE_FUNC(band, h, 16, uint16_t, and) + OP_SVE_FUNC(band, w, 32, int32_t, and) + OP_SVE_FUNC(band, w, 32, uint32_t, and) + OP_SVE_FUNC(band, d, 64, int64_t, and) +OP_SVE_FUNC(band, d, 64, uint64_t, and) + + /************************************************************************* + * Bitwise OR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) | (b)) + OP_SVE_FUNC(bor, b, 8, int8_t, orr) + OP_SVE_FUNC(bor, b, 8, uint8_t, orr) + OP_SVE_FUNC(bor, h, 16, int16_t, orr) + OP_SVE_FUNC(bor, h, 16, uint16_t, orr) + OP_SVE_FUNC(bor, w, 32, int32_t, orr) + OP_SVE_FUNC(bor, w, 32, uint32_t, orr) + OP_SVE_FUNC(bor, d, 64, int64_t, orr) +OP_SVE_FUNC(bor, d, 64, uint64_t, orr) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) ^ (b)) + OP_SVE_FUNC(bxor, b, 8, int8_t, eor) + OP_SVE_FUNC(bxor, b, 8, uint8_t, eor) + OP_SVE_FUNC(bxor, h, 16, int16_t, eor) + OP_SVE_FUNC(bxor, h, 16, uint16_t, eor) + OP_SVE_FUNC(bxor, w, 32, int32_t, eor) + OP_SVE_FUNC(bxor, w, 32, uint32_t, eor) + OP_SVE_FUNC(bxor, d, 64, int64_t, eor) +OP_SVE_FUNC(bxor, d, 64, uint64_t, 
eor) + +/* + * This is a three buffer (2 input and 1 output) version of the reduction + * routines, needed for some optimizations. + */ +#define OP_SVE_FUNC_3BUFF(name, type_name, type_size, type, op) \ + static void ompi_op_sve_3buff_##name##_##type(void * restrict _in1, \ + void * restrict _in2, void * restrict _out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = svcnt##type_name(); \ + int left_over = *count; \ + type* in1 = (type*)_in1; \ + type* in2 = (type*)_in2; \ + type* out = (type*)_out; \ + svbool_t Pg = svptrue_b##type_size(); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + sv##type vsrc = svld1(Pg, in1); \ + sv##type vdst = svld1(Pg, in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + out += types_per_step; \ + } \ + if (left_over !=0){ \ + Pg = svwhilelt_b##type_size##_u64(0, left_over); \ + sv##type vsrc = svld1(Pg, in1); \ + sv##type vdst = svld1(Pg, in2); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + } \ +} + +/************************************************************************* + * Max + *************************************************************************/ + OP_SVE_FUNC_3BUFF(max, b, 8, int8_t, max) + OP_SVE_FUNC_3BUFF(max, b, 8, uint8_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, int16_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, uint16_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, int32_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, uint32_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, int64_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, uint64_t, max) + + OP_SVE_FUNC_3BUFF(max, w, 32, float32_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, float64_t, max) + +/************************************************************************* + * Min + *************************************************************************/ + OP_SVE_FUNC_3BUFF(min, b, 8, int8_t, min) + OP_SVE_FUNC_3BUFF(min, b, 
8, uint8_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, int16_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, uint16_t, min) + OP_SVE_FUNC_3BUFF(min, w, 32, int32_t, min) + OP_SVE_FUNC_3BUFF(min, w, 32, uint32_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, int64_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, uint64_t, min) + + OP_SVE_FUNC_3BUFF(min, w, 32, float32_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, float64_t, min) + + /************************************************************************* + * Sum + ************************************************************************/ + OP_SVE_FUNC_3BUFF(sum, b, 8, int8_t, add) + OP_SVE_FUNC_3BUFF(sum, b, 8, uint8_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, int16_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, uint16_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, int32_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, uint32_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, int64_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, uint64_t, add) + + OP_SVE_FUNC_3BUFF(sum, w, 32, float32_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, float64_t, add) + +/************************************************************************* + * Product + *************************************************************************/ + OP_SVE_FUNC_3BUFF(prod, b, 8, int8_t, mul) + OP_SVE_FUNC_3BUFF(prod, b, 8, uint8_t, mul) + OP_SVE_FUNC_3BUFF(prod, h, 16, int16_t, mul) + OP_SVE_FUNC_3BUFF(prod, h, 16, uint16_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, int32_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, uint32_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, int64_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, uint64_t, mul) + + OP_SVE_FUNC_3BUFF(prod, w, 32, float32_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, float64_t, mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + OP_SVE_FUNC_3BUFF(band, b, 8, int8_t, and) + OP_SVE_FUNC_3BUFF(band, b, 8, uint8_t, and) + OP_SVE_FUNC_3BUFF(band, h, 16, int16_t, and) + 
OP_SVE_FUNC_3BUFF(band, h, 16, uint16_t, and) + OP_SVE_FUNC_3BUFF(band, w, 32, int32_t, and) + OP_SVE_FUNC_3BUFF(band, w, 32, uint32_t, and) + OP_SVE_FUNC_3BUFF(band, d, 64, int64_t, and) + OP_SVE_FUNC_3BUFF(band, d, 64, uint64_t, and) + + /************************************************************************* + * Bitwise OR + *************************************************************************/ + OP_SVE_FUNC_3BUFF(bor, b, 8, int8_t, orr) + OP_SVE_FUNC_3BUFF(bor, b, 8, uint8_t, orr) + OP_SVE_FUNC_3BUFF(bor, h, 16, int16_t, orr) + OP_SVE_FUNC_3BUFF(bor, h, 16, uint16_t, orr) + OP_SVE_FUNC_3BUFF(bor, w, 32, int32_t, orr) + OP_SVE_FUNC_3BUFF(bor, w, 32, uint32_t, orr) + OP_SVE_FUNC_3BUFF(bor, d, 64, int64_t, orr) + OP_SVE_FUNC_3BUFF(bor, d, 64, uint64_t, orr) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + OP_SVE_FUNC_3BUFF(bxor, b, 8, int8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, b, 8, uint8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, h, 16, int16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, h, 16, uint16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, w, 32, int32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, w, 32, uint32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, d, 64, int64_t, eor) + OP_SVE_FUNC_3BUFF(bxor, d, 64, uint64_t, eor) + +/** C integer ***********************************************************/ +#define C_INTEGER(name, ftype) \ + [OMPI_OP_BASE_TYPE_INT8_T] = ompi_op_sve_##ftype##_##name##_int8_t, \ + [OMPI_OP_BASE_TYPE_UINT8_T] = ompi_op_sve_##ftype##_##name##_uint8_t, \ + [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_sve_##ftype##_##name##_int16_t, \ + [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_sve_##ftype##_##name##_uint16_t, \ + [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_sve_##ftype##_##name##_int32_t, \ + [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_sve_##ftype##_##name##_uint32_t, \ + [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_sve_##ftype##_##name##_int64_t, \ + [OMPI_OP_BASE_TYPE_UINT64_T] = 
ompi_op_sve_##ftype##_##name##_uint64_t + + +/** Floating point, including all the Fortran reals *********************/ +#define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float32_t +#define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_float64_t + +#define FLOATING_POINT(name, ftype) \ + [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype) + +/* + * MPI_OP_NULL + * All types + */ +#define FLAGS_NO_FLOAT \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE) +#define FLAGS \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \ + OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE) + +ompi_op_base_handler_fn_t ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 2buff), + FLOATING_POINT(max, 2buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 2buff), + FLOATING_POINT(min, 2buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 2buff), + FLOATING_POINT(sum, 2buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER(prod, 2buff), + FLOATING_POINT(prod, 2buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] = { + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 2buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 2buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 2buff), + }, + /* Corresponds to MPI_REPLACE */ + 
[OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* (MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE) */ + NULL, + }, + +}; + +ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 3buff), + FLOATING_POINT(max, 3buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 3buff), + FLOATING_POINT(min, 3buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 3buff), + FLOATING_POINT(sum, 3buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER(prod, 3buff), + FLOATING_POINT(prod, 3buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] ={ + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 3buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 3buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 3buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE */ + NULL, + }, +}; diff --git a/ompi/mca/op/sve/op_sve_functions.h b/ompi/mca/op/sve/op_sve_functions.h new file mode 100644 index 
00000000000..00db651cfac --- /dev/null +++ b/ompi/mca/op/sve/op_sve_functions.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Copyright (c) 2019 Arm Ltd. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif + +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/sve/op_sve.h" + +BEGIN_C_DECLS + +OMPI_DECLSPEC extern ompi_op_base_handler_fn_t +ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +OMPI_DECLSPEC extern ompi_op_base_3buff_handler_fn_t +ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +END_C_DECLS + diff --git a/test/datatype/HOW_TO_TEST_README.txt b/test/datatype/HOW_TO_TEST_README.txt new file mode 100644 index 00000000000..4d6025c95ee --- /dev/null +++ b/test/datatype/HOW_TO_TEST_README.txt @@ -0,0 +1,17 @@ +(1) Reduce_uint8.c : test code for MPI_SUM operation with type uint8. + compile as : $path/mpicc -march=armv8-a+sve -O3 -o Reduce_uint8 Reduce_uint8.c  +(2) sve_uint8_test.sh: A shell script that will generate results with time information for MPI_SUM operation with different message size.  +(3) SVE_MPI_Op.py: A python script which you can use to generate plot. + +ALL YOU NEED TO DO IS: +(1) Change the path of your mpirun and test binary in sve_uint8_test.sh. +(2) Run sve_uint8_test.sh + This will generate 3 types of output files with names + a. sve-sum-vectorlength.txt ##MPI_SUM with SVE-enabled operation + b. no-sve-sum-vectorlength.txt ##MPI_SUM without SVE-enabled operation + c. 
sve-cpy-vectorlength.txt ## memcpy +(3) Run the Python script as + $python SVE_MPI_Op.py sve-sum-vectorlength.txt no-sve-sum-vectorlength.txt + make sure to put sve-sum-vectorlength.txt before no-sve-sum-vectorlength.txt + example : SVE_MPI_Op.py sve-sum-128.txt no-sve-sum-128.txt + diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index 4366724a523..19c0d6489b3 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -16,7 +16,7 @@ if PROJECT_OMPI MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw ddt_raw2 unpack_ooo ddt_pack external32 large_data - MPI_CHECKS = to_self + MPI_CHECKS = to_self reduce_local endif TESTS = opal_datatype_test unpack_hetero $(MPI_TESTS) @@ -96,5 +96,11 @@ unpack_hetero_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) unpack_hetero_LDADD = \ $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la +reduce_local_SOURCES = reduce_local.c +reduce_local_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) +reduce_local_LDADD = \ + $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ + $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la + distclean: rm -rf *.dSYM .deps .libs *.log *.o *.trs $(check_PROGRAMS) Makefile diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c new file mode 100644 index 00000000000..a9546ebcf1d --- /dev/null +++ b/test/datatype/Reduce_local_float.c @@ -0,0 +1,661 @@ +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" + +#define ARRAYSIZE 1024*1024 + +float in_float[ARRAYSIZE]; +float inout_float[ARRAYSIZE]; +float inout_float_for_check[ARRAYSIZE]; + +int8_t in_uint8[ARRAYSIZE]; +int8_t inout_uint8[ARRAYSIZE]; +int8_t inout_uint8_for_check[ARRAYSIZE]; + +uint16_t in_uint16[ARRAYSIZE]; +uint16_t inout_uint16[ARRAYSIZE]; +uint16_t inout_uint16_for_check[ARRAYSIZE]; + +uint32_t in_uint32[ARRAYSIZE]; +uint32_t inout_uint32[ARRAYSIZE]; +uint32_t inout_uint32_for_check[ARRAYSIZE]; + 
+uint64_t in_uint64[ARRAYSIZE]; +uint64_t inout_uint64[ARRAYSIZE]; +uint64_t inout_uint64_for_check[ARRAYSIZE]; + +double in_double[ARRAYSIZE]; +double inout_double[ARRAYSIZE]; +double inout_double_for_check[ARRAYSIZE]; + +int main(int argc, char **argv) { + + char *num_elem = argv[1]; + int count = atoi(num_elem); + char *type = argv[2]; + char *elem_size = argv[3]; + int elem_size1 = atoi(elem_size); + char *op = argv[4]; + + int i; + + for (i=0; i +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" + +#define ARRAYSIZE 32*1024*1024 + +int8_t in_uint8[ARRAYSIZE]; +int8_t inout_uint8[ARRAYSIZE]; +int8_t inout_uint8_for_check[ARRAYSIZE]; + +int main(int argc, char **argv) { + + char *num_elem = argv[1]; + int count = atoi(num_elem); + char *type = argv[2]; + char *elem_size = argv[3]; + int elem_size1 = atoi(elem_size); + char *op = argv[4]; + + int i; + + for (i=0; i +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" +#include "ompi/communicator/communicator.h" +#include "ompi/runtime/mpiruntime.h" +#include "ompi/datatype/ompi_datatype.h" + +static void print_status(char* op, char* type, int correctness) +{ + if(correctness) + printf("%s %s [\033[1;32msuccess\033[0m]", op, type); + else + printf("%s %s [\033[1;31mfail\033[0m]", op, type); +} + +int main(int argc, char **argv) +{ + static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL; + int count, elem_size, rank, size, len, provided, correctness = 1, i; + double tstart, tend; + char *type, *op; + + if(argc < 4 ) { + fprintf(stderr, + "Less arguments than expected (we need at least 3): \n" + " : [i, u, f, d]\n" + " ; [sum, max, min, bor, bxor, mul, band]\n"); + exit(-1); + } + count = atoi(argv[1]); + type = argv[2]; + elem_size = atoi(argv[3]); + op = argv[4]; + + if( count <= 0 ) { + printf("The number of 
elements should be positive\n"); + exit(-1); + } + if( (0 != (elem_size%8)) || (elem_size <= 0) || (elem_size > 64) ) { + printf("The element type should be 8, 16, 32 or 64\n"); + exit(-2); + } + + in_buf = malloc(count * sizeof(double)); + inout_buf = malloc(count * sizeof(double)); + inout_check_buf = malloc(count * sizeof(double)); + + ompi_mpi_init(argc, argv, MPI_THREAD_SERIALIZED, &provided, false); + + rank = ompi_comm_rank(MPI_COMM_WORLD); + size = ompi_comm_size(MPI_COMM_WORLD); + + if(*type=='i') { + if( 8 == elem_size ) { + int8_t *in_int8 = (int8_t*)in_buf, + *inout_int8 = (int8_t*)inout_buf, + *inout_int8_for_check = (int8_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int8[i] = 5; + inout_int8[i] = inout_int8_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (int8_t)(in_int8[i] + inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != in_int8[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "min") ) { //intentionly reversed in and out + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int8, in_int8, count, MPI_INT8_T, MPI_MIN); + tend = MPI_Wtime(); 
+ for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != in_int8[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (in_int8[i] | inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (in_int8[i] ^ inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8,inout_int8,count, MPI_INT8_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (int8_t)(in_int8[i] * inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, 
i = 0; i < count; i++ ) { + if(inout_int8[i] != (in_int8[i] & inout_int8_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT8_T", correctness); + } + } + if( 16 == elem_size ) { + int16_t *in_int16 = (int16_t*)in_buf, + *inout_int16 = (int16_t*)inout_buf, + *inout_int16_for_check = (int16_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int16[i] = 5; + inout_int16[i] = inout_int16_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_SUM); + tend = MPI_Wtime(); + for( i = 0; i < count; i++ ) { + if(inout_int16[i] != (int16_t)(in_int16[i] + inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != in_int16[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int16, in_int16, count, MPI_INT16_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != in_int16[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bor") ) 
{ + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (in_int16[i] | inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (in_int16[i] ^ inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (int16_t)(in_int16[i] * inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (in_int16[i] & inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT16_T", 
correctness); + } + } + if( 32 == elem_size ) { + int32_t *in_int32 = (int32_t*)in_buf, + *inout_int32 = (int32_t*)inout_buf, + *inout_int32_for_check = (int32_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int32[i] = 5; + inout_int32[i] = inout_int32_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (int32_t)(in_int32[i] + inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != in_int32[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int32, in_int32, count, MPI_INT32_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != in_int32[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 
1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (in_int32[i] | inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (int32_t)(in_int32[i] * inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (in_int32[i] & inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (in_int32[i] ^ inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT32_T", correctness); + } + } + if( 64 == elem_size ) { + int64_t *in_int64 = (int64_t*)in_buf, + *inout_int64 = (int64_t*)inout_buf, + *inout_int64_for_check = (int64_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int64[i] 
= 5; + inout_int64[i] = inout_int64_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (int64_t)(in_int64[i] + inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != in_int64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int64, in_int64, count, MPI_INT64_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != in_int64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (in_int64[i] | inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", 
"MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (in_int64[i] ^ inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64,inout_int64,count, MPI_INT64_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (int64_t)(in_int64[i] * inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (in_int64[i] & inout_int64_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT64_T", correctness); + } + } + } + if(*type=='u') { + if( 8 == elem_size ) { + uint8_t *in_uint8 = (uint8_t*)in_buf, + *inout_uint8 = (uint8_t*)inout_buf, + *inout_uint8_for_check = (uint8_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint8[i] = 5; + inout_uint8[i] = inout_uint8_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + 
MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (uint8_t)(in_uint8[i] + inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != inout_uint8_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "min") ) { //intentionly reversed in and out + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != in_uint8[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (in_uint8[i] | inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT8_T", count); + tstart 
= MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (in_uint8[i] ^ inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_UINT8_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (int8_t)(in_uint8[i] * inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (in_uint8[i] & inout_uint8_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT8_T", correctness); + } + } + if( 16 == elem_size ) { + uint16_t *in_uint16 = (uint16_t*)in_buf, + *inout_uint16 = (uint16_t*)inout_buf, + *inout_uint16_for_check = (uint16_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint16[i] = 5; + inout_uint16[i] = inout_uint16_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_SUM); + tend = MPI_Wtime(); + for( i = 0; i < count; i++ ) { + if(inout_uint16[i] != (uint16_t)(in_uint16[i] 
+ inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != inout_uint16_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != in_uint16[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (in_uint16[i] | inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != 
(in_uint16[i] ^ inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (uint16_t)(in_uint16[i] * inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (in_uint16[i] & inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT16_T", correctness); + } + } + if( 32 == elem_size ) { + uint32_t *in_uint32 = (uint32_t*)in_buf, + *inout_uint32 = (uint32_t*)inout_buf, + *inout_uint32_for_check = (uint32_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint32[i] = 5; + inout_uint32[i] = inout_uint32_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (uint32_t)(in_uint32[i] + inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + 
print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != inout_uint32_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != in_uint32[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (in_uint32[i] | inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (uint32_t)(in_uint32[i] * inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + 
break; + } + } + print_status("MPI_PROD", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (in_uint32[i] & inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (in_uint32[i] ^ inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT32_T", correctness); + } + } + if( 64 == elem_size ) { + int64_t *in_uint64 = (int64_t*)in_buf, + *inout_uint64 = (int64_t*)inout_buf, + *inout_uint64_for_check = (int64_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint64[i] = 5; + inout_uint64[i] = inout_uint64_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (int64_t)(in_uint64[i] + inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", 
"MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != in_uint64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_uint64, in_uint64, count, MPI_UINT64_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != in_uint64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (in_uint64[i] | inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (in_uint64[i] ^ inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", 
"MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (int64_t)(in_uint64[i] * inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (in_uint64[i] & inout_uint64_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT64_T", correctness); + } + } + } + + if(*type=='f') { + float *in_float = (float*)in_buf, + *inout_float = (float*)inout_buf, + *inout_float_for_check = (float*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_float[i] = 1000.0+1; + inout_float[i] = inout_float_for_check[i] = 100.0+2; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != inout_float_for_check[i]+in_float[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_FLOAT", correctness); + } + + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + 
if(inout_float[i] != in_float[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_FLOAT", correctness); + } + + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_float,in_float,count, MPI_FLOAT, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != in_float[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_FLOAT", correctness); + } + + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != in_float[i] * inout_float_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_FLOAT", correctness); + } + } + + if(*type=='d') { + double *in_double = (double*)in_buf, + *inout_double = (double*)inout_buf, + *inout_double_for_check = (double*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_double[i] = 10.0+1; + inout_double[i] = inout_double_for_check[i] = 1.0+2; + } + + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != inout_double_for_check[i]+in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_DOUBLE", correctness); + } + + if( 0 == strcmp(op, "max") ) { + printf("#Local 
Reduce %s for %s: %d \n", "MPI_MAX", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_DOUBLE", correctness); + } + + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_double, in_double, count, MPI_DOUBLE, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_DOUBLE", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != inout_double_for_check[i]*in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_DOUBLE", correctness); + } + } + //tstart = MPI_Wtime(); + //memcpy(in_uint8,inout_uint8, count); + //memcpy(in_float, inout_float, count); + //memcpy(in_double, inout_double, count); + printf(" count %d time %.6f seconds\n",count, tend-tstart); + ompi_mpi_finalize(); + + free(in_buf); + free(inout_buf); + free(inout_check_buf); + + return correctness ? 
0 : -1; +} + diff --git a/test/datatype/sve_uint8_test.sh b/test/datatype/sve_uint8_test.sh new file mode 100644 index 00000000000..ed0cc151592 --- /dev/null +++ b/test/datatype/sve_uint8_test.sh @@ -0,0 +1,21 @@ +echo "ompi version with SVE -- Usage: arg1: count of elements, arg2: 'i'|'f'|'d' : datatype: integer, float, double. arg3: size of type. arg4: operation" +echo "your_path/mpirun -mca op sve -mca op_sve_hardware_available 1 -np 1 /your_test_path/Reduce_local_float 1048576 i 8 max" + +# test all vector sizes +for vector_len in 128 256 512 1024 2048 +do + + echo "=========Integer type all operations & all sizes========" + echo "" + echo "" + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + for (( i=1; i<31; i++ )) + do + for val in 1024 4096 16384 65536 262144 1048576 4194304 16777216 33554432 + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_uint8 $val i 8 sum | tee -a sve-sum-$vector_len.txt + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 0 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_uint8 $val i 8 sum | tee -a no-sve-sum-$vector_len.txt + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_uint8 $val i 8 cpy | tee -a sve-cpy-$vector_len.txt + done + done +done