diff --git a/ARM_SVE_README b/ARM_SVE_README new file mode 100644 index 00000000000..a4670d9a8df --- /dev/null +++ b/ARM_SVE_README @@ -0,0 +1,30 @@ +Configuration and installation details please check script arm_install.sh + +Run test: (Test example takes 4 args) +arg1 : elements count for operation +arg2 : elements type could be : i (integer), f (float), d (double) +arg3: type size in bits, only apply when you set arg2 to i. eg: i 8 will be converted to int8; i 16 to int16 +arg4: operation type. Could be : max, min, sum , mul, band , bor, bxor +If you want to use SVE module for MPI ops, you need to pass mca params as : -mca op sve -mca op_sve_hardware_available 1 +======= +Example for test +$PATH_To_BIN/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=256 --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 33 i 8 min + +If you don't need armie you can remove the ARMIE part in the command line as : +$PATH_To_BIN//mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 /ompi/test/datatype/Reduce_local_float 33 i 8 min + +How we evaluate the performance? +====== +Logical: + +Start_time; +MPI_reduce_local(...); +End_time; + +Reduce_time = Start_time - End_time; + +Possible issues (this happened on thunder2 machine): +====== +Reason for "-mca pml ob1" : on Arm machine the default pml module will cause a problem with armie (instruction not supported, I don't know why), but with ob1 it works. 
+ + diff --git a/arm_install.sh b/arm_install.sh new file mode 100644 index 00000000000..0b8afef93b0 --- /dev/null +++ b/arm_install.sh @@ -0,0 +1,11 @@ +mkdir build + +./autogen.pl >/dev/null + +./configure --prefix=$PWD/build --enable-mpirun-prefix-by-default --enable-debug CC=armclang CFLAGS="-march=armv8-a+sve" CXX=armclang++ FC=armflang >/dev/null + +./config.status >/dev/null +make -j 128 install >/dev/null + +## compile the test code, test code under ompi/test/datapyte/Reduce_local_float.c +./build/bin/mpicc -g -O3 -march=armv8-a+sve -o ./test/datatype/Reduce_local_float ./test/datatype/Reduce_local_float.c diff --git a/ompi/mca/op/Makefile.am b/ompi/mca/op/Makefile.am index 8c392f1dbec..e942d1a8c21 100644 --- a/ompi/mca/op/Makefile.am +++ b/ompi/mca/op/Makefile.am @@ -2,7 +2,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University +# Copyright (c) 2004-2020 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -17,6 +17,8 @@ # $HEADER$ # +AM_CPPFLAGS = $(LTDLINCL) + # main library setup noinst_LTLIBRARIES = libmca_op.la libmca_op_la_SOURCES = diff --git a/ompi/mca/op/sve/Makefile.am b/ompi/mca/op/sve/Makefile.am new file mode 100644 index 00000000000..1e70e2fead8 --- /dev/null +++ b/ompi/mca/op/sve/Makefile.am @@ -0,0 +1,70 @@ +# +# Copyright (c) 2019 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This is an sve op component. This Makefile.am is a typical +# sve of how to integrate into Open MPI's Automake-based build +# system. 
+# +# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent +# for more details on how to make Open MPI components. + +# First, list all .h and .c sources. It is necessary to list all .h +# files so that they will be picked up in the distribution tarball. + +sources = \ + op_sve.h \ + op_sve_component.c \ + op_sve_functions.h \ + op_sve_functions.c + +# Open MPI components can be compiled two ways: +# +# 1. As a standalone dynamic shared object (DSO), sometimes called a +# dynamically loadable library (DLL). +# +# 2. As a static library that is slurped up into the upper-level +# libmpi library (regardless of whether libmpi is a static or dynamic +# library). This is called a "Libtool convenience library". +# +# The component needs to create an output library in this top-level +# component directory, and named either mca__.la (for DSO +# builds) or libmca__.la (for static builds). The OMPI +# build system will have set the +# MCA_BUILD_ompi___DSO AM_CONDITIONAL to indicate +# which way this component should be built. + +if MCA_BUILD_ompi_op_sve_DSO +component_noinst = +component_install = mca_op_sve.la +else +component_install = +component_noinst = component_noinst +endif + +# Specific information for DSO builds. +# +# The DSO should install itself in $(ompilibdir) (by default, +# $prefix/lib/openmpi). + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_op_sve_la_SOURCES = $(sources) +mca_op_sve_la_LDFLAGS = -module -avoid-version +mca_op_sve_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la + +# Specific information for static builds. +# +# Note that we *must* "noinst"; the upper-layer Makefile.am's will +# slurp in the resulting .la library into libmpi. 
+ +noinst_LTLIBRARIES = $(component_noinst) +libmca_op_sve_la_SOURCES = $(sources) +libmca_op_sve_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/op/sve/configure.m4 b/ompi/mca/op/sve/configure.m4 new file mode 100644 index 00000000000..9f71d56c951 --- /dev/null +++ b/ompi/mca/op/sve/configure.m4 @@ -0,0 +1,21 @@ +# -*- shell-script -*- +# +# Copyright (c) 2019-2020 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ompi_op_sve_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +# We can always build, unless we were explicitly disabled. +AC_DEFUN([MCA_ompi_op_sve_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/op/sve/Makefile]) + [$1], +])dnl diff --git a/ompi/mca/op/sve/op_sve.h b/ompi/mca/op/sve/op_sve.h new file mode 100644 index 00000000000..21483d1fa56 --- /dev/null +++ b/ompi/mca/op/sve/op_sve.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Copyright (c) 2019 Arm Ltd. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OP_SVE_EXPORT_H +#define MCA_OP_SVE_EXPORT_H + +#include "ompi_config.h" + +#include "ompi/mca/mca.h" +#include "opal/class/opal_object.h" + +#include "ompi/mca/op/op.h" + +BEGIN_C_DECLS + +/** + * Derive a struct from the base op component struct, allowing us to + * cache some component-specific information on our well-known + * component struct. + */ +typedef struct { + /** The base op component struct */ + ompi_op_base_component_1_0_0_t super; + + /* What follows is sve-component-specific cached information. We + tend to use this scheme (caching information on the sve + component itself) instead of lots of individual global + variables for the component. 
The following data fields are + sves; replace them with whatever is relevant for your + component. */ + + /** A simple boolean indicating that the hardware is available. */ + bool hardware_available; + + /** A simple boolean indicating whether double precision is + supported. */ + bool double_supported; +} ompi_op_sve_component_t; + +/** + * Globally exported variable. Note that it is a *sve* component + * (defined above), which has the ompi_op_base_component_t as its + * first member. Hence, the MCA/op framework will find the data that + * it expects in the first memory locations, but then the component + * itself can cache additional information after that that can be used + * by both the component and modules. + */ +OMPI_DECLSPEC extern ompi_op_sve_component_t + mca_op_sve_component; + +END_C_DECLS + +#endif /* MCA_OP_SVE_EXPORT_H */ diff --git a/ompi/mca/op/sve/op_sve_component.c b/ompi/mca/op/sve/op_sve_component.c new file mode 100644 index 00000000000..16393aefe23 --- /dev/null +++ b/ompi/mca/op/sve/op_sve_component.c @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Copyright (c) 2019 ARM Ltd. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file + * + * This is the "sve" component source code. 
+ * + */ + +#include "ompi_config.h" + +#include "opal/util/printf.h" + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/sve/op_sve.h" +#include "ompi/mca/op/sve/op_sve_functions.h" + +static int sve_component_open(void); +static int sve_component_close(void); +static int sve_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple); +static struct ompi_op_base_module_1_0_0_t * + sve_component_op_query(struct ompi_op_t *op, int *priority); +static int sve_component_register(void); + +ompi_op_sve_component_t mca_op_sve_component = { + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + { + .opc_version = { + OMPI_OP_BASE_VERSION_1_0_0, + + .mca_component_name = "sve", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = sve_component_open, + .mca_close_component = sve_component_close, + .mca_register_component_params = sve_component_register, + }, + .opc_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .opc_init_query = sve_component_init_query, + .opc_op_query = sve_component_op_query, + }, +}; + +/* + * Component open + */ +static int sve_component_open(void) +{ + + /* A first level check to see if sve is even available in this + process. E.g., you may want to do a first-order check to see + if hardware is available. If so, return OMPI_SUCCESS. If not, + return anything other than OMPI_SUCCESS and the component will + silently be ignored. + + Note that if this function returns non-OMPI_SUCCESS, then this + component won't even be shown in ompi_info output (which is + probably not what you want). 
+ */ + + return OMPI_SUCCESS; +} + +/* + * Component close + */ +static int sve_component_close(void) +{ + + /* If sve was opened successfully, close it (i.e., release any + resources that may have been allocated on this component). + Note that _component_close() will always be called at the end + of the process, so it may have been after any/all of the other + component functions have been invoked (and possibly even after + modules have been created and/or destroyed). */ + + return OMPI_SUCCESS; +} + +/* + * Register MCA params. + */ +static int sve_component_register(void) +{ + + /* Additionally, since this component is simulating hardware, + let's make MCA params that determine whethere a) the hardware + is available, and b) whether double precision floating point + types are supported. This allows you to change the behavior of + this component at run-time (by setting these MCA params at + run-time), simulating different kinds of hardware. */ + mca_op_sve_component.hardware_available = false; + (void) mca_base_component_var_register(&mca_op_sve_component.super.opc_version, + "hardware_available", + "Whether the hardware is available or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_sve_component.hardware_available); + + mca_op_sve_component.double_supported = true; + (void) mca_base_component_var_register(&mca_op_sve_component.super.opc_version, + "double_supported", + "Whether the double precision data types are supported or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_sve_component.double_supported); + + return OMPI_SUCCESS; +} + +/* + * Query whether this component wants to be used in this process. 
+ */ +static int sve_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple) +{ + if (mca_op_sve_component.hardware_available) { + return OMPI_SUCCESS; + } + return OMPI_ERR_NOT_SUPPORTED; +} + + +/* + * Query whether this component can be used for a specific op + */ +static struct ompi_op_base_module_1_0_0_t * + sve_component_op_query(struct ompi_op_t *op, int *priority) +{ + ompi_op_base_module_t *module = OBJ_NEW(ompi_op_base_module_t); + /* Sanity check -- although the framework should never invoke the + _component_op_query() on non-intrinsic MPI_Op's, we'll put a + check here just to be sure. */ + if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) { + return NULL; + } + + int i=0; + switch (op->o_f_to_c_index) { + case OMPI_OP_BASE_FORTRAN_MAX: + case OMPI_OP_BASE_FORTRAN_MIN: + case OMPI_OP_BASE_FORTRAN_SUM: + case OMPI_OP_BASE_FORTRAN_PROD: + case OMPI_OP_BASE_FORTRAN_BOR: + case OMPI_OP_BASE_FORTRAN_BAND: + case OMPI_OP_BASE_FORTRAN_BXOR: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[op->o_f_to_c_index][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[op->o_f_to_c_index][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_LAND: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_LOR: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_LXOR: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_MAXLOC: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_MINLOC: + module= NULL; + break; + default: + module= NULL; + } + /* If we got a module from above, we'll return it. Otherwise, + we'll return NULL, indicating that this component does not want + to be considered for selection for this MPI_Op. Note that the + functions each returned a *sve* component pointer + (vs. 
a *base* component pointer -- where an *sve* component + is a base component plus some other module-specific cached + information), so we have to cast it to the right pointer type + before returning. */ + if (NULL != module) { + *priority = 50; + } + return (ompi_op_base_module_1_0_0_t *) module; +} diff --git a/ompi/mca/op/sve/op_sve_functions.c b/ompi/mca/op/sve/op_sve_functions.c new file mode 100644 index 00000000000..4e86f5b2471 --- /dev/null +++ b/ompi/mca/op/sve/op_sve_functions.c @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Copyright (c) 2019 Arm Ltd. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#include "opal/util/output.h" + +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/sve/op_sve.h" +#include "ompi/mca/op/sve/op_sve_functions.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation + * in all functions that use this macro is the same. + * + * This macro is for (out op in). 
+ * + */ +#define OP_SVE_FUNC(name, type_name, type_size, type, op) \ + static void ompi_op_sve_2buff_##name##_##type(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = svcnt##type_name(); \ + int left_over = *count; \ + type* in = (type*)_in; \ + type* out = (type*)_out; \ + svbool_t Pg = svptrue_b##type_size(); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + sv##type vsrc = svld1(Pg, in); \ + sv##type vdst = svld1(Pg, out); \ + in += types_per_step; \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + out += types_per_step; \ + } \ + \ + while( left_over > 0 ) { \ + int how_much = (left_over > 8) ? 8 : left_over; \ + switch(left_over) { \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + left_over -= how_much; \ + out += how_much; \ + in += how_much; \ + } \ +} + +/************************************************************************* + * Max + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) > (b) ? 
(a) : (b)) + OP_SVE_FUNC(max, b, 8, int8_t, max) + OP_SVE_FUNC(max, b, 8, uint8_t, max) + OP_SVE_FUNC(max, h, 16, int16_t, max) + OP_SVE_FUNC(max, h, 16, uint16_t, max) + OP_SVE_FUNC(max, w, 32, int32_t, max) + OP_SVE_FUNC(max, w, 32, uint32_t, max) + OP_SVE_FUNC(max, d, 64, int64_t, max) + OP_SVE_FUNC(max, d, 64, uint64_t, max) + + OP_SVE_FUNC(max, w, 32, float32_t, max) + OP_SVE_FUNC(max, d, 64, float64_t, max) + +/************************************************************************* + * Min + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) + OP_SVE_FUNC(min, b, 8, int8_t, min) + OP_SVE_FUNC(min, b, 8, uint8_t, min) + OP_SVE_FUNC(min, h, 16, int16_t, min) + OP_SVE_FUNC(min, h, 16, uint16_t, min) + OP_SVE_FUNC(min, w, 32, int32_t, min) + OP_SVE_FUNC(min, w, 32, uint32_t, min) + OP_SVE_FUNC(min, d, 64, int64_t, min) + OP_SVE_FUNC(min, d, 64, uint64_t, min) + + OP_SVE_FUNC(min, w, 32, float32_t, min) + OP_SVE_FUNC(min, d, 64, float64_t, min) + + /************************************************************************* + * Sum + ************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) + (b)) + OP_SVE_FUNC(sum, b, 8, int8_t, add) + OP_SVE_FUNC(sum, b, 8, uint8_t, add) + OP_SVE_FUNC(sum, h, 16, int16_t, add) + OP_SVE_FUNC(sum, h, 16, uint16_t, add) + OP_SVE_FUNC(sum, w, 32, int32_t, add) + OP_SVE_FUNC(sum, w, 32, uint32_t, add) + OP_SVE_FUNC(sum, d, 64, int64_t, add) + OP_SVE_FUNC(sum, d, 64, uint64_t, add) + + OP_SVE_FUNC(sum, w, 32, float32_t, add) + OP_SVE_FUNC(sum, d, 64, float64_t, add) + +/************************************************************************* + * Product + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) * (b)) + OP_SVE_FUNC(prod, b, 8, int8_t, mul) + OP_SVE_FUNC(prod, b, 8, uint8_t, mul) + 
OP_SVE_FUNC(prod, h, 16, int16_t, mul) + OP_SVE_FUNC(prod, h, 16, uint16_t, mul) + OP_SVE_FUNC(prod, w, 32, int32_t, mul) + OP_SVE_FUNC(prod, w, 32, uint32_t, mul) + OP_SVE_FUNC(prod, d, 64, int64_t, mul) + OP_SVE_FUNC(prod, d, 64, uint64_t, mul) + + OP_SVE_FUNC(prod, w, 32, float32_t, mul) + OP_SVE_FUNC(prod, d, 64, float64_t, mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) & (b)) + OP_SVE_FUNC(band, b, 8, int8_t, and) + OP_SVE_FUNC(band, b, 8, uint8_t, and) + OP_SVE_FUNC(band, h, 16, int16_t, and) + OP_SVE_FUNC(band, h, 16, uint16_t, and) + OP_SVE_FUNC(band, w, 32, int32_t, and) + OP_SVE_FUNC(band, w, 32, uint32_t, and) + OP_SVE_FUNC(band, d, 64, int64_t, and) +OP_SVE_FUNC(band, d, 64, uint64_t, and) + + /************************************************************************* + * Bitwise OR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) | (b)) + OP_SVE_FUNC(bor, b, 8, int8_t, orr) + OP_SVE_FUNC(bor, b, 8, uint8_t, orr) + OP_SVE_FUNC(bor, h, 16, int16_t, orr) + OP_SVE_FUNC(bor, h, 16, uint16_t, orr) + OP_SVE_FUNC(bor, w, 32, int32_t, orr) + OP_SVE_FUNC(bor, w, 32, uint32_t, orr) + OP_SVE_FUNC(bor, d, 64, int64_t, orr) +OP_SVE_FUNC(bor, d, 64, uint64_t, orr) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) ^ (b)) + OP_SVE_FUNC(bxor, b, 8, int8_t, eor) + OP_SVE_FUNC(bxor, b, 8, uint8_t, eor) + OP_SVE_FUNC(bxor, h, 16, int16_t, eor) + OP_SVE_FUNC(bxor, h, 16, uint16_t, eor) + OP_SVE_FUNC(bxor, w, 32, int32_t, eor) + OP_SVE_FUNC(bxor, w, 32, uint32_t, eor) + OP_SVE_FUNC(bxor, d, 64, int64_t, eor) +OP_SVE_FUNC(bxor, d, 64, uint64_t, 
eor) + +/* + * This is a three buffer (2 input and 1 output) version of the reduction + * routines, needed for some optimizations. + */ +#define OP_SVE_FUNC_3BUFF(name, type_name, type_size, type, op) \ + static void ompi_op_sve_3buff_##name##_##type(void * restrict _in1, \ + void * restrict _in2, void * restrict _out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = svcnt##type_name(); \ + int left_over = *count; \ + type* in1 = (type*)_in1; \ + type* in2 = (type*)_in2; \ + type* out = (type*)_out; \ + svbool_t Pg = svptrue_b##type_size(); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + sv##type vsrc = svld1(Pg, in1); \ + sv##type vdst = svld1(Pg, in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + out += types_per_step; \ + } \ + if (left_over !=0){ \ + Pg = svwhilelt_b##type_size##_u64(0, left_over); \ + sv##type vsrc = svld1(Pg, in1); \ + sv##type vdst = svld1(Pg, in2); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + } \ +} + +/************************************************************************* + * Max + *************************************************************************/ + OP_SVE_FUNC_3BUFF(max, b, 8, int8_t, max) + OP_SVE_FUNC_3BUFF(max, b, 8, uint8_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, int16_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, uint16_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, int32_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, uint32_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, int64_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, uint64_t, max) + + OP_SVE_FUNC_3BUFF(max, w, 32, float32_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, float64_t, max) + +/************************************************************************* + * Min + *************************************************************************/ + OP_SVE_FUNC_3BUFF(min, b, 8, int8_t, min) + OP_SVE_FUNC_3BUFF(min, b, 
8, uint8_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, int16_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, uint16_t, min) + OP_SVE_FUNC_3BUFF(min, w, 32, int32_t, min) + OP_SVE_FUNC_3BUFF(min, w, 32, uint32_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, int64_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, uint64_t, min) + + OP_SVE_FUNC_3BUFF(min, w, 32, float32_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, float64_t, min) + + /************************************************************************* + * Sum + ************************************************************************/ + OP_SVE_FUNC_3BUFF(sum, b, 8, int8_t, add) + OP_SVE_FUNC_3BUFF(sum, b, 8, uint8_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, int16_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, uint16_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, int32_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, uint32_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, int64_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, uint64_t, add) + + OP_SVE_FUNC_3BUFF(sum, w, 32, float32_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, float64_t, add) + +/************************************************************************* + * Product + *************************************************************************/ + OP_SVE_FUNC_3BUFF(prod, b, 8, int8_t, mul) + OP_SVE_FUNC_3BUFF(prod, b, 8, uint8_t, mul) + OP_SVE_FUNC_3BUFF(prod, h, 16, int16_t, mul) + OP_SVE_FUNC_3BUFF(prod, h, 16, uint16_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, int32_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, uint32_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, int64_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, uint64_t, mul) + + OP_SVE_FUNC_3BUFF(prod, w, 32, float32_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, float64_t, mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + OP_SVE_FUNC_3BUFF(band, b, 8, int8_t, and) + OP_SVE_FUNC_3BUFF(band, b, 8, uint8_t, and) + OP_SVE_FUNC_3BUFF(band, h, 16, int16_t, and) + 
OP_SVE_FUNC_3BUFF(band, h, 16, uint16_t, and) + OP_SVE_FUNC_3BUFF(band, w, 32, int32_t, and) + OP_SVE_FUNC_3BUFF(band, w, 32, uint32_t, and) + OP_SVE_FUNC_3BUFF(band, d, 64, int64_t, and) + OP_SVE_FUNC_3BUFF(band, d, 64, uint64_t, and) + + /************************************************************************* + * Bitwise OR + *************************************************************************/ + OP_SVE_FUNC_3BUFF(bor, b, 8, int8_t, orr) + OP_SVE_FUNC_3BUFF(bor, b, 8, uint8_t, orr) + OP_SVE_FUNC_3BUFF(bor, h, 16, int16_t, orr) + OP_SVE_FUNC_3BUFF(bor, h, 16, uint16_t, orr) + OP_SVE_FUNC_3BUFF(bor, w, 32, int32_t, orr) + OP_SVE_FUNC_3BUFF(bor, w, 32, uint32_t, orr) + OP_SVE_FUNC_3BUFF(bor, d, 64, int64_t, orr) + OP_SVE_FUNC_3BUFF(bor, d, 64, uint64_t, orr) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + OP_SVE_FUNC_3BUFF(bxor, b, 8, int8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, b, 8, uint8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, h, 16, int16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, h, 16, uint16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, w, 32, int32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, w, 32, uint32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, d, 64, int64_t, eor) + OP_SVE_FUNC_3BUFF(bxor, d, 64, uint64_t, eor) + +/** C integer ***********************************************************/ +#define C_INTEGER(name, ftype) \ + [OMPI_OP_BASE_TYPE_INT8_T] = ompi_op_sve_##ftype##_##name##_int8_t, \ + [OMPI_OP_BASE_TYPE_UINT8_T] = ompi_op_sve_##ftype##_##name##_uint8_t, \ + [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_sve_##ftype##_##name##_int16_t, \ + [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_sve_##ftype##_##name##_uint16_t, \ + [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_sve_##ftype##_##name##_int32_t, \ + [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_sve_##ftype##_##name##_uint32_t, \ + [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_sve_##ftype##_##name##_int64_t, \ + [OMPI_OP_BASE_TYPE_UINT64_T] = 
ompi_op_sve_##ftype##_##name##_uint64_t + + +/** Floating point, including all the Fortran reals *********************/ +#define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float32_t +#define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_float64_t + +#define FLOATING_POINT(name, ftype) \ + [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype) + +/* + * MPI_OP_NULL + * All types + */ +#define FLAGS_NO_FLOAT \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE) +#define FLAGS \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \ + OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE) + +ompi_op_base_handler_fn_t ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 2buff), + FLOATING_POINT(max, 2buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 2buff), + FLOATING_POINT(min, 2buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 2buff), + FLOATING_POINT(sum, 2buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER(prod, 2buff), + FLOATING_POINT(prod, 2buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] = { + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 2buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 2buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 2buff), + }, + /* Corresponds to MPI_REPLACE */ + 
[OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* (MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE) */ + NULL, + }, + +}; + +ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 3buff), + FLOATING_POINT(max, 3buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 3buff), + FLOATING_POINT(min, 3buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 3buff), + FLOATING_POINT(sum, 3buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER(prod, 3buff), + FLOATING_POINT(prod, 3buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] ={ + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 3buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 3buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 3buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE */ + NULL, + }, +}; diff --git a/ompi/mca/op/sve/op_sve_functions.h b/ompi/mca/op/sve/op_sve_functions.h new file mode 100644 index 
00000000000..00db651cfac --- /dev/null +++ b/ompi/mca/op/sve/op_sve_functions.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Copyright (c) 2019 Arm Ltd. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif + +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/sve/op_sve.h" + +BEGIN_C_DECLS + +OMPI_DECLSPEC extern ompi_op_base_handler_fn_t +ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +OMPI_DECLSPEC extern ompi_op_base_3buff_handler_fn_t +ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +END_C_DECLS + diff --git a/test/datatype/HOW_TO_TEST_README.txt b/test/datatype/HOW_TO_TEST_README.txt new file mode 100644 index 00000000000..4d6025c95ee --- /dev/null +++ b/test/datatype/HOW_TO_TEST_README.txt @@ -0,0 +1,17 @@ +(1) Reduce_uint8.c : test code for MPI_SUM operation with type uint8. + compile as : $path/mpicc -march=armv8-a+sve -O3 -o Reduce_uint8 Reduce_uint8.c  +(2) sve_uint8_test.sh: A shell script that will generate results with time information for MPI_SUM operation with different message size.  +(3) SVE_MPI_Op.py: A python script which you can use to generate plot. + +ALL YOU NEED TO DO IS: +(1) Change the path of your mpirun and test binary in sve_uint8_test.sh. +(2) Run sve_uint8_test.sh + This will generate 3 types of output files with names + a. sve-sum-vectorlength.txt ##MPI_SUM with SVE-enabled operation + b. no-sve-sum-vectorlength.txt ##MPI_SUM without SVE-enabled operation + c. 
sve-cpy-vectorlength.txt ## memcpy +(3) Run the Python script as + $python SVE_MPI_Op.py sve-sum-vectorlength.txt no-sve-sum-vectorlength.txt + make sure to put sve-sum-vectorlength.txt before no-sve-sum-vectorlength.txt + example : SVE_MPI_Op.py sve-sum-128.txt no-sve-sum-128.txt + diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index 4366724a523..19c0d6489b3 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -16,7 +16,7 @@ if PROJECT_OMPI MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw ddt_raw2 unpack_ooo ddt_pack external32 large_data - MPI_CHECKS = to_self + MPI_CHECKS = to_self reduce_local endif TESTS = opal_datatype_test unpack_hetero $(MPI_TESTS) @@ -96,5 +96,11 @@ unpack_hetero_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) unpack_hetero_LDADD = \ $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la +reduce_local_SOURCES = reduce_local.c +reduce_local_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) +reduce_local_LDADD = \ + $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ + $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la + distclean: rm -rf *.dSYM .deps .libs *.log *.o *.trs $(check_PROGRAMS) Makefile diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c new file mode 100644 index 00000000000..a9546ebcf1d --- /dev/null +++ b/test/datatype/Reduce_local_float.c @@ -0,0 +1,661 @@ +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" + +#define ARRAYSIZE 1024*1024 + +float in_float[ARRAYSIZE]; +float inout_float[ARRAYSIZE]; +float inout_float_for_check[ARRAYSIZE]; + +int8_t in_uint8[ARRAYSIZE]; +int8_t inout_uint8[ARRAYSIZE]; +int8_t inout_uint8_for_check[ARRAYSIZE]; + +uint16_t in_uint16[ARRAYSIZE]; +uint16_t inout_uint16[ARRAYSIZE]; +uint16_t inout_uint16_for_check[ARRAYSIZE]; + +uint32_t in_uint32[ARRAYSIZE]; +uint32_t inout_uint32[ARRAYSIZE]; +uint32_t inout_uint32_for_check[ARRAYSIZE]; + 
+uint64_t in_uint64[ARRAYSIZE]; +uint64_t inout_uint64[ARRAYSIZE]; +uint64_t inout_uint64_for_check[ARRAYSIZE]; + +double in_double[ARRAYSIZE]; +double inout_double[ARRAYSIZE]; +double inout_double_for_check[ARRAYSIZE]; + +int main(int argc, char **argv) { + + char *num_elem = argv[1]; + int count = atoi(num_elem); + char *type = argv[2]; + char *elem_size = argv[3]; + int elem_size1 = atoi(elem_size); + char *op = argv[4]; + + int i; + + for (i=0; i +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" + +#define ARRAYSIZE 32*1024*1024 + +int8_t in_uint8[ARRAYSIZE]; +int8_t inout_uint8[ARRAYSIZE]; +int8_t inout_uint8_for_check[ARRAYSIZE]; + +int main(int argc, char **argv) { + + char *num_elem = argv[1]; + int count = atoi(num_elem); + char *type = argv[2]; + char *elem_size = argv[3]; + int elem_size1 = atoi(elem_size); + char *op = argv[4]; + + int i; + + for (i=0; i +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" +#include "ompi/communicator/communicator.h" +#include "ompi/runtime/mpiruntime.h" +#include "ompi/datatype/ompi_datatype.h" + +static void print_status(char* op, char* type, int correctness) +{ + if(correctness) + printf("%s %s [\033[1;32msuccess\033[0m]", op, type); + else + printf("%s %s [\033[1;31mfail\033[0m]", op, type); +} + +int main(int argc, char **argv) +{ + static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL; + int count, elem_size, rank, size, len, provided, correctness = 1, i; + double tstart, tend; + char *type, *op; + + if(argc < 4 ) { + fprintf(stderr, + "Less arguments than expected (we need at least 3): \n" + " : [i, u, f, d]\n" + " ; [sum, max, min, bor, bxor, mul, band]\n"); + exit(-1); + } + count = atoi(argv[1]); + type = argv[2]; + elem_size = atoi(argv[3]); + op = argv[4]; + + if( count <= 0 ) { + printf("The number of 
elements should be positive\n"); + exit(-1); + } + if( (0 != (elem_size%8)) || (elem_size <= 0) || (elem_size > 64) ) { + printf("The element type should be 8, 16, 32 or 64\n"); + exit(-2); + } + + in_buf = malloc(count * sizeof(double)); + inout_buf = malloc(count * sizeof(double)); + inout_check_buf = malloc(count * sizeof(double)); + + ompi_mpi_init(argc, argv, MPI_THREAD_SERIALIZED, &provided, false); + + rank = ompi_comm_rank(MPI_COMM_WORLD); + size = ompi_comm_size(MPI_COMM_WORLD); + + if(*type=='i') { + if( 8 == elem_size ) { + int8_t *in_int8 = (int8_t*)in_buf, + *inout_int8 = (int8_t*)inout_buf, + *inout_int8_for_check = (int8_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int8[i] = 5; + inout_int8[i] = inout_int8_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (int8_t)(in_int8[i] + inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != in_int8[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "min") ) { //intentionly reversed in and out + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int8, in_int8, count, MPI_INT8_T, MPI_MIN); + tend = MPI_Wtime(); 
+ for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != in_int8[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (in_int8[i] | inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (in_int8[i] ^ inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8,inout_int8,count, MPI_INT8_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (int8_t)(in_int8[i] * inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, 
i = 0; i < count; i++ ) { + if(inout_int8[i] != (in_int8[i] & inout_int8_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT8_T", correctness); + } + } + if( 16 == elem_size ) { + int16_t *in_int16 = (int16_t*)in_buf, + *inout_int16 = (int16_t*)inout_buf, + *inout_int16_for_check = (int16_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int16[i] = 5; + inout_int16[i] = inout_int16_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_SUM); + tend = MPI_Wtime(); + for( i = 0; i < count; i++ ) { + if(inout_int16[i] != (int16_t)(in_int16[i] + inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != in_int16[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int16, in_int16, count, MPI_INT16_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != in_int16[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bor") ) 
{ + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (in_int16[i] | inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (in_int16[i] ^ inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (int16_t)(in_int16[i] * inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (in_int16[i] & inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT16_T", 
correctness); + } + } + if( 32 == elem_size ) { + int32_t *in_int32 = (int32_t*)in_buf, + *inout_int32 = (int32_t*)inout_buf, + *inout_int32_for_check = (int32_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int32[i] = 5; + inout_int32[i] = inout_int32_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (int32_t)(in_int32[i] + inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != in_int32[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int32, in_int32, count, MPI_INT32_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != in_int32[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 
1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (in_int32[i] | inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (int32_t)(in_int32[i] * inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (in_int32[i] & inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (in_int32[i] ^ inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT32_T", correctness); + } + } + if( 64 == elem_size ) { + int64_t *in_int64 = (int64_t*)in_buf, + *inout_int64 = (int64_t*)inout_buf, + *inout_int64_for_check = (int64_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int64[i] 
= 5; + inout_int64[i] = inout_int64_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (int64_t)(in_int64[i] + inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != in_int64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int64, in_int64, count, MPI_INT64_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != in_int64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (in_int64[i] | inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", 
"MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (in_int64[i] ^ inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64,inout_int64,count, MPI_INT64_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (int64_t)(in_int64[i] * inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (in_int64[i] & inout_int64_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT64_T", correctness); + } + } + } + if(*type=='u') { + if( 8 == elem_size ) { + uint8_t *in_uint8 = (uint8_t*)in_buf, + *inout_uint8 = (uint8_t*)inout_buf, + *inout_uint8_for_check = (uint8_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint8[i] = 5; + inout_uint8[i] = inout_uint8_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + 
MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (uint8_t)(in_uint8[i] + inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != inout_uint8_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "min") ) { //intentionly reversed in and out + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != in_uint8[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (in_uint8[i] | inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT8_T", count); + tstart 
= MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (in_uint8[i] ^ inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_UINT8_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (int8_t)(in_uint8[i] * inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (in_uint8[i] & inout_uint8_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT8_T", correctness); + } + } + if( 16 == elem_size ) { + uint16_t *in_uint16 = (uint16_t*)in_buf, + *inout_uint16 = (uint16_t*)inout_buf, + *inout_uint16_for_check = (uint16_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint16[i] = 5; + inout_uint16[i] = inout_uint16_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_SUM); + tend = MPI_Wtime(); + for( i = 0; i < count; i++ ) { + if(inout_uint16[i] != (uint16_t)(in_uint16[i] 
+ inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != inout_uint16_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != in_uint16[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (in_uint16[i] | inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != 
(in_uint16[i] ^ inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (uint16_t)(in_uint16[i] * inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (in_uint16[i] & inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT16_T", correctness); + } + } + if( 32 == elem_size ) { + uint32_t *in_uint32 = (uint32_t*)in_buf, + *inout_uint32 = (uint32_t*)inout_buf, + *inout_uint32_for_check = (uint32_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint32[i] = 5; + inout_uint32[i] = inout_uint32_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (uint32_t)(in_uint32[i] + inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + 
print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != inout_uint32_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != in_uint32[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (in_uint32[i] | inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (uint32_t)(in_uint32[i] * inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + 
break; + } + } + print_status("MPI_PROD", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (in_uint32[i] & inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (in_uint32[i] ^ inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT32_T", correctness); + } + } + if( 64 == elem_size ) { + int64_t *in_uint64 = (int64_t*)in_buf, + *inout_uint64 = (int64_t*)inout_buf, + *inout_uint64_for_check = (int64_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint64[i] = 5; + inout_uint64[i] = inout_uint64_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (int64_t)(in_uint64[i] + inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", 
"MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != in_uint64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_uint64, in_uint64, count, MPI_UINT64_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != in_uint64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (in_uint64[i] | inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (in_uint64[i] ^ inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", 
"MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (int64_t)(in_uint64[i] * inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (in_uint64[i] & inout_uint64_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT64_T", correctness); + } + } + } + + if(*type=='f') { + float *in_float = (float*)in_buf, + *inout_float = (float*)inout_buf, + *inout_float_for_check = (float*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_float[i] = 1000.0+1; + inout_float[i] = inout_float_for_check[i] = 100.0+2; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != inout_float_for_check[i]+in_float[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_FLOAT", correctness); + } + + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + 
if(inout_float[i] != in_float[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_FLOAT", correctness); + } + + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_float,in_float,count, MPI_FLOAT, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != in_float[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_FLOAT", correctness); + } + + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != in_float[i] * inout_float_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_FLOAT", correctness); + } + } + + if(*type=='d') { + double *in_double = (double*)in_buf, + *inout_double = (double*)inout_buf, + *inout_double_for_check = (double*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_double[i] = 10.0+1; + inout_double[i] = inout_double_for_check[i] = 1.0+2; + } + + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != inout_double_for_check[i]+in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_DOUBLE", correctness); + } + + if( 0 == strcmp(op, "max") ) { + printf("#Local 
Reduce %s for %s: %d \n", "MPI_MAX", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_DOUBLE", correctness); + } + + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_double, in_double, count, MPI_DOUBLE, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_DOUBLE", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != inout_double_for_check[i]*in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_DOUBLE", correctness); + } + } + //tstart = MPI_Wtime(); + //memcpy(in_uint8,inout_uint8, count); + //memcpy(in_float, inout_float, count); + //memcpy(in_double, inout_double, count); + printf(" count %d time %.6f seconds\n",count, tend-tstart); + ompi_mpi_finalize(); + + free(in_buf); + free(inout_buf); + free(inout_check_buf); + + return correctness ? 
0 : -1; +} + diff --git a/test/datatype/sve_uint8_test.sh b/test/datatype/sve_uint8_test.sh new file mode 100644 index 00000000000..ed0cc151592 --- /dev/null +++ b/test/datatype/sve_uint8_test.sh @@ -0,0 +1,21 @@ +echo "ompi version with SVE -- Usage: arg1: count of elements, arg2: 'i'|'f'|'d' : datatype: integer, float, double. arg3: size of type. arg4: operation" +echo "your_path/mpirun -mca op sve -mca op_sve_hardware_available 1 -np 1 /your_test_path/Reduce_local_float 1048576 i 8 max" + +# test all vector sizes +for vector_len in 128 256 512 1024 2048 +do + + echo "=========Integer type all operations & all sizes========" + echo "" + echo "" + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + for (( i=1; i<31; i++ )) + do + for val in 1024 4096 16384 65536 262144 1048576 4194304 16777216 33554432 + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_uint8 $val i 8 sum | tee -a sve-sum-$vector_len.txt + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 0 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_uint8 $val i 8 sum | tee -a no-sve-sum-$vector_len.txt + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_uint8 $val i 8 cpy | tee -a sve-cpy-$vector_len.txt + done + done +done