diff --git a/configure.ac b/configure.ac index 16b31d5..3bf34c5 100755 --- a/configure.ac +++ b/configure.ac @@ -13,7 +13,7 @@ dnl Init Autoconf/Automake/Libtool -AC_INIT([Sandia OpenSHMEM Test Suite], [1.4.1], [https://github.com/Sandia-OpenSHMEM/SOS]) +AC_INIT([Sandia OpenSHMEM Test Suite], [1.4.2], [https://github.com/Sandia-OpenSHMEM/SOS]) AC_PREREQ([2.60]) AC_CONFIG_AUX_DIR([config]) AC_CONFIG_MACRO_DIR([config]) @@ -76,7 +76,7 @@ AM_CONDITIONAL([HAVE_OPENMP], [test "$enable_threads" != "no" -a "$enable_openmp AC_ARG_ENABLE([lengthy-tests], [AC_HELP_STRING([--enable-lengthy-tests], - [Enable long running tests in the test suite (default: disabled)])]) + [Execute long running tests as part of "make check" (default: disabled)])]) AM_CONDITIONAL([ENABLE_LENGTHY_TESTS], [test "$enable_lengthy_tests" = "yes"]) AC_ARG_ENABLE([fortran], @@ -185,6 +185,7 @@ AM_CONDITIONAL([HAVE_CXX], [test "$enable_cxx" != "no" ]) AM_CONDITIONAL([ENABLE_PROFILING], [test "$enable_profiling" = "yes" ]) dnl make tests work in standalone mode +AM_CONDITIONAL([USE_PMI_MPI], [false]) AM_CONDITIONAL([USE_PMI_SIMPLE], [false]) AM_CONDITIONAL([USE_PORTALS4], [false]) AM_CONDITIONAL([HAVE_LONG_FORTRAN_HEADER], [false]) diff --git a/test/apps/Makefile.am b/test/apps/Makefile.am index 7c8e91b..22dd555 100644 --- a/test/apps/Makefile.am +++ b/test/apps/Makefile.am @@ -54,5 +54,5 @@ LDADD += $(top_builddir)/pmi-simple/libpmi_simple.la endif mandelbrot_LDFLAGS = $(AM_LDFLAGS) $(PTHREAD_LIBS) -mandelbrot_CFLAGS = -I$(top_srcdir)/test/unit $(PTHREAD_CFLAGS) +mandelbrot_CFLAGS = -I$(top_srcdir)/test/include $(PTHREAD_CFLAGS) mandelbrot_LDADD = $(LDADD) $(PTHREAD_CFLAGS) diff --git a/test/apps/gups.c b/test/apps/gups.c index f7fc03e..7ed085f 100644 --- a/test/apps/gups.c +++ b/test/apps/gups.c @@ -178,7 +178,7 @@ #define ZERO64B 0LL uint64_t TotalMemOpt = 8192; -int NumUpdatesOpt = 0; /* FIXME: This option is ignored */ +uint64_t NumUpdatesOpt = 0; double SHMEMGUPs; double SHMEMRandomAccess_ErrorsFraction; double SHMEMRandomAccess_time; @@ -324,9 +324,7 @@ SHMEMRandomAccess(void) double TotalMem; static int sAbort, rAbort; - uint64_t NumUpdates_Default; /* Number of updates to table (suggested: 4x number of table entries) */ - uint64_t NumUpdates; /* actual number of updates to table - may be smaller than - * NumUpdates_Default due to execution time bounds */ + uint64_t NumUpdates; /* total number of updates to table */ uint64_t ProcNumUpdates; /* number of updates per processor */ static long pSync_bcast[SHMEM_BCAST_SYNC_SIZE]; @@ -421,9 +419,13 @@ SHMEMRandomAccess(void) HPCC_PELock[i] = 0; /* Default number of global updates to table: 4x number of table entries */ - NumUpdates_Default = 4 * TableSize; - ProcNumUpdates = 4 * LocalTableSize; - NumUpdates = NumUpdates_Default; + if (NumUpdatesOpt == 0) { + ProcNumUpdates = 4 * LocalTableSize; + NumUpdates = 4 * TableSize; + } else { + ProcNumUpdates = NumUpdatesOpt; + NumUpdates = NumUpdatesOpt * NumProcs; + } if (MyProc == 0) { fprintf( outFile, "Running on %d processors\n", NumProcs); @@ -432,7 +434,7 @@ SHMEMRandomAccess(void) fprintf( outFile, "PE Main table size = (2^%" PRIu64 ")/%d = %" PRIu64 " words/PE MAX\n", logTableSize, NumProcs, LocalTableSize); - fprintf( outFile, "Default number of updates (RECOMMENDED) = %" PRIu64 "\n", NumUpdates_Default); + fprintf( outFile, "Total number of updates = %" PRIu64 "\n", NumUpdates); } /* Initialize main table */ diff --git a/test/apps/mandelbrot.c b/test/apps/mandelbrot.c index 0c2c112..09a98f7 100644 --- a/test/apps/mandelbrot.c +++ b/test/apps/mandelbrot.c @@ -193,8 +193,8 @@ static void *thread_worker(void *arg) { // Malloc local (non-symmetric) buffers pixels[0] = malloc(sizeof(int)*job_points); pixels[1] = malloc(sizeof(int)*job_points); - pe_mask = malloc(sizeof(int)*npes); - pe_ct_max = malloc(sizeof(int)*npes); + pe_mask = calloc(npes, sizeof(int)); + pe_ct_max = calloc(npes, sizeof(int)); if (NULL == pixels[0] || NULL == pixels[1] || NULL == pe_mask || NULL == pe_ct_max) { printf("%d, %d: Error, thread malloc failed\n", me, tid); diff --git a/test/include/Makefile.am b/test/include/Makefile.am index e9d484e..bf93767 100644 --- a/test/include/Makefile.am +++ b/test/include/Makefile.am @@ -12,4 +12,5 @@ # distribution. noinst_HEADERS = \ - uthash.h + uthash.h \ + pthread_barrier.h diff --git a/test/unit/pthread_barrier.h b/test/include/pthread_barrier.h similarity index 100% rename from test/unit/pthread_barrier.h rename to test/include/pthread_barrier.h diff --git a/test/performance/shmem_perf_suite/Makefile.am b/test/performance/shmem_perf_suite/Makefile.am index 104a67e..ef9134f 100644 --- a/test/performance/shmem_perf_suite/Makefile.am +++ b/test/performance/shmem_perf_suite/Makefile.am @@ -9,7 +9,9 @@ check_PROGRAMS = \ shmem_latency_put_perf \ + shmem_latency_put_perf_nb \ shmem_latency_get_perf \ + shmem_latency_get_perf_nb \ shmem_bw_put_perf \ shmem_bw_put_perf_nb \ shmem_bibw_put_perf \ @@ -18,8 +20,6 @@ check_PROGRAMS = \ shmem_bw_get_perf_nb \ shmem_bibw_get_perf \ shmem_bibw_get_perf_nb \ - shmem_latency_nb_put_perf \ - shmem_latency_nb_get_perf \ shmem_bw_atomics_perf \ shmem_bibw_atomics_perf @@ -28,7 +28,11 @@ check_PROGRAMS += \ shmem_bw_put_ctx_perf \ shmem_bw_put_ctx_perf_nb \ shmem_bibw_put_ctx_perf \ - shmem_bibw_put_ctx_perf_nb + shmem_bibw_put_ctx_perf_nb \ + shmem_latency_put_ctx_perf \ + shmem_latency_put_ctx_perf_nb \ + shmem_latency_get_ctx_perf \ + shmem_latency_get_ctx_perf_nb endif noinst_HEADERS = \ @@ -41,7 +45,8 @@ noinst_HEADERS = \ uni_dir_ctx.h \ bi_dir.h \ bi_dir_ctx.h \ - target_put.h + target_put.h \ + latency_ctx.h if ENABLE_LENGTHY_TESTS TESTS = $(check_PROGRAMS) @@ -65,6 +70,12 @@ if USE_PMI_SIMPLE LDADD += $(top_builddir)/pmi-simple/libpmi_simple.la endif +shmem_latency_put_perf_nb_SOURCES = shmem_latency_put_perf.c +shmem_latency_put_perf_nb_CFLAGS = -DUSE_NONBLOCKING_API + +shmem_latency_get_perf_nb_SOURCES = shmem_latency_get_perf.c +shmem_latency_get_perf_nb_CFLAGS = -DUSE_NONBLOCKING_API + shmem_bw_put_perf_nb_SOURCES = shmem_bw_put_perf.c shmem_bw_put_perf_nb_CFLAGS = -DUSE_NONBLOCKING_API @@ -86,3 +97,15 @@ shmem_bibw_put_ctx_perf_CFLAGS = $(AM_OPENMP_CFLAGS) shmem_bibw_put_ctx_perf_nb_SOURCES = shmem_bibw_put_ctx_perf.c shmem_bibw_put_ctx_perf_nb_CFLAGS = $(AM_OPENMP_CFLAGS) -DUSE_NONBLOCKING_API + +shmem_latency_put_ctx_perf_CFLAGS = $(AM_OPENMP_CFLAGS) + +shmem_latency_put_ctx_perf_nb_SOURCES = shmem_latency_put_ctx_perf.c +shmem_latency_put_ctx_perf_nb_CFLAGS = $(AM_OPENMP_CFLAGS) -DUSE_NONBLOCKING_API + +shmem_latency_get_ctx_perf_CFLAGS = $(AM_OPENMP_CFLAGS) + +shmem_latency_get_ctx_perf_nb_SOURCES = shmem_latency_get_ctx_perf.c +shmem_latency_get_ctx_perf_nb_CFLAGS = $(AM_OPENMP_CFLAGS) -DUSE_NONBLOCKING_API + +AM_CPPFLAGS += -DENABLE_THREADS diff --git a/test/performance/shmem_perf_suite/bi_dir.h b/test/performance/shmem_perf_suite/bi_dir.h index d46437e..d857dfe 100644 --- a/test/performance/shmem_perf_suite/bi_dir.h +++ b/test/performance/shmem_perf_suite/bi_dir.h @@ -25,10 +25,10 @@ * SOFTWARE. */ -void static inline bi_bw_put(int len, perf_metrics_t *metric_info) +static inline void bi_bw_put(int len, perf_metrics_t * const metric_info) { double start = 0.0, end = 0.0; - int dest = partner_node(*metric_info); + int dest = partner_node(metric_info); unsigned long int i = 0, j = 0; static int check_once = 0; static int fin = -1; @@ -40,7 +40,7 @@ void static inline bi_bw_put(int len, perf_metrics_t *metric_info) dest); } /* hostname validation for all sender and receiver processes */ - int status = check_hostname_validation(*metric_info); + int status = check_hostname_validation(metric_info); if (status != 0) return; check_once++; } @@ -59,7 +59,7 @@ void static inline bi_bw_put(int len, perf_metrics_t *metric_info) } shmem_barrier_all(); - if (streaming_node(*metric_info)) { + if (streaming_node(metric_info)) { start = perf_shmemx_wtime(); } @@ -74,11 +74,11 @@ void static inline bi_bw_put(int len, perf_metrics_t *metric_info) shmem_quiet(); } - if (streaming_node(*metric_info)) { + if (streaming_node(metric_info)) { shmem_int_p(&fin, 1, dest); shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 0); end = perf_shmemx_wtime(); - calc_and_print_results(end, start, len, *metric_info); + calc_and_print_results(end, start, len, metric_info); } else { shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 1); shmem_int_p(&fin, 0, dest); @@ -86,10 +86,10 @@ void static inline bi_bw_put(int len, perf_metrics_t *metric_info) } -void static inline bi_bw_get(int len, perf_metrics_t *metric_info) +static inline void bi_bw_get(int len, perf_metrics_t * const metric_info) { double start = 0.0, end = 0.0; - int dest = partner_node(*metric_info); + int dest = partner_node(metric_info); unsigned long int i = 0, j = 0; static int check_once = 0; static int fin = -1; @@ -101,7 +101,7 @@ void static inline bi_bw_get(int len, perf_metrics_t *metric_info) dest); } /* hostname validation for all sender and receiver processes */ - int status = check_hostname_validation(*metric_info); + int status = check_hostname_validation(metric_info); if (status != 0) return; check_once++; } @@ -125,7 +125,7 @@ void static inline bi_bw_get(int len, perf_metrics_t *metric_info) } shmem_barrier_all(); - if (streaming_node(*metric_info)) { + if (streaming_node(metric_info)) { start = perf_shmemx_wtime(); } @@ -145,11 +145,11 @@ void static inline bi_bw_get(int len, perf_metrics_t *metric_info) #endif } - if (streaming_node(*metric_info)) { + if (streaming_node(metric_info)) { shmem_int_p(&fin, 1, dest); shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 0); end = perf_shmemx_wtime(); - calc_and_print_results(end, start, len, *metric_info); + calc_and_print_results(end, start, len, metric_info); } else { shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 1); shmem_int_p(&fin, 0, dest); diff --git a/test/performance/shmem_perf_suite/bi_dir_ctx.h b/test/performance/shmem_perf_suite/bi_dir_ctx.h index 8b9fe10..07003c6 100644 --- a/test/performance/shmem_perf_suite/bi_dir_ctx.h +++ b/test/performance/shmem_perf_suite/bi_dir_ctx.h @@ -26,34 +26,30 @@ */ -void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info) +static inline void bi_bw_ctx (int len, perf_metrics_t *metric_info) { double start = 0.0, end = 0.0; - int dest = partner_node(*metric_info); - int j = 0; - char *src = aligned_buffer_alloc(metric_info->nthreads * len); - char *dst = aligned_buffer_alloc(metric_info->nthreads * len); - assert(src && dst); + int dest = partner_node(metric_info); + unsigned long int i, j; static int check_once = 0; if (!check_once) { /* check to see whether sender and receiver are the same process */ if (dest == metric_info->my_node) { - fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n", - dest); + fprintf(stderr, "Warning: Sender and receiver are the same " + "process (%d)\n", dest); } /* hostname validation for all sender and receiver processes */ - int status = check_hostname_validation(*metric_info); + int status = check_hostname_validation(metric_info); if (status != 0) return; check_once++; } shmem_barrier_all(); -#pragma omp parallel default(none) firstprivate(len, dest) private(j) \ - shared(metric_info, src, dst, start, end) num_threads(metric_info->nthreads) +#pragma omp parallel default(none) firstprivate(len, dest) private(i, j) \ +shared(metric_info, start, end) num_threads(metric_info->nthreads) { - int i; const int thread_id = omp_get_thread_num(); shmem_ctx_t ctx; shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx); @@ -61,9 +57,11 @@ void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info) for (i = 0; i < metric_info->warmup; i++) { for(j = 0; j < metric_info->window_size; j++) { #ifdef USE_NONBLOCKING_API - shmem_ctx_putmem_nbi(ctx, dst + thread_id * len, src + thread_id * len, len, dest); + shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); #else - shmem_ctx_putmem(ctx, dst + thread_id * len, src + thread_id * len, len, dest); + shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); #endif } shmem_ctx_quiet(ctx); @@ -72,11 +70,10 @@ void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info) } shmem_barrier_all(); - if (streaming_node(*metric_info)) { -#pragma omp parallel default(none) firstprivate(len, dest) private(j) \ - shared(metric_info, src, dst, start, end) num_threads(metric_info->nthreads) + if (streaming_node(metric_info)) { +#pragma omp parallel default(none) firstprivate(len, dest) private(i, j) \ +shared(metric_info, start, end) num_threads(metric_info->nthreads) { - int i; const int thread_id = omp_get_thread_num(); shmem_ctx_t ctx; shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx); @@ -89,9 +86,11 @@ void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info) for (i = 0; i < metric_info->trials; i++) { for(j = 0; j < metric_info->window_size; j++) { #ifdef USE_NONBLOCKING_API - shmem_ctx_putmem_nbi(ctx, dst + thread_id * len, src + thread_id * len, len, dest); + shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); #else - shmem_ctx_putmem(ctx, dst + thread_id * len, src + thread_id * len, len, dest); + shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); #endif } shmem_ctx_quiet(ctx); @@ -99,10 +98,9 @@ void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info) shmem_ctx_destroy(ctx); } } else { -#pragma omp parallel default(none) firstprivate(len, dest) private(j) \ - shared(metric_info, src, dst, start, end) num_threads(metric_info->nthreads) +#pragma omp parallel default(none) firstprivate(len, dest) private(i, j) \ +shared(metric_info, start, end) num_threads(metric_info->nthreads) { - int i; const int thread_id = omp_get_thread_num(); shmem_ctx_t ctx; shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx); @@ -110,9 +108,11 @@ void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info) for (i = 0; i < metric_info->trials; i++) { for(j = 0; j < metric_info->window_size; j++) { #ifdef USE_NONBLOCKING_API - shmem_ctx_putmem_nbi(ctx, dst + thread_id * len, src + thread_id * len, len, dest); + shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); #else - shmem_ctx_putmem(ctx, dst + thread_id * len, src + thread_id * len, len, dest); + shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); #endif } shmem_ctx_quiet(ctx); @@ -122,14 +122,10 @@ void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info) } shmem_barrier_all(); - if (streaming_node(*metric_info)) { + if (streaming_node(metric_info)) { end = perf_shmemx_wtime(); - calc_and_print_results(end, start, len, *metric_info); + calc_and_print_results(end, start, len, metric_info); } shmem_barrier_all(); - - aligned_buffer_free(src); - aligned_buffer_free(dst); - } diff --git a/test/performance/shmem_perf_suite/bw_common.h b/test/performance/shmem_perf_suite/bw_common.h index 847c6ec..dfc4b32 100644 --- a/test/performance/shmem_perf_suite/bw_common.h +++ b/test/performance/shmem_perf_suite/bw_common.h @@ -31,431 +31,53 @@ #include #endif -#define MAX_MSG_SIZE (1<<23) -#define START_LEN 1 - -#define INC 2 -#define TRIALS 500 -#define WINDOW_SIZE 64 -#define WARMUP 50 - -#define TRIALS_LARGE 100 -#define WINDOW_SIZE_LARGE 64 -#define WARMUP_LARGE 10 -#define LARGE_MESSAGE_SIZE 8192 - -#define TARGET_SZ_MIN 8 -#define TARGET_SZ_MAX 4096 - -/*atomics common */ -#define ATOMICS_N_DTs 3 -/*note: ignoring cswap/swap for now in verification */ -#define ATOMICS_N_OPs 4 -/*PE 0 is printing its latency, thus have it not be the INCAST PE*/ -#define INCAST_PE 1 - -typedef enum { - UNI_DIR, - BI_DIR, -} bw_type; - -typedef enum { - STYLE_PUT, - STYLE_GET, - STYLE_RMA, - STYLE_ATOMIC -} bw_style; - -typedef enum { - FIRST_HALF, - SECOND_HALF, - FULL_SET -} red_PE_set; - -typedef enum { - COMM_PAIRWISE, - COMM_INCAST -} comm_style; - -typedef enum { - B, - KB, - MB -} bw_units; - -typedef struct perf_metrics { - unsigned long int start_len, max_len; - unsigned long int size_inc, trials; - unsigned long int window_size, warmup; - int validate; - int target_data; - int my_node, num_pes, sztarget, szinitiator, midpt; - bw_units unit; - char *src, *dest; - const char *bw_type; - bw_type type; - comm_style cstyle; - bw_style bwstyle; - int thread_safety; - int nthreads; - int individual_report; -} perf_metrics_t; - -long red_psync[SHMEM_REDUCE_SYNC_SIZE]; -long bar_psync[SHMEM_BARRIER_SYNC_SIZE]; +static const char * dt_names [] = { "uint", "ulong", "ulonglong" }; /*default settings if no input is provided */ -void static data_set_defaults(perf_metrics_t * data) { - data->start_len = START_LEN; - data->max_len = MAX_MSG_SIZE; - data->size_inc = INC; - data->trials = TRIALS; - data->window_size = WINDOW_SIZE; /*back-to-back msg stream*/ - data->warmup = WARMUP; /*number of initial iterations to skip*/ - data->unit = MB; - data->validate = false; - data->target_data = false; - data->my_node = -1; - data->num_pes = -1; - data->midpt = -1; - data->sztarget = -1; - data->szinitiator = -1; - data->src = NULL; - data->dest = NULL; - data->cstyle = COMM_PAIRWISE; - data->bwstyle = STYLE_RMA; - data->thread_safety = SHMEM_THREAD_SINGLE; - data->nthreads = 1; - data->individual_report = -1; +static +void init_metrics(perf_metrics_t *metric_info) { + metric_info->t_type = BW; + set_metric_defaults(metric_info); + + metric_info->unit = MB; + metric_info->target_data = false; + metric_info->cstyle = COMM_PAIRWISE; + metric_info->opstyle = STYLE_RMA; } -static int error_checking_init_target_usage(perf_metrics_t *metric_info) { - int error = false; - assert(metric_info->midpt > 0); - - if(metric_info->sztarget != -1 && metric_info->szinitiator != -1) - error = true; /* can't use them together */ - - if(metric_info->sztarget != -1) { - if(metric_info->sztarget < 1 || metric_info->sztarget > metric_info->midpt - || !metric_info->target_data) - error = true; - } else { - metric_info->sztarget = metric_info->midpt; - } - - if(metric_info->szinitiator != -1) { - if(metric_info->szinitiator < 1 || metric_info->szinitiator > metric_info->midpt - || !metric_info->target_data) - error = true; - } else { - metric_info->szinitiator = metric_info->midpt; - } - - if(error) { - fprintf(stderr, "invalid usage of command line arg -r/-l, use --help for info\n"); - return -1; - } - return 0; -} - -/* must use shmem_init beforehand */ -static int data_runtime_update(perf_metrics_t *data) { - data->my_node = shmem_my_pe(); - data->num_pes = shmem_n_pes(); - assert(data->num_pes); - data->midpt = data->num_pes/2; - return error_checking_init_target_usage(data); -} - -static const char * dt_names [] = { "int", "long", "longlong" }; - -void static bi_dir_data_init(perf_metrics_t * data) { - data->bw_type = "Bi-dir"; - data->type = BI_DIR; -} - -void static uni_dir_data_init(perf_metrics_t * data) { - data->bw_type = "Uni-dir"; - data->type = UNI_DIR; -} - - -int static inline partner_node(perf_metrics_t my_info) -{ - if(my_info.num_pes == 1) - return 0; - - if(my_info.cstyle == COMM_PAIRWISE) { - int pairs = my_info.midpt; - - return (my_info.my_node < pairs ? (my_info.my_node + pairs) : - (my_info.my_node - pairs)); - } else { - assert(my_info.cstyle == COMM_INCAST); - return INCAST_PE; - } -} - -int static inline streaming_node(perf_metrics_t my_info) -{ - if(my_info.cstyle == COMM_PAIRWISE) { - return (my_info.my_node < my_info.szinitiator); - } else { - assert(my_info.cstyle == COMM_INCAST); - return true; - } -} - -static int inline is_streaming_node(perf_metrics_t my_info, int node) -{ - if(my_info.cstyle == COMM_PAIRWISE) { - return (node < my_info.szinitiator); +static +void update_bw_type(perf_metrics_t *data, int b_type) { + if (b_type == BI_DIR) { + data->bw_type_str = "Bi-dir"; + data->b_type = BI_DIR; } else { - assert(my_info.cstyle == COMM_INCAST); - return true; + data->bw_type_str = "Uni-dir"; + data->b_type = UNI_DIR; } } -int static inline target_node(perf_metrics_t my_info) -{ - return (my_info.my_node >= my_info.midpt && - (my_info.my_node < (my_info.midpt + my_info.sztarget))); -} - -/* put/get bw use opposite streaming/validate nodes */ -red_PE_set static inline validation_set(perf_metrics_t my_info, int *nPEs) -{ - if(my_info.cstyle == COMM_PAIRWISE) { - if(streaming_node(my_info)) { - *nPEs = my_info.szinitiator; - return FIRST_HALF; - } else if(target_node(my_info)) { - *nPEs = my_info.sztarget; - return SECOND_HALF; - } else { - fprintf(stderr, "Warning: you are getting data from a node that " - "wasn't a part of the perf set \n "); - return 0; - } - } else { - assert(my_info.cstyle == COMM_INCAST); - *nPEs = my_info.num_pes; - return FULL_SET; - } -} - -/**************************************************************/ -/* Input Checking */ -/**************************************************************/ - -static int command_line_arg_check(int argc, char *argv[], - perf_metrics_t *metric_info) { - int ch, error = false; - extern char *optarg; - - /* check command line args */ - while ((ch = getopt(argc, argv, "e:s:n:w:p:r:l:kbivtC:T:")) != EOF) { - switch (ch) { - case 's': - metric_info->start_len = strtoul(optarg, (char **)NULL, 0); - if ( metric_info->start_len < 1 ) metric_info->start_len = 1; - if(!is_pow_of_2(metric_info->start_len)) { - fprintf(stderr, "Error: start_length must be a power of two\n"); - error = true; - } - if (metric_info->start_len > INT_MAX) { - fprintf(stderr, "Error: start_length is out of integer range\n"); - error = true; - } - break; - case 'e': - metric_info->max_len = strtoul(optarg, (char **)NULL, 0); - if(!is_pow_of_2(metric_info->max_len)) { - fprintf(stderr, "Error: end_length must be a power of two\n"); - error = true; - } - if(metric_info->max_len < metric_info->start_len) { - fprintf(stderr, "Error: end_length (%ld) must be >= " - "start_length (%ld)\n", metric_info->max_len, - metric_info->start_len); - error = true; - } - if (metric_info->max_len > INT_MAX) { - fprintf(stderr, "Error: end_length is out of integer range\n"); - error = true; - } - break; - case 'n': - metric_info->trials = strtoul(optarg, (char **)NULL, 0); - if(metric_info->trials < (metric_info->warmup*2)) { - fprintf(stderr, "Error: trials (%ld) must be >= 2*warmup " - "(%ld)\n", metric_info->trials, metric_info->warmup*2); - error = true; - } - break; - case 'p': - metric_info->warmup = strtoul(optarg, (char **)NULL, 0); - if(metric_info->warmup > (metric_info->trials/2)) { - fprintf(stderr, "Error: warmup (%ld) must be <= trials/2 " - "(%ld)\n", metric_info->warmup, metric_info->trials/2); - error = true; - } - break; - case 'k': - metric_info->unit = KB; - break; - case 'b': - metric_info->unit = B; - break; - case 'v': - metric_info->validate = true; - if(metric_info->target_data) error = true; - break; - case 'w': - metric_info->window_size = strtoul(optarg, (char **)NULL, 0); - if(metric_info->target_data) error = true; - break; - case 't': - metric_info->target_data = true; - metric_info->window_size = 1; - if(metric_info->validate) error = true; - break; - case 'r': - metric_info->sztarget = strtoul(optarg, (char **)NULL, 0); - break; - case 'l': - metric_info->szinitiator = strtoul(optarg, (char **)NULL, 0); - break; - case 'C': - if (strcmp(optarg, "SINGLE") == 0) { - metric_info->thread_safety = SHMEM_THREAD_SINGLE; - } else if (strcmp(optarg, "FUNNELED") == 0) { - metric_info->thread_safety = SHMEM_THREAD_FUNNELED; - } else if (strcmp(optarg, "SERIALIZED") == 0) { - metric_info->thread_safety = SHMEM_THREAD_SERIALIZED; - } else if (strcmp(optarg, "MULTIPLE") == 0) { - metric_info->thread_safety = SHMEM_THREAD_MULTIPLE; - } else { - fprintf(stderr, "Invalid threading level: \"%s\"\n", optarg); - error = true; - } - break; - case 'T': - metric_info->nthreads = atoi(optarg); - break; - case 'i': - metric_info->individual_report = 1; - break; - default: - error = true; - break; - } - } - - /* filling in 8/4KB chunks into array alloc'd to max_len */ - if(metric_info->target_data) { - metric_info->start_len = TARGET_SZ_MIN; - if((metric_info->max_len < - ((metric_info->trials + metric_info->warmup) * TARGET_SZ_MIN)) || - (metric_info->max_len < - ((metric_info->trials + metric_info->warmup) * TARGET_SZ_MAX))) { - error = true; - } - } - - if (error) { - if (metric_info->my_node == 0) { - fprintf(stderr, "Usage: \n[-s start_length] [-e end_length] " - ": lengths should be a power of two \n" - "[-n trials (must be greater than 2*warmup (default: x => 100))] \n" - "[-p warm-up (see trials for value restriction)] \n" - "[-w window size - iterations between completion, cannot use with -t] \n" - "[-k (kilobytes/second)] [-b (bytes/second)] \n" - "[-v (validate data stream)] \n" - "[-i (turn on individual bandwidth reporting)] \n" - "[-t output data for target side (default is initiator," - " only use with put_bw),\n cannot be used in conjunction " - "with validate, special sizes used, \ntrials" - " + warmup * sizes (8/4KB) <= max length \n" - "[-r number of nodes at target, use only with -t] \n" - "[-l number of nodes at initiator, use only with -t, " - "l/r cannot be used together] \n" - "[-C thread-safety-config: SINGLE, FUNNELED, SERIALIZED, or MULTIPLE] \n" - "[-T num-threads] \n"); - } - return -1; - } - return 0; -} - -static inline int only_even_PEs_check(int my_node, int num_pes) { - if (num_pes % 2 != 0) { - if (my_node == 0) { - fprintf(stderr, "Only even number of nodes can be used\n"); - } - return 77; - } else - return 0; -} - /**************************************************************/ /* Result Printing and Calc */ /**************************************************************/ -static const char *thread_safety_str(perf_metrics_t *metric_info) { - if (metric_info->thread_safety == SHMEM_THREAD_SINGLE) { - return "SINGLE"; - } else if (metric_info->thread_safety == SHMEM_THREAD_FUNNELED) { - return "FUNNELED"; - } else if (metric_info->thread_safety == SHMEM_THREAD_SERIALIZED) { - return "SERIALIZED"; - } else if (metric_info->thread_safety == SHMEM_THREAD_MULTIPLE) { - return "MULTIPLE"; - } else { - fprintf(stderr, "Unexpected thread safety value: %d. Setting it to SINGLE\n", metric_info->thread_safety); - metric_info->thread_safety = SHMEM_THREAD_SINGLE; - return "SINGLE"; - } -} +static +void print_atomic_header(perf_metrics_t * const metric_info) { + print_header(metric_info); + printf("\n\nBandwidth test type: %10s\n", metric_info->bw_type_str); -static void inline thread_safety_validation_check(perf_metrics_t *metric_info) { - if (metric_info->nthreads == 1) - return; - else { - if (metric_info->thread_safety != SHMEM_THREAD_MULTIPLE) { - if(metric_info->my_node == 0) { - fprintf(stderr, "Warning: argument \"-T %d\" is ignored because of the thread level specified." - " Switching to single thread with thread safety %s\n", metric_info->nthreads, - thread_safety_str(metric_info)); - } - metric_info->nthreads = 1; - } - return; - } -} - -void static print_atomic_results_header(perf_metrics_t metric_info) { - printf("\nSandia OpenSHMEM Performance Suite\n"); - printf("==================================\n"); - printf("Total Number of PEs: %10d\n", metric_info.num_pes); - printf("Iteration count: %10lu\n", metric_info.trials); - printf("Window size: %10lu\n", metric_info.window_size); - printf("Bandwidth test type: %10s\n", metric_info.bw_type); - - if (metric_info.cstyle == COMM_INCAST) { + if (metric_info->cstyle == COMM_INCAST) { printf("Communication style: INCAST\n"); } else { - assert(metric_info.cstyle == COMM_PAIRWISE); + assert(metric_info->cstyle == COMM_PAIRWISE); printf("Communication style: PAIRWISE\n"); } printf("\nOperation%15sBandwidth%15sMessage Rate%15sLatency\n", " ", " ", " "); - if (metric_info.unit == MB) { + if (metric_info->unit == MB) { printf("%19s in mbytes/sec"," "); - } else if (metric_info.unit == KB) { + } else if (metric_info->unit == KB) { printf("%19s in kbytes/sec", " "); } else { printf("%20s in bytes/sec", " "); @@ -464,26 +86,17 @@ void static print_atomic_results_header(perf_metrics_t metric_info) { printf("%15s in Mops/sec%15s in us\n", " ", " "); } -void static print_results_header(perf_metrics_t metric_info) { - printf("\nSandia OpenSHMEM Performance Suite\n"); - printf("==================================\n"); - printf("Total Number of PEs: %10d\n", metric_info.num_pes); - printf("Number of source PEs: %10d\n", metric_info.szinitiator); - printf("Number of target PEs: %10d\n", metric_info.sztarget); - printf("Iteration count: %10lu\n", metric_info.trials); - printf("Window size: %10lu\n", metric_info.window_size); - printf("Maximum message size: %10lu\n", metric_info.max_len); - printf("Number of threads: %10d\n", metric_info.nthreads); - printf("Thread safety: %10s\n", thread_safety_str(&metric_info)); - printf("Bandwidth test type: %10s\n", metric_info.bw_type); - - printf("\nMessage Size%15sBandwidth%15sMessage Rate\n", - " ", " "); +static +void print_bw_header(perf_metrics_t * const metric_info) { + print_header(metric_info); + printf("\n\nBandwidth test type: %10s\n", metric_info->bw_type_str); + + printf("\nMessage Size%15sBandwidth%15sMessage Rate\n", " ", " "); printf("%4sin bytes", " "); - if (metric_info.unit == MB) { + if (metric_info->unit == MB) { printf("%11sin mbytes/sec", " "); - } else if (metric_info.unit == KB) { + } else if (metric_info->unit == KB) { printf("%11sin kbytes/sec", " "); } else { printf("%12sin bytes/sec", " "); @@ -492,83 +105,69 @@ void static print_results_header(perf_metrics_t metric_info) { printf("%16sin msgs/sec\n", " "); } -void static print_data_results(double bw, double mr, perf_metrics_t data, +static +void print_data_results(double bw, double mr, const perf_metrics_t * const data, int len, double total_t) { static int atomic_type_index = 0; - if(data.target_data) { - if(data.my_node < data.midpt) { - printf("initiator:\n"); - } else { - printf("target:\n"); - } - } - - if (data.bwstyle == STYLE_ATOMIC) { + if (data->opstyle == STYLE_ATOMIC) { printf("%-10s", dt_names[atomic_type_index]); atomic_type_index = (atomic_type_index + 1) % ATOMICS_N_DTs; } else printf("%2s%10d", " ", len); - if(data.unit == KB) { + if(data->unit == KB) { bw = bw * 1.0e3; - } else if(data.unit == B) { + } else if(data->unit == B) { bw = bw * 1.0e6; } - if (data.bwstyle == STYLE_ATOMIC) { - printf("%13s%10.2f%15s%12.2f%12s%10.2f\n", " ", bw, " ", - mr/1.0e6, " ", total_t/(data.trials * data.window_size)); + if (data->opstyle == STYLE_ATOMIC) { + printf("%13s%10.2f%15s%12.2f%12s%10.2f", " ", bw, " ", + mr/1.0e6, " ", total_t/(data->trials * data->window_size)); } else - printf("%14s%10.2f%15s%12.2f\n", " ", bw, " ", mr); -} - - -/* reduction to collect performance results from PE set - then start_pe will print results --- assumes num_pes is even */ -void static inline PE_set_used_adjustments(int *nPEs, int *stride, int *start_pe, - perf_metrics_t my_info) -{ - red_PE_set PE_set = validation_set(my_info, nPEs); + printf("%14s%10.2f%15s%12.2f", " ", bw, " ", mr); - if(PE_set == FIRST_HALF || PE_set == FULL_SET) { - *start_pe = 0; - } - else { - assert(PE_set == SECOND_HALF); - *start_pe = my_info.midpt; + if(data->target_data) { + if(data->my_node < data->szinitiator) { + printf("%2sIniter", " "); + } else { + printf("%2sTarget", " "); + } } - *stride = 0; /* back to back PEs */ + printf("\n"); } - -void static inline calc_and_print_results(double end_t, double start_t, int len, - perf_metrics_t metric_info) -{ +static inline +void calc_and_print_results(double end_t, double start_t, int len, + perf_metrics_t * const metric_info) { int stride = 0, start_pe = 0, nPEs = 0; static double pe_bw_sum, bw = 0.0; /*must be symmetric for reduction*/ double pe_bw_avg = 0.0, pe_mr_avg = 0.0; int nred_elements = 1; static double pwrk[SHMEM_REDUCE_MIN_WRKDATA_SIZE]; - static double pe_time_start, pe_time_end, end_time_max = 0.0, start_time_min = 0.0; + static double pe_time_start, pe_time_end, + end_time_max = 0.0, start_time_min = 0.0; double total_t = 0.0, total_t_max = 0.0; int multiplier = 1; PE_set_used_adjustments(&nPEs, &stride, &start_pe, metric_info); /* 2x as many messages at once for bi-directional */ - if(metric_info.type == BI_DIR) + if(metric_info->b_type == BI_DIR) multiplier = 2; if (end_t > 0 && start_t > 0 && (end_t - start_t) > 0) { total_t = end_t - start_t; #ifdef ENABLE_OPENMP - bw = ((double) len * (double) multiplier / 1.0e6 * metric_info.window_size * metric_info.trials * - (double) metric_info.nthreads) / (total_t / 1.0e6); + bw = ((double) len * (double) metric_info->num_partners * (double) multiplier / 1.0e6 * + metric_info->window_size * metric_info->trials * + (double) metric_info->nthreads) / (total_t / 1.0e6); #else - bw = ((double) len * (double) multiplier / 1.0e6 * metric_info.window_size * metric_info.trials) / - (total_t / 1.0e6); + bw = ((double) len * (double) metric_info->num_partners * (double) multiplier / 1.0e6 * + metric_info->window_size * metric_info->trials) / + (total_t / 1.0e6); #endif } else { fprintf(stderr, "Incorrect time measured from bandwidth test: " @@ -578,90 +177,102 @@ void static inline calc_and_print_results(double end_t, double start_t, int len, /* base case: will be overwritten by collective if num_pes > 2 */ pe_bw_sum = bw; - if (metric_info.individual_report == 1) { - printf("Individual bandwith for PE %6d is %10.2f\n", - metric_info.my_node, pe_bw_sum); + if (metric_info->individual_report == 1) { + if (metric_info->my_node < metric_info->midpt) { + printf("Individual bandwith for PE %6d (initer) is %10.2f\n", + metric_info->my_node, pe_bw_sum); + } else { + printf("Individual bandwith for PE %6d (target) is %10.2f\n", + metric_info->my_node, pe_bw_sum); + } } pe_time_start = start_t; pe_time_end = end_t; shmem_barrier(start_pe, stride, nPEs, bar_psync); - if (nPEs >= 2) { - shmem_double_min_to_all(&start_time_min, &pe_time_start, nred_elements, + if (metric_info->cstyle != COMM_INCAST) { + if (nPEs >= 2) { + shmem_double_min_to_all(&start_time_min, &pe_time_start, nred_elements, start_pe, stride, nPEs, pwrk, red_psync); - shmem_barrier(start_pe, stride, nPEs, bar_psync); - shmem_double_max_to_all(&end_time_max, &pe_time_end, nred_elements, + shmem_barrier(start_pe, stride, nPEs, bar_psync); + shmem_double_max_to_all(&end_time_max, &pe_time_end, nred_elements, start_pe, stride, nPEs, pwrk, red_psync); - } else if (nPEs == 1) { - start_time_min = pe_time_start; - end_time_max = pe_time_end; - } + } else if (nPEs == 1) { + start_time_min = pe_time_start; + end_time_max = pe_time_end; + } - /* calculating bandwidth based on the highest time duration across all PEs */ - if (end_time_max > 0 && start_time_min > 0 && - (end_time_max - start_time_min) > 0) { + /* calculating bandwidth based on the highest time duration across all PEs */ + if (end_time_max > 0 && start_time_min > 0 && + (end_time_max - start_time_min) > 0) { - total_t_max = (end_time_max - start_time_min); + total_t_max = (end_time_max - start_time_min); + int total_transfers = MAX(metric_info->szinitiator, metric_info->sztarget); #ifdef ENABLE_OPENMP - bw = ((double) len * (double) multiplier * (double) metric_info.midpt / 1.0e6 * metric_info.window_size * - metric_info.trials * (double) metric_info.nthreads) / - (total_t_max / 1.0e6); + bw = ((double) len * (double) multiplier * (double) total_transfers / + 1.0e6 * metric_info->window_size * metric_info->trials * + (double) metric_info->nthreads) / (total_t_max / 1.0e6); #else - bw = ((double) len * (double) multiplier * (double) metric_info.midpt / 1.0e6 * metric_info.window_size * - metric_info.trials) / (total_t_max / 1.0e6); + bw = ((double) len * (double) multiplier * (double) total_transfers / + 1.0e6 * metric_info->window_size * metric_info->trials) / + (total_t_max / 1.0e6); #endif - } else { - fprintf(stderr, "Incorrect time measured from bandwidth test: " + } else { + fprintf(stderr, "Incorrect time measured from bandwidth test: " "start_min = %lf, end_max = %lf\n", start_time_min, end_time_max); - } - - pe_bw_sum = bw; + } + pe_bw_sum = bw; + } else { + if (nPEs >= 2) { + shmem_double_sum_to_all(&pe_bw_sum, &bw, nred_elements, + start_pe, stride, nPEs, pwrk, + red_psync); + } else if (nPEs == 1) { + pe_bw_sum = bw; + } + } /* aggregate bw since bw op pairs are communicating simultaneously */ - if(metric_info.my_node == start_pe) { + if(metric_info->my_node == start_pe) { pe_bw_avg = pe_bw_sum; pe_mr_avg = pe_bw_avg / (len / 1.0e6); print_data_results(pe_bw_avg, pe_mr_avg, metric_info, len, total_t); } } -void static inline large_message_metric_chg(perf_metrics_t *metric_info, int len) { - if(len > LARGE_MESSAGE_SIZE) { - metric_info->window_size = WINDOW_SIZE_LARGE; - metric_info->trials = TRIALS_LARGE; - metric_info->warmup = WARMUP_LARGE; - } -} - -static void validate_atomics(perf_metrics_t m_info) { +static int validate_atomics(perf_metrics_t * const m_info) { int snode = streaming_node(m_info); - int * my_buf = (int *)m_info.dest; - bw_type tbw = m_info.type; - int expected_val = 0; - unsigned int ppe_exp_val = ((m_info.trials + m_info.warmup) * m_info.window_size - * ATOMICS_N_DTs * ATOMICS_N_OPs) + m_info.my_node; - - if(m_info.cstyle == COMM_INCAST) { - if(tbw == BI_DIR) + int * my_buf = (int *)m_info->dest; + bw_type tbw = m_info->b_type; + int expected_val = 0, errors = 0; + unsigned int ppe_exp_val = ((m_info->trials + m_info->warmup) * m_info->window_size + * ATOMICS_N_DTs * ATOMICS_N_OPs) + m_info->my_node; + + if (m_info->cstyle == COMM_INCAST) { + if (tbw == BI_DIR) printf("WARNING: This use-case is not currently well defined\n"); - if(m_info.my_node == 0) { - expected_val = ppe_exp_val * m_info.num_pes; + if (m_info->my_node == 0) { + expected_val = ppe_exp_val * m_info->num_pes; } else - expected_val = m_info.my_node; + expected_val = m_info->my_node; } else { - assert(m_info.cstyle == COMM_PAIRWISE); + assert(m_info->cstyle == COMM_PAIRWISE); expected_val = ppe_exp_val; } - if((!snode && tbw == UNI_DIR) || tbw == BI_DIR) { - if(my_buf[0] != expected_val) - printf("validation error for PE %d: %d != %d \n", m_info.my_node, my_buf[0], + if ((!snode && tbw == UNI_DIR) || tbw == BI_DIR) { + if(my_buf[0] != expected_val) { + printf("Validation error for PE %d: %d != %d \n", m_info->my_node, my_buf[0], expected_val); + errors++; + } } + + return errors; } /**************************************************************/ @@ -673,33 +284,37 @@ static void validate_atomics(perf_metrics_t m_info) { * NOTE: post function validation assumptions, data isn't flushed pre/post */ extern void bi_dir_bw(int len, perf_metrics_t *metric_info); -void static inline bi_dir_bw_test_and_output(perf_metrics_t metric_info) { +static inline +void bi_dir_bw_test_and_output(perf_metrics_t * const metric_info) { int partner_pe = partner_node(metric_info); unsigned long int len; - if(metric_info.my_node == 0) { - if (metric_info.bwstyle == STYLE_ATOMIC) - print_atomic_results_header(metric_info); + if(metric_info->my_node == 0) { + if (metric_info->opstyle == STYLE_ATOMIC) + print_atomic_header(metric_info); else - print_results_header(metric_info); + print_bw_header(metric_info); } - for (len = metric_info.start_len; len <= metric_info.max_len; - len *= metric_info.size_inc) { + for (len = metric_info->start_len; len <= metric_info->max_len; + len *= metric_info->size_inc) { - large_message_metric_chg(&metric_info, len); + large_message_metric_chg(metric_info, len); - bi_dir_bw(len, &metric_info); + bi_dir_bw(len, metric_info); } shmem_barrier_all(); - if(metric_info.validate) { - if(metric_info.bwstyle != STYLE_ATOMIC) { - validate_recv(metric_info.dest, metric_info.max_len, partner_pe); + if (metric_info->validate) { + int errors = -1; + if (metric_info->opstyle != STYLE_ATOMIC) { + errors = validate_recv(metric_info->dest, metric_info->max_len, partner_pe); } else { - validate_atomics(metric_info); + errors = validate_atomics(metric_info); } + if (errors >= 0) + printf("Validation complete (%d errors)\n", errors); } } @@ -707,39 +322,43 @@ void static inline bi_dir_bw_test_and_output(perf_metrics_t metric_info) { /* UNI-Directional BW */ /**************************************************************/ -/*have one symmetric char array metric_info->buf of max_len to use for +/* have one symmetric char array metric_info->buf of max_len to use for * calculation initalized with my_node number * NOTE: post function validation assumptions, data isn't flushed pre/post */ extern void uni_dir_bw(int len, perf_metrics_t *metric_info); -void static inline uni_dir_bw_test_and_output(perf_metrics_t metric_info) { +static inline +void uni_dir_bw_test_and_output(perf_metrics_t * const metric_info) { int partner_pe = partner_node(metric_info); unsigned long int len = 0; - if(metric_info.my_node == 0) { - if (metric_info.bwstyle == STYLE_ATOMIC) - print_atomic_results_header(metric_info); + if(metric_info->my_node == 0) { + if (metric_info->opstyle == STYLE_ATOMIC) + print_atomic_header(metric_info); else - print_results_header(metric_info); + print_bw_header(metric_info); } - for (len = metric_info.start_len; len <= metric_info.max_len; - len *= metric_info.size_inc) { + for (len = metric_info->start_len; len <= metric_info->max_len; + len *= metric_info->size_inc) { - large_message_metric_chg(&metric_info, len); + large_message_metric_chg(metric_info, len); - uni_dir_bw(len, &metric_info); + uni_dir_bw(len, metric_info); } shmem_barrier_all(); - if(metric_info.validate) { - if((streaming_node(metric_info) && metric_info.bwstyle == STYLE_GET) || - (target_node(metric_info) && metric_info.bwstyle == STYLE_PUT)) { - validate_recv(metric_info.dest, metric_info.max_len, partner_pe); - } else if(metric_info.bwstyle == STYLE_ATOMIC) { - validate_atomics(metric_info); + if (metric_info->validate) { + int errors = -1; + if ((streaming_node(metric_info) && metric_info->opstyle == STYLE_GET) || + (target_node(metric_info) && metric_info->opstyle == STYLE_PUT)) { + errors = validate_recv(metric_info->dest, metric_info->max_len, partner_pe); + } else if (metric_info->opstyle == STYLE_ATOMIC) { + errors = validate_atomics(metric_info); } + if (errors >= 0) + printf("Validation complete (%d errors)\n", errors); } } @@ -747,187 +366,132 @@ void static inline uni_dir_bw_test_and_output(perf_metrics_t metric_info) { /* INIT and teardown of resources */ /**************************************************************/ -/*create and init (with my_PE_num) two symmetric arrays on the heap */ -static inline int bw_init_data_stream(perf_metrics_t *metric_info, - int argc, char *argv[]) { +/* create and init (with my_PE_num) two symmetric arrays on the heap */ +static inline +int bw_init_data_stream(perf_metrics_t * const metric_info, + int argc, char *argv[]) { - int i = 0; - data_set_defaults(metric_info); + init_metrics(metric_info); int ret = command_line_arg_check(argc, argv, metric_info); - if (ret != 0) { - return -1; - } #ifndef VERSION_1_0 +#if defined(ENABLE_THREADS) int tl; shmem_init_thread(metric_info->thread_safety, &tl); if(tl != metric_info->thread_safety) { fprintf(stderr,"Could not initialize with requested thread " "level %d: got %d\n", metric_info->thread_safety, tl); - return -2; + return -1; } +#else + shmem_init(); +#endif #else start_pes(0); #endif - if (data_runtime_update(metric_info) == -1) - return -2; - thread_safety_validation_check(metric_info); - metric_info->sztarget = metric_info->midpt; - metric_info->szinitiator = metric_info->midpt; + update_metrics(metric_info); - for(i = 0; i < SHMEM_REDUCE_SYNC_SIZE; i++) - red_psync[i] = SHMEM_SYNC_VALUE; + if (ret) { + if (metric_info->my_node == 0) { + print_usage(ret); + } + return -1; + } else { + if (metric_info->num_pes < 2) { + fprintf(stderr, "This test requires at least two processes.\n"); + print_usage(1); + return -1; + } + } - for(i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++) - bar_psync[i] = SHMEM_SYNC_VALUE; + if (error_checking_init_target_usage(metric_info) == -1) + return -1; +#if defined(ENABLE_THREADS) + thread_safety_validation_check(metric_info); +#endif + init_psync_arrays(); - if (only_even_PEs_check(metric_info->my_node, metric_info->num_pes) != 0) { - return -2; + if(only_even_PEs_check(metric_info->my_node, metric_info->num_pes) != 0) { + return -1; } - metric_info->src = aligned_buffer_alloc(metric_info->max_len); - init_array(metric_info->src, metric_info->max_len, metric_info->my_node); + metric_info->src = aligned_buffer_alloc(metric_info->max_len * metric_info->nthreads); + init_array(metric_info->src, metric_info->max_len * metric_info->nthreads, metric_info->my_node); - metric_info->dest = aligned_buffer_alloc(metric_info->max_len); - init_array(metric_info->dest, metric_info->max_len, metric_info->my_node); + metric_info->dest = aligned_buffer_alloc(metric_info->max_len * metric_info->nthreads); + init_array(metric_info->dest, metric_info->max_len * metric_info->nthreads, metric_info->my_node); return 0; } -static inline int bi_dir_init(perf_metrics_t *metric_info, int argc, - char *argv[]) { +static inline +int bi_dir_init(perf_metrics_t * const metric_info, int argc, + char *argv[], op_style opstyle) { int ret = bw_init_data_stream(metric_info, argc, argv); if (ret == 0) { - bi_dir_data_init(metric_info); + metric_info->opstyle = opstyle; + update_bw_type(metric_info, BI_DIR); return 0; } else return ret; } -static inline int uni_dir_init(perf_metrics_t *metric_info, int argc, - char *argv[], bw_style bwstyl) { +static inline +int uni_dir_init(perf_metrics_t * const metric_info, int argc, + char *argv[], op_style opstyle) { int ret = bw_init_data_stream(metric_info, argc, argv); if (ret == 0) { /* uni-dir validate needs to know if its a put or get */ - metric_info->bwstyle = bwstyl; - uni_dir_data_init(metric_info); + metric_info->opstyle = opstyle; + update_bw_type(metric_info, UNI_DIR); return 0; } else return ret; } -void static inline bw_data_free(perf_metrics_t *metric_info) { +static inline +void bw_data_free(const perf_metrics_t * const metric_info) { shmem_barrier_all(); aligned_buffer_free(metric_info->src); aligned_buffer_free(metric_info->dest); } -static void inline bw_finalize(void) { +static inline +void bw_finalize(void) { #ifndef VERSION_1_0 shmem_finalize(); #endif } -void static inline bi_dir_bw_main(int argc, char *argv[]) { +static inline +void bi_dir_bw_main(int argc, char *argv[], op_style opstyle) { perf_metrics_t metric_info; - int ret = bi_dir_init(&metric_info, argc, argv); + int ret = bi_dir_init(&metric_info, argc, argv, opstyle); if (ret == 0) { - bi_dir_bw_test_and_output(metric_info); + bi_dir_bw_test_and_output(&metric_info); bw_data_free(&metric_info); } - if (ret != -1) - bw_finalize(); -} /*main() */ + bw_finalize(); +} -void static inline uni_dir_bw_main(int argc, char *argv[], bw_style bwstyl) { +static inline +void uni_dir_bw_main(int argc, char *argv[], op_style opstyle) { perf_metrics_t metric_info; - int ret = uni_dir_init(&metric_info, argc, argv, bwstyl); + int ret = uni_dir_init(&metric_info, argc, argv, opstyle); if (ret == 0) { - uni_dir_bw_test_and_output(metric_info); + uni_dir_bw_test_and_output(&metric_info); bw_data_free(&metric_info); } - if (ret != -1) - bw_finalize(); -} /*main() */ - -static inline int check_hostname_validation(perf_metrics_t my_info) { - - int hostname_status = -1; - - /* hostname_size should be a length divisible by 4 */ - int hostname_size = (MAX_HOSTNAME_LEN % 4 == 0) ? MAX_HOSTNAME_LEN : - MAX_HOSTNAME_LEN + (4 - MAX_HOSTNAME_LEN % 4); - int i, errors = 0; - - /* pSync for fcollect of hostnames */ - static long pSync_collect[SHMEM_COLLECT_SYNC_SIZE]; - for (i = 0; i < SHMEM_COLLECT_SYNC_SIZE; i++) - pSync_collect[i] = SHMEM_SYNC_VALUE; - - char *hostname = (char *) shmem_malloc (hostname_size * sizeof(char)); - char *dest = (char *) shmem_malloc (my_info.num_pes * hostname_size * sizeof(char)); - - hostname_status = gethostname(hostname, hostname_size); - if (hostname_status != 0) { - fprintf(stderr, "gethostname failed (%d)\n", hostname_status); - return -1; - } - shmem_barrier_all(); - - /* nelems needs to be updated based on 32-bit API */ - shmem_fcollect32(dest, hostname, hostname_size/4, 0, 0, my_info.num_pes, pSync_collect); - - char *snode_name = NULL; - char *tnode_name = NULL; - for (i = 0; i < my_info.num_pes; i++) { - char *curr_name = &dest[i * hostname_size]; - - if (is_streaming_node(my_info, i)) { - if (snode_name == NULL) { - snode_name = curr_name; - } - - if (strncmp(snode_name, curr_name, hostname_size) != 0) { - fprintf(stderr, "PE %d on %s is a streaming node " - "but not placed on %s\n", i, curr_name, snode_name); - errors++; - } - } else { - if (tnode_name == NULL) { - tnode_name = curr_name; - } - - if (strncmp(tnode_name, curr_name, hostname_size) != 0) { - fprintf(stderr, "PE %d on %s is a target node " - "but not placed on %s\n", i, curr_name, tnode_name); - errors++; - } - } - } - - if (snode_name == NULL || tnode_name == NULL) { - fprintf(stderr, "Error: no streaming or target node\n"); - return -1; - } - - if (strncmp(snode_name, tnode_name, hostname_size) == 0) { - fprintf(stderr, "Warning: senders and receivers are running on the " - "same node %s\n", snode_name); - } - - shmem_free(dest); - shmem_free(hostname); - - return errors; -} + bw_finalize(); +} diff --git a/test/performance/shmem_perf_suite/common.h b/test/performance/shmem_perf_suite/common.h index c48f66e..2f0b556 100644 --- a/test/performance/shmem_perf_suite/common.h +++ b/test/performance/shmem_perf_suite/common.h @@ -35,18 +35,186 @@ #include #include #include - #include #include +/* hostname length to check for hostname errors */ #ifdef MAXHOSTNAMELEN #define MAX_HOSTNAME_LEN MAXHOSTNAMELEN #else #define MAX_HOSTNAME_LEN HOST_NAME_MAX #endif +#ifndef MAX +#define MAX(A,B) (((A)>(B)) ? (A) : (B)) +#endif + #define ONE 1 +/* constants for experiments */ +#define MAX_MSG_SIZE (1<<23) +#define START_LEN 1 +#define INC 2 +#define TRIALS 1000 +#define WINDOW_SIZE 64 +#define WARMUP 100 + +/* constants for experiments with large message sizes */ +#define TRIALS_LARGE 100 +#define WINDOW_SIZE_LARGE 64 +#define WARMUP_LARGE 10 +#define LARGE_MESSAGE_SIZE 65536 + +#define TARGET_SZ_MIN 8 +#define TARGET_SZ_MAX 4096 + +/* atomics common */ +#define ATOMICS_N_DTs 3 +/* note: ignoring cswap/swap for now in verification */ +#define ATOMICS_N_OPs 4 +/* PE 0 is printing its latency, thus have it not be the INCAST PE*/ +#define INCAST_PE 1 + +/* perf metrics structures */ +typedef enum { + LAT, + BW +} test_type; + +typedef enum { + UNI_DIR, + BI_DIR +} bw_type; + +typedef enum { + STYLE_PUT, + STYLE_GET, + STYLE_RMA, + STYLE_ATOMIC +} op_style; + +typedef enum { + FIRST_HALF, + SECOND_HALF, + FULL_SET +} red_PE_set; + +typedef enum { + COMM_PAIRWISE, + COMM_INCAST +} comm_style; + +typedef enum { + B, + KB, + MB +} bw_units; + +typedef enum { + OP_FETCH, + OP_SET, + OP_CSWAP, + OP_SWAP, + OP_FINC, + OP_INC, + OP_FADD, + OP_ADD, + OP_FAND, + OP_AND, + OP_FOR, + OP_OR, + OP_FXOR, + OP_XOR, + SIZE_OF_OP +} atomic_op_type; + + +typedef struct perf_metrics { + /* common parameters */ + test_type t_type; + unsigned long int start_len, max_len; + unsigned long int size_inc, trials; + unsigned long int window_size, warmup; + int my_node, num_pes, sztarget, szinitiator, midpt; + char *src, *dest; + op_style opstyle; + + /* parameters for threaded tests */ + int nthreads; + int thread_safety; + + /* parameters specific to bandwidth tests */ + bw_units unit; + const char *bw_type_str; + bw_type b_type; + comm_style cstyle; + int target_data; + int num_partners; + + /* parameters specific to latency tests */ + long *target; + + /* misc parameters */ + int validate; + int individual_report; +} perf_metrics_t; + +/* psync arrays used in metric calculation */ +long red_psync[SHMEM_REDUCE_SYNC_SIZE]; +long bar_psync[SHMEM_BARRIER_SYNC_SIZE]; + +/* default settings with no input provided */ +static inline +void set_metric_defaults(perf_metrics_t *metric_info) { + metric_info->start_len = START_LEN; + metric_info->max_len = MAX_MSG_SIZE; + metric_info->size_inc = INC; + metric_info->trials = TRIALS; + metric_info->window_size = WINDOW_SIZE; /*back-to-back msg stream*/ + metric_info->warmup = WARMUP; /*number of initial iterations to skip*/ + + metric_info->my_node = -1; + metric_info->num_pes = -1; + metric_info->midpt = -1; + metric_info->sztarget = -1; + metric_info->szinitiator = -1; + + metric_info->src = NULL; + metric_info->dest = NULL; + + metric_info->num_partners = 1; + +#if defined(ENABLE_THREADS) + metric_info->thread_safety = SHMEM_THREAD_SINGLE; +#else + metric_info->thread_safety = 0; +#endif + metric_info->nthreads = 1; + + metric_info->validate = false; + metric_info->individual_report = -1; +} + +/* update metrics after shmem init */ +static inline +void update_metrics(perf_metrics_t *metric_info) { + metric_info->my_node = shmem_my_pe(); + metric_info->num_pes = shmem_n_pes(); + assert(metric_info->num_pes); + metric_info->midpt = metric_info->num_pes / 2; +} + +/* init psync arrays */ +static inline +void init_psync_arrays(void) { + int i; + for(i = 0; i < SHMEM_REDUCE_SYNC_SIZE; i++) + red_psync[i] = SHMEM_SYNC_VALUE; + + for(i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++) + bar_psync[i] = SHMEM_SYNC_VALUE; +} + /* return microseconds */ double perf_shmemx_wtime(void); @@ -107,7 +275,6 @@ static char * aligned_buffer_alloc(int len) static void aligned_buffer_free(char * ptr_aligned) { - char * ptr_org; uintptr_t temp_p; size_t ptr_size = sizeof(uintptr_t); @@ -123,47 +290,506 @@ static void aligned_buffer_free(char * ptr_aligned) #endif } -int static inline is_divisible_by_4(int num) -{ - assert(num >= 0); - assert(sizeof(int) == 4); - return (!(num & 0x00000003)); +static inline +int is_divisible_by_4(int num) { + if (num < 0) + shmem_global_exit(1); + return (num % 4 == 0); } /*to be a power of 2 must only have 1 set bit*/ -int static inline is_pow_of_2(unsigned int num) -{ +static inline +int is_pow_of_2(unsigned int num) { /*move first set bit all the way to right*/ while(num && !((num >>=1 ) & 1)); /*it will be 1 if its the only set bit*/ - return ((num == 1 || num == 0)? true : false); + return ((num == 1 || num == 0) ? true : false); } -void static init_array(char * const buf, int len, int my_pe_num) -{ +static +void init_array(const char *buf, int len, int my_pe_num) { int i = 0; int array_size = len / sizeof(int); - int * ibuf = (int *)buf; + int *ibuf = (int *)buf; assert(is_divisible_by_4(len)); for(i = 0; i < array_size; i++) ibuf[i] = my_pe_num; - } -void static inline validate_recv(char * buf, int len, int partner_pe) -{ +static inline +int validate_recv(char *buf, int len, int partner_pe) { int i = 0; int array_size = len / sizeof(int); - int * ibuf = (int *)buf; + int *ibuf = (int *)buf; + int errors = 0; assert(is_divisible_by_4(len)); - for(i = 0; i < array_size; i++) { - if(ibuf[i] != partner_pe) - printf("validation error at index %d: %d != %d \n", i, ibuf[i], - partner_pe); + for (i = 0; i < array_size; i++) { + if (ibuf[i] != partner_pe) { + errors++; + } + } + if (errors > 0) { + printf("Validation error: stored_value = %d, expected value = %d\n", + ibuf[0], partner_pe); } + return errors; +} + +/**************************************************************/ +/* Input Checking */ +/**************************************************************/ + +static +int command_line_arg_check(int argc, char *argv[], perf_metrics_t * const metric_info) { + int ch, errors = 0; + extern char *optarg; + + /* check command line args */ + while ((ch = getopt(argc, argv, "e:s:n:w:p:r:l:kbivtC:T:")) != EOF) { + switch (ch) { + case 's': + metric_info->start_len = strtoul(optarg, (char **)NULL, 0); + if ( metric_info->start_len < 1 ) metric_info->start_len = 1; + if(!is_pow_of_2(metric_info->start_len)) { + fprintf(stderr, "Error: start_length must be a power of two\n"); + errors++; + } + if (metric_info->start_len > INT_MAX) { + fprintf(stderr, "Error: start_length is out of integer range\n"); + errors++; + } + break; + case 'e': + metric_info->max_len = strtoul(optarg, (char **)NULL, 0); + if(!is_pow_of_2(metric_info->max_len)) { + fprintf(stderr, "Error: end_length must be a power of two\n"); + errors++; + } + if(metric_info->max_len < metric_info->start_len) { + fprintf(stderr, "Error: end_length (%ld) must be >= " + "start_length (%ld)\n", metric_info->max_len, + metric_info->start_len); + errors++; + } + if (metric_info->max_len > INT_MAX) { + fprintf(stderr, "Error: end_length is out of integer range\n"); + errors++; + } + break; + case 'n': + metric_info->trials = strtoul(optarg, (char **)NULL, 0); + if(metric_info->trials < (metric_info->warmup * 2)) { + fprintf(stderr, "Error: trials (%ld) must be >= 2*warmup " + "(%ld)\n", metric_info->trials, metric_info->warmup * 2); + errors++; + } + break; + case 'p': + metric_info->warmup = strtoul(optarg, (char **)NULL, 0); + if(metric_info->warmup > (metric_info->trials/2)) { + fprintf(stderr, "Error: warmup (%ld) must be <= trials/2 " + "(%ld)\n", metric_info->warmup, metric_info->trials/2); + errors++; + } + break; + case 'k': + metric_info->unit = KB; + if (metric_info->t_type != BW) + errors++; + break; + case 'b': + metric_info->unit = B; + if (metric_info->t_type != BW) + errors++; + break; + case 'v': + metric_info->validate = true; + if(metric_info->t_type == BW && metric_info->target_data) + errors++; + break; + case 'w': + metric_info->window_size = strtoul(optarg, (char **)NULL, 0); + if (metric_info->t_type != BW) { + errors++; + } else { + if (metric_info->target_data) { + errors++; + } + } + break; + case 't': + metric_info->target_data = true; + if (metric_info->t_type != BW) { + errors++; + } else { + if (metric_info->validate) { + errors++; + } + } + break; + case 'r': + metric_info->sztarget = strtoul(optarg, (char **)NULL, 0); + break; + case 'l': + metric_info->szinitiator = strtoul(optarg, (char **)NULL, 0); + break; + case 'C': +#if defined(ENABLE_THREADS) + if (strcmp(optarg, "SINGLE") == 0) { + metric_info->thread_safety = SHMEM_THREAD_SINGLE; + } else if (strcmp(optarg, "FUNNELED") == 0) { + metric_info->thread_safety = SHMEM_THREAD_FUNNELED; + } else if (strcmp(optarg, "SERIALIZED") == 0) { + metric_info->thread_safety = SHMEM_THREAD_SERIALIZED; + } else if (strcmp(optarg, "MULTIPLE") == 0) { + metric_info->thread_safety = SHMEM_THREAD_MULTIPLE; + } else { + fprintf(stderr, "Invalid threading level: \"%s\"\n", optarg); + errors++; + } +#else + fprintf(stderr, "Threading support disabled. " + "Ignoring threading level: \"%s\"\n", optarg); + metric_info->thread_safety = 0; +#endif + break; + case 'T': + metric_info->nthreads = atoi(optarg); + break; + case 'i': + metric_info->individual_report = 1; + break; + default: + errors++; + break; + } + } + + return errors; +} + +static inline +void print_usage(int errors) { + fprintf(stderr, "\nNumber of errors in the command line: %d\n", errors); + fprintf(stderr, "\nUsage: [OPTION]\n" + " -s START_MSG_SIZE Smallest message size. Must be power of 2\n" + " -e END_MSG_SIZE Largest message size. Must be power of 2\n" + " -p WARMUP Number of warmup iterations\n" + " -n TRIALS Number of trial iterations. Must be at\n" + " least twice of WARMUP\n" + " -w WINDOW_SIZE Window size for streaming. Cannot be used\n" + " in conjunction with -t. Specific to band-\n" + " -width experiments\n" + " -k Setting bandwidth metric to kbytes/second\n" + " -b Setting bandwidth metric to bytes/second\n" + " -v Turning on validation of data\n" + " -i Turning on individual process reporting\n" + " -t Output data for target side (default is \n" + " initiator, only use with Put Bandwidth),\n" + " cannot be used in conjunction with \n" + " validate, special sizes used, trials + \n" + " warmup * sizes (8/4KB) <= max length \n" + " -r TARGET_SIZE Number of target nodes, use only with -t;\n" + " -l SOURCE_SIZE Number of initiator nodes, use only with\n" + " -t\n" + " -T THREADS Number of threads\n" + " -C THREAD_LEVEL SHMEM thread level. Possible values: \n" + " SINGLE, FUNNELED, SERIALIZED, MULTIPLE \n" + ); +} + + +#if defined(ENABLE_THREADS) +static +const char *thread_safety_str(perf_metrics_t * const metric_info) { + if (metric_info->thread_safety == SHMEM_THREAD_SINGLE) { + return "SINGLE"; + } else if (metric_info->thread_safety == SHMEM_THREAD_FUNNELED) { + return "FUNNELED"; + } else if (metric_info->thread_safety == SHMEM_THREAD_SERIALIZED) { + return "SERIALIZED"; + } else if (metric_info->thread_safety == SHMEM_THREAD_MULTIPLE) { + return "MULTIPLE"; + } else { + fprintf(stderr, "Unexpected thread safety value: %d. " + "Setting it to SINGLE\n", metric_info->thread_safety); + metric_info->thread_safety = SHMEM_THREAD_SINGLE; + return "SINGLE"; + } +} + +static inline +void thread_safety_validation_check(perf_metrics_t * const metric_info) { + if (metric_info->nthreads == 1) + return; + else { + if (metric_info->thread_safety != SHMEM_THREAD_MULTIPLE) { + if(metric_info->my_node == 0) { + fprintf(stderr, "Warning: argument \"-T %d\" is ignored" + " because of the thread level specified." + " Switching to single thread with thread" + " safety %s\n", metric_info->nthreads, + thread_safety_str(metric_info)); + } + metric_info->nthreads = 1; + } + return; + } +} +#endif + +static inline +int only_even_PEs_check(int my_node, int num_pes) { + if (num_pes % 2 != 0) { + if (my_node == 0) { + fprintf(stderr, "Only even number of processes can be used\n"); + } + return 77; + } else + return 0; +} + + +/* Returns partner node; Assumes only one partner */ +static inline +int partner_node(const perf_metrics_t * const my_info) +{ + if (my_info->num_pes == 1) + return 0; + + if (my_info->t_type == BW) { + if(my_info->cstyle == COMM_PAIRWISE) { + int pairs = my_info->midpt; + + return (my_info->my_node < pairs ? (my_info->my_node + pairs) : + (my_info->my_node - pairs)); + } else { + assert(my_info->cstyle == COMM_INCAST); + return INCAST_PE; + } + } else { + int pairs = my_info->midpt; + + return (my_info->my_node < pairs ? (my_info->my_node + pairs) : + (my_info->my_node - pairs)); + } +} + +static inline +int streaming_node(const perf_metrics_t * const my_info) +{ + if(my_info->cstyle == COMM_PAIRWISE) { + return (my_info->my_node < my_info->szinitiator); + } else { + assert(my_info->cstyle == COMM_INCAST); + return true; + } +} + +static inline +int target_node(const perf_metrics_t * const my_info) +{ + return (my_info->my_node >= my_info->midpt && + (my_info->my_node < (my_info->midpt + my_info->sztarget))); +} + +static inline +int is_streaming_node(const perf_metrics_t * const my_info, int node) +{ + if (my_info->cstyle == COMM_PAIRWISE) { + return (node < my_info->szinitiator); + } else { + assert(my_info->cstyle == COMM_INCAST); + return true; + } +} + +static inline +int check_hostname_validation(const perf_metrics_t * const my_info) { + + int hostname_status = -1; + + /* hostname_size should be a length divisible by 4 */ + int hostname_size = (MAX_HOSTNAME_LEN % 4 == 0) ? MAX_HOSTNAME_LEN : + MAX_HOSTNAME_LEN + (4 - MAX_HOSTNAME_LEN % 4); + int i, errors = 0; + + /* pSync for fcollect of hostnames */ + static long pSync_collect[SHMEM_COLLECT_SYNC_SIZE]; + for (i = 0; i < SHMEM_COLLECT_SYNC_SIZE; i++) + pSync_collect[i] = SHMEM_SYNC_VALUE; + + char *hostname = (char *) shmem_malloc (hostname_size * sizeof(char)); + char *dest = (char *) shmem_malloc (my_info->num_pes * hostname_size * + sizeof(char)); + + if (hostname == NULL || dest == NULL) { + fprintf(stderr, "shmem_malloc failed to allocate for hostname strings\n"); + return -1; + } + + hostname_status = gethostname(hostname, hostname_size); + if (hostname_status != 0) { + fprintf(stderr, "gethostname failed (%d)\n", hostname_status); + return -1; + } + shmem_barrier_all(); + + /* nelems needs to be updated based on 32-bit API */ + shmem_fcollect32(dest, hostname, hostname_size/4, 0, 0, my_info->num_pes, + pSync_collect); + + char *snode_name = NULL; + char *tnode_name = NULL; + for (i = 0; i < my_info->num_pes; i++) { + char *curr_name = &dest[i * hostname_size]; + + if (is_streaming_node(my_info, i)) { + if (snode_name == NULL) { + snode_name = curr_name; + } + + if (strncmp(snode_name, curr_name, hostname_size) != 0) { + fprintf(stderr, "PE %d on %s is a streaming node " + "but not placed on %s\n", i, curr_name, + snode_name); + errors++; + } + } else { + if (tnode_name == NULL) { + tnode_name = curr_name; + } + + if (strncmp(tnode_name, curr_name, hostname_size) != 0) { + fprintf(stderr, "PE %d on %s is a target node " + "but not placed on %s\n", i, curr_name, + tnode_name); + errors++; + } + } + } + + if (snode_name == NULL || tnode_name == NULL) { + fprintf(stderr, "Error: no streaming or target node\n"); + return -1; + } + + if (strncmp(snode_name, tnode_name, hostname_size) == 0) { + fprintf(stderr, "Warning: senders and receivers are running on the " + "same node %s\n", snode_name); + } + + shmem_free(dest); + shmem_free(hostname); + + return errors; +} + +static +int error_checking_init_target_usage(perf_metrics_t * const metric_info) { + int error = false; + assert(metric_info->midpt > 0); + + if (metric_info->sztarget != -1 && metric_info->szinitiator == -1) { + if (metric_info->sztarget < 1 || + metric_info->sztarget > metric_info->midpt || + !metric_info->target_data) { + error = true; + } else { + metric_info->szinitiator = metric_info->midpt; + } + } else if (metric_info->sztarget == -1 && metric_info->szinitiator != -1) { + if( metric_info->szinitiator < 1 || + metric_info->szinitiator > metric_info->midpt || + !metric_info->target_data) { + error = true; + } else { + metric_info->sztarget = metric_info->midpt; + } + } else if (metric_info->sztarget == -1 && metric_info->szinitiator == -1) { + metric_info->szinitiator = metric_info->midpt; + metric_info->sztarget = metric_info->midpt; + } else { + if (!metric_info->target_data) { + error = true; + } + } + + if (error) { + fprintf(stderr, "Invalid usage of command line arg -r/-l\n"); + return -1; + } + return 0; +} + +static inline +void large_message_metric_chg(perf_metrics_t * const metric_info, int len) { + if(len > LARGE_MESSAGE_SIZE) { + metric_info->window_size = WINDOW_SIZE_LARGE; + metric_info->trials = TRIALS_LARGE; + metric_info->warmup = WARMUP_LARGE; + } +} + +/* put/get bw use opposite streaming/validate nodes */ +static inline +red_PE_set validation_set(perf_metrics_t * const my_info, int *nPEs) +{ + if(my_info->cstyle == COMM_PAIRWISE) { + if(streaming_node(my_info)) { + *nPEs = my_info->szinitiator; + return FIRST_HALF; + } else if(target_node(my_info)) { + *nPEs = my_info->sztarget; + return SECOND_HALF; + } else { + fprintf(stderr, "Warning: you are getting data from a node that " + "wasn't a part of the perf set \n "); + return 0; + } + } else { + assert(my_info->cstyle == COMM_INCAST); + *nPEs = my_info->num_pes; + return FULL_SET; + } +} + +/* reduction to collect performance results from PE set + * then start_pe will print results --- assumes num_pes is even */ +static inline +void PE_set_used_adjustments(int *nPEs, int *stride, int *start_pe, + perf_metrics_t * const my_info) { + red_PE_set PE_set = validation_set(my_info, nPEs); + + if(PE_set == FIRST_HALF || PE_set == FULL_SET) { + *start_pe = 0; + } + else { + assert(PE_set == SECOND_HALF); + *start_pe = my_info->midpt; + } + + *stride = 0; /* back to back PEs */ +} + +static +void print_header(perf_metrics_t * const metric_info) { + printf("\n%20sSandia OpenSHMEM Performance Suite%20s\n", " ", " "); + printf("%20s==================================%20s\n", " ", " "); + printf("Total Number of PEs: %10d%6sWindow size: %10lu\n", + metric_info->num_pes, " ", metric_info->window_size); + printf("Number of source PEs: %10d%6sMaximum message size: %10lu\n", + metric_info->szinitiator, " ", metric_info->max_len); + printf("Number of target PEs: %10d%6sNumber of threads: %10d\n", + metric_info->sztarget, " ", metric_info->nthreads); + printf("Iteration count: %10lu%6s", metric_info->trials, " "); +#if defined(ENABLE_THREADS) + printf("Thread safety: %10s\n", thread_safety_str(metric_info)); +#endif + printf("\n"); } diff --git a/test/performance/shmem_perf_suite/int_element_latency.h b/test/performance/shmem_perf_suite/int_element_latency.h index 45e857e..a32cab7 100644 --- a/test/performance/shmem_perf_suite/int_element_latency.h +++ b/test/performance/shmem_perf_suite/int_element_latency.h @@ -25,69 +25,101 @@ * SOFTWARE. */ -void static inline -int_p_latency(perf_metrics_t data) +static inline +void int_p_latency(perf_metrics_t * const metric_info) { double start = 0.0; double end = 0.0; unsigned int i = 0; + int dest = partner_node(metric_info); + int sender = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true; + static int check_once = 0; + + if (!check_once) { + /* check to see whether sender and receiver are the same process */ + if (dest == metric_info->my_node) { + fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n", + dest); + } + /* hostname validation for all sender and receiver processes */ + int status = check_hostname_validation(metric_info); + if (status != 0) return; + check_once++; + } - if (data.my_node == PUT_IO_NODE) { - printf("\nStream shmem_int_p results:\n"); - print_results_header(); + if (metric_info->my_node == 0) { + printf("\nshmem_int_p results:\n"); + print_latency_header(); } + shmem_barrier_all(); - /*puts to zero to match gets validation scheme*/ - if (data.my_node == PUT_IO_NODE) { + /* puts to zero to match gets validation scheme */ + if (sender) { - for (i = 0; i < data.trials + data.warmup; i++) { - if(i == data.warmup) + for (i = 0; i < metric_info->trials + metric_info->warmup; i++) { + if(i == metric_info->warmup) start = perf_shmemx_wtime(); - shmem_int_p((int*) data.dest, data.my_node, 0); + shmem_int_p((int*) metric_info->dest, metric_info->my_node, dest); shmem_quiet(); } end = perf_shmemx_wtime(); - calc_and_print_results(start, end, sizeof(int), data); + calc_and_print_results(start, end, sizeof(int), metric_info); } shmem_barrier_all(); - if((data.my_node == 0) && data.validate) - validate_recv(data.dest, sizeof(int), partner_node(data.my_node)); + if(!sender && metric_info->validate) + validate_recv(metric_info->dest, sizeof(int), dest); } /* latency/bw for one-way trip */ -void static inline -int_g_latency(perf_metrics_t data) +static inline +void int_g_latency(perf_metrics_t * const metric_info) { double start = 0.0; double end = 0.0; unsigned int i = 0; int rtnd = -1; + int dest = partner_node(metric_info); + int receiver = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true; + static int check_once = 0; + + if (!check_once) { + /* check to see whether sender and receiver are the same process */ + if (dest == metric_info->my_node) { + fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n", + dest); + } + /* hostname validation for all sender and receiver processes */ + int status = check_hostname_validation(metric_info); + if (status != 0) return; + check_once++; + } - if (data.my_node == GET_IO_NODE) { - printf("\nStream shmem_int_g results:\n"); - print_results_header(); + if (metric_info->my_node == 0) { + printf("\nshmem_int_g results:\n"); + print_latency_header(); } + shmem_barrier_all(); - if (data.my_node == GET_IO_NODE) { + if (receiver) { - for (i = 0; i < data.trials + data.warmup; i++) { - if(i == data.warmup) + for (i = 0; i < metric_info->trials + metric_info->warmup; i++) { + if(i == metric_info->warmup) start = perf_shmemx_wtime(); - rtnd = shmem_int_g((int*) data.src, 1); + rtnd = shmem_int_g((int*) metric_info->src, dest); } end = perf_shmemx_wtime(); - calc_and_print_results(start, end, sizeof(int), data); + calc_and_print_results(start, end, sizeof(int), metric_info); } shmem_barrier_all(); - if((data.my_node == 0) && data.validate) - validate_recv((char*) &rtnd, sizeof(int), partner_node(data.my_node)); + if(receiver && metric_info->validate) + validate_recv((char*) &rtnd, sizeof(int), dest); } diff --git a/test/performance/shmem_perf_suite/latency_common.h b/test/performance/shmem_perf_suite/latency_common.h index 482b47c..3ba7b78 100644 --- a/test/performance/shmem_perf_suite/latency_common.h +++ b/test/performance/shmem_perf_suite/latency_common.h @@ -26,116 +26,64 @@ */ #include +#ifdef ENABLE_OPENMP +#include +#endif -#define PUT_IO_NODE 1 -#define GET_IO_NODE !PUT_IO_NODE #define INIT_VALUE 1 -#define MAX_MSG_SIZE (1<<23) -#define START_LEN 1 - -#define INC 2 -#define TRIALS 100 -#define WARMUP 10 - -typedef struct perf_metrics { - unsigned int start_len, max_len; - unsigned int inc, trials; - unsigned int warmup; - int validate; - int my_node, npes; - long * target; - char * src, *dest; -} perf_metrics_t; - -void static data_init(perf_metrics_t * data) { - data->start_len = START_LEN; - data->max_len = MAX_MSG_SIZE; - data->inc = INC; - data->trials = TRIALS; - data->warmup = WARMUP; /*number of initial iterations to skip*/ - data->validate = false; - data->my_node = shmem_my_pe(); - data->npes = shmem_n_pes(); - data->target = NULL; - data->src = NULL; - data->dest = NULL; -} - -void static inline print_results_header(void) { - printf("\nLength Latency \n"); - printf("in bytes in micro seconds \n"); -} - -/*not storing results, only outputing it*/ -void static inline calc_and_print_results(double start, double end, int len, - perf_metrics_t data) { - double latency = 0.0; - latency = (end - start) / data.trials; - - printf("%9d %8.2f \n", len, latency); +static +void init_metrics(perf_metrics_t * const metric_info) { + metric_info->t_type = LAT; + set_metric_defaults(metric_info); + metric_info->target = NULL; + metric_info->cstyle = COMM_PAIRWISE; + metric_info->opstyle = STYLE_RMA; } -int static inline partner_node(int my_node) -{ - return ((my_node % 2 == 0) ? (my_node + 1) : (my_node - 1)); +static inline +void print_latency_header(void) { + printf("\nMessage Size%15sLatency\n", " "); + printf("%4sin bytes%17sin us\n", " ", " "); } -void static inline command_line_arg_check(int argc, char *argv[], - perf_metrics_t *metric_info) { - int ch, error = false; - extern char *optarg; - - /* check command line args */ - while ((ch = getopt(argc, argv, "e:s:n:v")) != EOF) { - switch (ch) { - case 's': - metric_info->start_len = strtol(optarg, (char **)NULL, 0); - if ( metric_info->start_len < 1 ) metric_info->start_len = 1; - if(!is_pow_of_2(metric_info->start_len)) error = true; - break; - case 'e': - metric_info->max_len = strtol(optarg, (char **)NULL, 0); - if(!is_pow_of_2(metric_info->max_len)) error = true; - if(metric_info->max_len < metric_info->start_len) error = true; - break; - case 'n': - metric_info->trials = strtol(optarg, (char **)NULL, 0); - if(metric_info->trials <= (metric_info->warmup*2)) error = true; - break; - case 'v': - metric_info->validate = true; - break; - default: - error = true; - break; - } +/* calculation and printing of the latency */ +static inline +void calc_and_print_results(double start, double end, int len, + perf_metrics_t * const metric_info) { + int stride = 0, start_pe = 0, nPEs = 0; + int nred_elements = 1; + static double latency = 0.0, avg_latency = 0.0; + static double pwrk[SHMEM_REDUCE_MIN_WRKDATA_SIZE]; + + PE_set_used_adjustments(&nPEs, &stride, &start_pe, metric_info); + + if (end > 0 && start > 0 && (end - start) > 0) { + latency = (end - start) / metric_info->trials; + } else { + fprintf(stderr, "Incorrect time measured from latency test: " + "start = %lf, end = %lf\n", start, end); } - if (error) { - if (metric_info->my_node == 0) { - fprintf(stderr, "Usage: [-s start_length] [-e end_length] "\ - ": lengths must be a power of two \n " \ - "[-n trials (must be greater than 20)] "\ - "[-v (validate results)]\n"); - } -#ifndef VERSION_1_0 - shmem_finalize(); -#endif - exit (-1); + if (metric_info->individual_report == 1) { + printf("Individual latency for PE %6d is %10.2f\n", + metric_info->my_node, latency); + } + shmem_barrier(start_pe, stride, nPEs, bar_psync); + + if (nPEs >= 2) { + shmem_double_sum_to_all(&avg_latency, &latency, + nred_elements, start_pe, stride, + nPEs, pwrk, red_psync); + avg_latency /= nPEs; + } else { + avg_latency = latency; } -} -void static inline only_two_PEs_check(int my_node, int num_pes) { - if (num_pes != 2) { - if (my_node == 0) { - fprintf(stderr, "2-nodes only test\n"); - } -#ifndef VERSION_1_0 - shmem_finalize(); -#endif - exit(77); + if (metric_info->my_node == start_pe) { + printf("%2s%10d%12s%10.2f\n", " ", len, " ", avg_latency); } + } /**************************************************************/ @@ -144,31 +92,39 @@ void static inline only_two_PEs_check(int my_node, int num_pes) { /*have single symmetric long element "target" from perf_metrics_t * that needs to be initialized in function*/ -extern void long_element_round_trip_latency(perf_metrics_t data); +extern void long_element_round_trip_latency(perf_metrics_t *data); -extern void int_element_latency(perf_metrics_t data); +extern void int_element_latency(perf_metrics_t *data); /*have symmetric buffers src/dest from perf_metrics_t * that has been initialized to my_node number */ extern void streaming_latency(int len, perf_metrics_t *data); -void static inline multi_size_latency(perf_metrics_t data, char *argv[]) { +static inline +void multi_size_latency(perf_metrics_t * const data, char *argv[]) { unsigned int len; - int partner_pe = partner_node(data.my_node); + int partner_pe = partner_node(data); - for (len = data.start_len; len <= data.max_len; len *= data.inc) { - - shmem_barrier_all(); - - streaming_latency(len, &data); + if (data->my_node == 0) { + print_latency_header(); + } - shmem_barrier_all(); + for (len = data->start_len; len <= data->max_len; len *= data->size_inc) { + large_message_metric_chg(data, len); + streaming_latency(len, data); } shmem_barrier_all(); - if((data.my_node == 0) && data.validate) - validate_recv(data.dest, data.max_len, partner_pe); + if (data->validate) { + int errors = -1; + if ((streaming_node(data) && data->opstyle == STYLE_GET) || + (target_node(data) && data->opstyle == STYLE_PUT)) + errors = validate_recv(data->dest, data->max_len, partner_pe); + + if (errors >= 0) + printf("Validation complete (%d errors)\n", errors); + } } @@ -177,58 +133,124 @@ void static inline multi_size_latency(perf_metrics_t data, char *argv[]) { /* INIT and teardown of resources */ /**************************************************************/ -void static inline latency_init_resources(int argc, char *argv[], - perf_metrics_t *data) { +static inline +int latency_init_resources(int argc, char *argv[], + perf_metrics_t * const metric_info) { + init_metrics(metric_info); + int ret = command_line_arg_check(argc, argv, metric_info); + #ifndef VERSION_1_0 +#if defined(ENABLE_THREADS) + int tl; + shmem_init_thread(metric_info->thread_safety, &tl); + if(tl != metric_info->thread_safety) { + fprintf(stderr,"Could not initialize with requested thread " + "level %d: got %d\n", metric_info->thread_safety, tl); + return -1; + } +#else shmem_init(); +#endif #else start_pes(0); #endif - data_init(data); + update_metrics(metric_info); - only_two_PEs_check(data->my_node, data->npes); + if (ret) { + if (metric_info->my_node == 0) { + print_usage(ret); + } + return -1; + } else { + if (metric_info->num_pes < 2) { + fprintf(stderr, "This test requires at least two processes.\n"); + print_usage(1); + return -1; + } + } - command_line_arg_check(argc, argv, data); + if (error_checking_init_target_usage(metric_info) == -1) + return -1; +#if defined(ENABLE_THREADS) + thread_safety_validation_check(metric_info); +#endif + init_psync_arrays(); - data->src = aligned_buffer_alloc(data->max_len); - init_array(data->src, data->max_len, data->my_node); + if(only_even_PEs_check(metric_info->my_node, metric_info->num_pes) != 0) { + return -1; + } - data->dest = aligned_buffer_alloc(data->max_len); - init_array(data->dest, data->max_len, data->my_node); + metric_info->src = aligned_buffer_alloc(metric_info->max_len); + init_array(metric_info->src, metric_info->max_len, metric_info->my_node); + + metric_info->dest = aligned_buffer_alloc(metric_info->max_len); + init_array(metric_info->dest, metric_info->max_len, metric_info->my_node); #ifndef VERSION_1_0 - data->target = shmem_malloc(sizeof(long)); + metric_info->target = shmem_malloc(sizeof(long)); #else - data->target = shmalloc(sizeof(long)); + metric_info->target = shmalloc(sizeof(long)); #endif + + return 0; } -void static inline latency_free_resources(perf_metrics_t *data) { +static inline +void latency_free_resources(const perf_metrics_t * const metric_info) { shmem_barrier_all(); #ifndef VERSION_1_0 - shmem_free(data->target); + shmem_free(metric_info->target); #else - shfree(data->target); + shfree(metric_info->target); #endif - aligned_buffer_free(data->src); - aligned_buffer_free(data->dest); + aligned_buffer_free(metric_info->src); + aligned_buffer_free(metric_info->dest); +} + +static inline +void latency_finalize(void) { #ifndef VERSION_1_0 shmem_finalize(); #endif } -void static inline latency_main(int argc, char *argv[]) { - perf_metrics_t data; +static inline +void latency_main(int argc, char *argv[], op_style opstyle) { + perf_metrics_t metric_info; + + int ret = latency_init_resources(argc, argv, &metric_info); + metric_info.opstyle = opstyle; + + if (ret == 0) { + if (metric_info.my_node == 0) { + print_header(&metric_info); + } + long_element_round_trip_latency(&metric_info); + int_element_latency(&metric_info); + multi_size_latency(&metric_info, argv); + latency_free_resources(&metric_info); + } - latency_init_resources(argc, argv, &data); + latency_finalize(); +} - long_element_round_trip_latency(data); +static inline +void latency_main_ctx(int argc, char *argv[], op_style opstyle) { + perf_metrics_t metric_info; - int_element_latency(data); + int ret = latency_init_resources(argc, argv, &metric_info); + metric_info.opstyle = opstyle; - multi_size_latency(data, argv); + if (ret == 0) { + if (metric_info.my_node == 0) { + print_header(&metric_info); + } + multi_size_latency(&metric_info, argv); + latency_free_resources(&metric_info); + } - latency_free_resources(&data); + latency_finalize(); } + diff --git a/test/performance/shmem_perf_suite/latency_ctx.h b/test/performance/shmem_perf_suite/latency_ctx.h new file mode 100644 index 0000000..115e9d2 --- /dev/null +++ b/test/performance/shmem_perf_suite/latency_ctx.h @@ -0,0 +1,190 @@ +/* +* Copyright (c) 2018 Intel Corporation. All rights reserved. +* This software is available to you under the BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ + +static inline +void streaming_put_latency_ctx(int len, perf_metrics_t *metric_info, int streaming_node) +{ + double start = 0.0, end = 0.0; + unsigned long int i; + int dest = partner_node(metric_info); + static int check_once = 0; + + if (!check_once) { + /* check to see whether sender and receiver are the same process */ + if (dest == metric_info->my_node) { + fprintf(stderr, "Warning: Sender and receiver are the same " + "process (%d)\n", dest); + } + /* hostname validation for all sender and receiver processes */ + int status = check_hostname_validation(metric_info); + if (status != 0) return; + check_once++; + } + + shmem_barrier_all(); + + if (streaming_node) { +#pragma omp parallel default(none) firstprivate(len, dest) private(i) \ +shared(metric_info, start, end) num_threads(metric_info->nthreads) + { + const int thread_id = omp_get_thread_num(); + shmem_ctx_t ctx; + shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx); + + for (i = 0; i < metric_info->warmup; i++) { +#ifdef USE_NONBLOCKING_API + shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); +#else + shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); +#endif + shmem_ctx_quiet(ctx); + } + shmem_ctx_destroy(ctx); + } + } + + shmem_barrier_all(); + if (streaming_node) { +#pragma omp parallel default(none) firstprivate(len, dest) private(i) \ +shared(metric_info, start, end) num_threads(metric_info->nthreads) + { + const int thread_id = omp_get_thread_num(); + shmem_ctx_t ctx; + shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx); + +#pragma omp barrier +#pragma omp master + { + start = perf_shmemx_wtime(); + } + + for (i = 0; i < metric_info->trials; i++) { +#ifdef USE_NONBLOCKING_API + shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); +#else + shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); +#endif + shmem_ctx_quiet(ctx); + } + shmem_ctx_destroy(ctx); + } + } + + shmem_barrier_all(); + if (streaming_node) { + end = perf_shmemx_wtime(); + calc_and_print_results(start, end, len, metric_info); + } + + shmem_barrier_all(); +} + +static inline +void streaming_get_latency_ctx(int len, perf_metrics_t *metric_info, int streaming_node) +{ + double start = 0.0, end = 0.0; + unsigned long int i; + int dest = partner_node(metric_info); + static int check_once = 0; + + if (!check_once) { + /* check to see whether sender and receiver are the same process */ + if (dest == metric_info->my_node) { + fprintf(stderr, "Warning: Sender and receiver are the same " + "process (%d)\n", dest); + } + /* hostname validation for all sender and receiver processes */ + int status = check_hostname_validation(metric_info); + if (status != 0) return; + check_once++; + } + + shmem_barrier_all(); + + if (streaming_node) { +#pragma omp parallel default(none) firstprivate(len, dest) private(i) \ +shared(metric_info, start, end) num_threads(metric_info->nthreads) + { + const int thread_id = omp_get_thread_num(); + shmem_ctx_t ctx; + shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx); + + for (i = 0; i < metric_info->warmup; i++) { +#ifdef USE_NONBLOCKING_API + shmem_ctx_getmem_nbi(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); + shmem_ctx_quiet(ctx); +#else + shmem_ctx_getmem(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); +#endif + } + shmem_ctx_destroy(ctx); + } + } + + shmem_barrier_all(); + if (streaming_node) { +#pragma omp parallel default(none) firstprivate(len, dest) private(i) \ +shared(metric_info, start, end) num_threads(metric_info->nthreads) + { + const int thread_id = omp_get_thread_num(); + shmem_ctx_t ctx; + shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx); + +#pragma omp barrier +#pragma omp master + { + start = perf_shmemx_wtime(); + } + + for (i = 0; i < metric_info->trials; i++) { +#ifdef USE_NONBLOCKING_API + shmem_ctx_getmem_nbi(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); + shmem_ctx_quiet(ctx); +#else + shmem_ctx_getmem(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); +#endif + } + shmem_ctx_destroy(ctx); + } + } + + shmem_barrier_all(); + if (streaming_node) { + end = perf_shmemx_wtime(); + calc_and_print_results(start, end, len, metric_info); + } + + shmem_barrier_all(); +} diff --git a/test/performance/shmem_perf_suite/round_t_latency.h b/test/performance/shmem_perf_suite/round_t_latency.h index 6b3d86e..4910e2d 100644 --- a/test/performance/shmem_perf_suite/round_t_latency.h +++ b/test/performance/shmem_perf_suite/round_t_latency.h @@ -25,79 +25,103 @@ * SOFTWARE. */ -void static inline -long_element_round_trip_latency_get(perf_metrics_t data) +static inline +void long_element_round_trip_latency_get(perf_metrics_t * const metric_info) { double start = 0.0; double end = 0.0; - int dest = 1; - int partner_pe = partner_node(data.my_node); - *data.target = data.my_node; + int dest = partner_node(metric_info); + int receiver = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true; + *metric_info->target = metric_info->my_node; + static int check_once = 0; + + if (!check_once) { + /* check to see whether sender and receiver are the same process */ + if (dest == metric_info->my_node) { + fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n", + dest); + } + /* hostname validation for all sender and receiver processes */ + int status = check_hostname_validation(metric_info); + if (status != 0) return; + check_once++; + } - if (data.my_node == GET_IO_NODE) { - printf("\nshmem_long_g results:\n"); - print_results_header(); + if (metric_info->my_node == 0) { + printf("shmem_long_g results:\n"); + print_latency_header(); } shmem_barrier_all(); - if (data.my_node == GET_IO_NODE) { + if (receiver) { unsigned int i; - for (i = 0; i < data.trials + data.warmup; i++) { - if(i == data.warmup) + for (i = 0; i < metric_info->trials + metric_info->warmup; i++) { + if(i == metric_info->warmup) start = perf_shmemx_wtime(); - *data.target = shmem_long_g(data.target, dest); + *metric_info->target = shmem_long_g(metric_info->target, dest); } end = perf_shmemx_wtime(); - calc_and_print_results(start, end, sizeof(long), data); + calc_and_print_results(start, end, sizeof(long), metric_info); - if(data.validate) { - if(*data.target != partner_pe) + if(metric_info->validate) { + if(*metric_info->target != dest) printf("validation error shmem_long_g target = %ld != %d\n", - *data.target, partner_pe); + *metric_info->target, dest); } } } /*gauge small get pathway round trip latency*/ -void static inline -long_element_round_trip_latency_put(perf_metrics_t data) +static inline +void long_element_round_trip_latency_put(perf_metrics_t * const metric_info) { double start = 0.0; double end = 0.0; long tmp; - int dest = (data.my_node + 1) % data.npes; + int dest = partner_node(metric_info); + int sender = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true; unsigned int i; - tmp = *data.target = INIT_VALUE; + tmp = *metric_info->target = INIT_VALUE; + static int check_once = 0; + + if (!check_once) { + /* check to see whether sender and receiver are the same process */ + if (dest == metric_info->my_node) { + fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n", + dest); + } + /* hostname validation for all sender and receiver processes */ + int status = check_hostname_validation(metric_info); + if (status != 0) return; + check_once++; + } - if (data.my_node == PUT_IO_NODE) { - printf("\nPing-Pong shmem_long_p results:\n"); - print_results_header(); + if (metric_info->my_node == 0) { + printf("Ping-Pong shmem_long_p results:\n"); + print_latency_header(); } shmem_barrier_all(); - if (data.my_node == PUT_IO_NODE) { - for (i = 0; i < data.trials + data.warmup; i++) { - if(i == data.warmup) + if (sender) { + for (i = 0; i < metric_info->trials + metric_info->warmup; i++) { + if(i == metric_info->warmup) start = perf_shmemx_wtime(); - shmem_long_p(data.target, ++tmp, dest); - - shmem_long_wait_until(data.target, SHMEM_CMP_EQ, tmp); + shmem_long_p(metric_info->target, ++tmp, dest); + shmem_long_wait_until(metric_info->target, SHMEM_CMP_EQ, tmp); } end = perf_shmemx_wtime(); - - data.trials = data.trials*2; /*output half to get single round trip time*/ - calc_and_print_results(start, end, sizeof(long), data); + metric_info->trials = metric_info->trials * 2; /*output half to get single round trip time*/ + calc_and_print_results(start, end, sizeof(long), metric_info); } else { - for (i = 0; i < data.trials + data.warmup; i++) { - shmem_long_wait_until(data.target, SHMEM_CMP_EQ, ++tmp); - - shmem_long_p(data.target, tmp, dest); + for (i = 0; i < metric_info->trials + metric_info->warmup; i++) { + shmem_long_wait_until(metric_info->target, SHMEM_CMP_EQ, ++tmp); + shmem_long_p(metric_info->target, tmp, dest); } } -} /*gauge small put pathway round trip latency*/ +} /* gauge small put pathway round trip latency */ diff --git a/test/performance/shmem_perf_suite/shmem_bibw_atomics_perf.c b/test/performance/shmem_perf_suite/shmem_bibw_atomics_perf.c index f287646..05cebd8 100644 --- a/test/performance/shmem_perf_suite/shmem_bibw_atomics_perf.c +++ b/test/performance/shmem_perf_suite/shmem_bibw_atomics_perf.c @@ -27,13 +27,12 @@ /* ** -** This is a bandwidth centric test for put: back-to-back message rate +** This is a bandwidth centric test for atomic operations ** ** Features of Test: bi-directional bandwidth ** ** -by default megabytes/second results ** -**NOTE: this test assumes correctness of reduction algorithm */ #include @@ -42,14 +41,85 @@ do { \ double start = 0.0, end = 0.0; \ unsigned long int i = 0, j = 0, num_itr = metric_info->trials + metric_info->warmup; \ - int dest = partner_node(*metric_info); \ + int dest = partner_node(metric_info); \ shmem_barrier_all(); \ \ switch(op) { \ + case OP_SET: \ + for(i = 0; i < num_itr; i++) { \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ + \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_set( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + \ + shmem_quiet(); \ + } \ + if(snode) \ + end = perf_shmemx_wtime(); \ + break; \ + case OP_AND: \ + for(i = 0; i < num_itr; i++) { \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ + \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_and( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + \ + shmem_quiet(); \ + } \ + if(snode) \ + end = perf_shmemx_wtime(); \ + break; \ + case OP_OR: \ + for(i = 0; i < num_itr; i++) { \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ + \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_or( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + \ + shmem_quiet(); \ + } \ + if(snode) \ + end = perf_shmemx_wtime(); \ + break; \ + case OP_XOR: \ + for(i = 0; i < num_itr; i++) { \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ + \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_xor( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + \ + shmem_quiet(); \ + } \ + if(snode) \ + end = perf_shmemx_wtime(); \ + break; \ case OP_ADD: \ for(i = 0; i < num_itr; i++) { \ - if(snode && i == metric_info->warmup) \ - start = perf_shmemx_wtime(); \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ \ for(j = 0; j < metric_info->window_size; j++) \ shmem_##NAME##_atomic_add( \ @@ -62,8 +132,11 @@ break; \ case OP_INC: \ for(i = 0; i < num_itr; i++) { \ - if(snode && i == metric_info->warmup) \ - start = perf_shmemx_wtime(); \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ \ for(j = 0; j < metric_info->window_size; j++) \ shmem_##NAME##_atomic_inc( \ @@ -74,10 +147,73 @@ if(snode) \ end = perf_shmemx_wtime(); \ break; \ + case OP_FETCH: \ + for(i = 0; i < num_itr; i++) { \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ + \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch( \ + (TYPE *)(metric_info->dest), dest); \ + } \ + if(snode) \ + end = perf_shmemx_wtime(); \ + break; \ + case OP_FAND: \ + for(i = 0; i < num_itr; i++) { \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ + \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch_and( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + } \ + if(snode) \ + end = perf_shmemx_wtime(); \ + break; \ + case OP_FOR: \ + for(i = 0; i < num_itr; i++) { \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ + \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch_or( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + } \ + if(snode) \ + end = perf_shmemx_wtime(); \ + break; \ + case OP_FXOR: \ + for(i = 0; i < num_itr; i++) { \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ + \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch_xor( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + } \ + if(snode) \ + end = perf_shmemx_wtime(); \ + break; \ case OP_FADD: \ for(i = 0; i < num_itr; i++) { \ - if(snode && i == metric_info->warmup) \ - start = perf_shmemx_wtime(); \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ \ for(j = 0; j < metric_info->window_size; j++) \ shmem_##NAME##_atomic_fetch_add( \ @@ -88,8 +224,11 @@ break; \ case OP_FINC: \ for(i = 0; i < num_itr; i++) { \ - if(snode && i == metric_info->warmup) \ - start = perf_shmemx_wtime(); \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ \ for(j = 0; j < metric_info->window_size; j++) \ shmem_##NAME##_atomic_fetch_inc( \ @@ -100,8 +239,11 @@ break; \ case OP_SWAP: \ for(i = 0; i < num_itr; i++) { \ - if(snode && i == metric_info->warmup) \ - start = perf_shmemx_wtime(); \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ \ for(j = 0; j < metric_info->window_size; j++) \ shmem_##NAME##_atomic_swap( \ @@ -112,8 +254,11 @@ break; \ case OP_CSWAP: \ for(i = 0; i < num_itr; i++) { \ - if(snode && i == metric_info->warmup) \ - start = perf_shmemx_wtime(); \ + if (i == metric_info->warmup) { \ + shmem_barrier_all(); \ + if (snode) \ + start = perf_shmemx_wtime(); \ + } \ \ for(j = 0; j < metric_info->window_size; j++) \ shmem_##NAME##_atomic_compare_swap( \ @@ -128,69 +273,62 @@ break; \ } \ if(snode) \ - calc_and_print_results(end, start, len, *metric_info); \ + calc_and_print_results(end, start, len, metric_info); \ } while(0) #define NUM_INC 100 -typedef enum { - OP_ADD, - OP_INC, - OP_FADD, - OP_FINC, - OP_SWAP, - OP_CSWAP, - SIZE_OF_OP -} atomic_op_type; +static const char * atomic_op_names [] = { "fetch", "set", "cswap", "swap", "finc", "inc", + "fadd", "add", "fand", "and", "for", "or", + "fxor", "xor" }; -static const char * op_names [] = { "add", "inc", "fadd", "finc", "swap", "cswap" }; -static inline void bw_set_metric_info_len(perf_metrics_t *metric_info) +static inline void bw_set_metric_info_len(perf_metrics_t * const metric_info) { - unsigned int atomic_sizes[ATOMICS_N_DTs] = {sizeof(int), sizeof(long), - sizeof(long long)}; - int snode = streaming_node(*metric_info); - atomic_op_type op_type = OP_ADD; - metric_info->type = BI_DIR; - metric_info->bwstyle = STYLE_ATOMIC; - - for(op_type = OP_ADD; op_type < SIZE_OF_OP; op_type++) { - if(metric_info->my_node == 0 && op_type != OP_ADD) - printf("\nshmem_%s\n", op_names[op_type]); + unsigned int atomic_sizes[ATOMICS_N_DTs] = {sizeof(unsigned int), sizeof(unsigned long), + sizeof(unsigned long long)}; + metric_info->b_type = BI_DIR; + int snode = streaming_node(metric_info); + atomic_op_type op_type = OP_FETCH; + for(op_type = OP_FETCH; op_type < SIZE_OF_OP; op_type++) { + if(metric_info->my_node == 0) { + printf("\nshmem_%s\n", atomic_op_names[op_type]); + printf("-----------\n"); + } metric_info->start_len = atomic_sizes[0]; metric_info->max_len = atomic_sizes[0]; metric_info->size_inc = NUM_INC; shmem_barrier_all(); - bi_bw(atomic_sizes[0], metric_info, snode, int, int, op_type); + bi_bw(atomic_sizes[0], metric_info, snode, uint, unsigned int, op_type); metric_info->start_len = atomic_sizes[1]; metric_info->max_len = atomic_sizes[1]; shmem_barrier_all(); - bi_bw(atomic_sizes[1], metric_info, snode, long, long, op_type); + bi_bw(atomic_sizes[1], metric_info, snode, ulong, unsigned long, op_type); metric_info->start_len = atomic_sizes[2]; metric_info->max_len = atomic_sizes[2]; shmem_barrier_all(); - bi_bw(atomic_sizes[2], metric_info, snode, longlong, long long, op_type); + bi_bw(atomic_sizes[2], metric_info, snode, ulonglong, unsigned long long, op_type); } } -void bi_dir_bw(int len, perf_metrics_t *metric_info) +void bi_dir_bw(int len, perf_metrics_t * const metric_info) { bw_set_metric_info_len(metric_info); } int main(int argc, char *argv[]) { - bi_dir_bw_main(argc, argv); + bi_dir_bw_main(argc, argv, STYLE_ATOMIC); return 0; } diff --git a/test/performance/shmem_perf_suite/shmem_bibw_get_perf.c b/test/performance/shmem_perf_suite/shmem_bibw_get_perf.c index df82042..bdf0f17 100644 --- a/test/performance/shmem_perf_suite/shmem_bibw_get_perf.c +++ b/test/performance/shmem_perf_suite/shmem_bibw_get_perf.c @@ -33,7 +33,6 @@ ** ** -by default megabytes/second results ** -**NOTE: this test assumes correctness of reduction algorithm */ #include @@ -41,13 +40,13 @@ int main(int argc, char *argv[]) { - bi_dir_bw_main(argc,argv); + bi_dir_bw_main(argc, argv, STYLE_GET); return 0; } /* end of main() */ void -bi_dir_bw(int len, perf_metrics_t *metric_info) +bi_dir_bw(int len, perf_metrics_t * const metric_info) { bi_bw_get(len, metric_info); } diff --git a/test/performance/shmem_perf_suite/shmem_bibw_put_ctx_perf.c b/test/performance/shmem_perf_suite/shmem_bibw_put_ctx_perf.c index 4ddbed2..a6b6564 100644 --- a/test/performance/shmem_perf_suite/shmem_bibw_put_ctx_perf.c +++ b/test/performance/shmem_perf_suite/shmem_bibw_put_ctx_perf.c @@ -33,7 +33,6 @@ ** ** -by default megabytes/second results ** -**NOTE: this test assumes correctness of reduction algorithm */ #define ENABLE_OPENMP @@ -43,13 +42,13 @@ int main(int argc, char *argv[]) { - bi_dir_bw_main(argc, argv); + bi_dir_bw_main(argc, argv, STYLE_PUT); return 0; } void -bi_dir_bw(int len, perf_metrics_t *metric_info) +bi_dir_bw(int len, perf_metrics_t * const metric_info) { bi_bw_ctx(len, metric_info); } diff --git a/test/performance/shmem_perf_suite/shmem_bibw_put_perf.c b/test/performance/shmem_perf_suite/shmem_bibw_put_perf.c index 7f5589a..279536a 100644 --- a/test/performance/shmem_perf_suite/shmem_bibw_put_perf.c +++ b/test/performance/shmem_perf_suite/shmem_bibw_put_perf.c @@ -33,7 +33,6 @@ ** ** -by default megabytes/second results ** -**NOTE: this test assumes correctness of reduction algorithm */ #include @@ -41,13 +40,13 @@ int main(int argc, char *argv[]) { - bi_dir_bw_main(argc, argv); + bi_dir_bw_main(argc, argv, STYLE_PUT); return 0; } void -bi_dir_bw(int len, perf_metrics_t *metric_info) +bi_dir_bw(int len, perf_metrics_t * const metric_info) { bi_bw_put(len, metric_info); } diff --git a/test/performance/shmem_perf_suite/shmem_bw_atomics_perf.c b/test/performance/shmem_perf_suite/shmem_bw_atomics_perf.c index 79cd499..d48208b 100644 --- a/test/performance/shmem_perf_suite/shmem_bw_atomics_perf.c +++ b/test/performance/shmem_perf_suite/shmem_bw_atomics_perf.c @@ -27,135 +27,345 @@ /* ** -** This is a bandwidth centric test for put: back-to-back message rate +** This is a bandwidth centric test for atomic operations ** ** Features of Test: uni-directional bandwidth ** ** -by default megabytes/second results ** -**NOTE: this test assumes correctness of reduction algorithm */ #include #define ATOMIC_COMM_STYLE COMM_INCAST -#define uni_bw(len, metric_info, snode, NAME, TYPE, op) \ +#define uni_bw(len, metric_info, snode, NAME, TYPE, op) \ do { \ double start = 0.0, end = 0.0; \ - unsigned long int i = 0, j = 0, num_itr = metric_info->trials + metric_info->warmup; \ - int dest = partner_node(*metric_info); \ + unsigned long int i = 0, j = 0; \ + int dest = partner_node(metric_info); \ shmem_barrier_all(); \ \ - if(snode) { \ - switch(op) { \ - case OP_ADD: \ - for(i = 0; i < num_itr; i++) { \ - if(i == metric_info->warmup) \ - start = perf_shmemx_wtime(); \ + switch(op) { \ + case OP_SET: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_set( \ + (TYPE *)(metric_info->dest), ONE, dest); \ \ + shmem_quiet(); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ for(j = 0; j < metric_info->window_size; j++) \ - shmem_##NAME##_atomic_add( \ + shmem_##NAME##_atomic_set( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + \ + shmem_quiet(); \ + } \ + end = perf_shmemx_wtime(); \ + } \ + break; \ + case OP_AND: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_and( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + \ + shmem_quiet(); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_and( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + \ + shmem_quiet(); \ + } \ + end = perf_shmemx_wtime(); \ + } \ + break; \ + case OP_OR: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_or( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + \ + shmem_quiet(); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_or( \ (TYPE *)(metric_info->dest), ONE, dest); \ \ shmem_quiet(); \ + } \ + end = perf_shmemx_wtime(); \ + } \ + break; \ + case OP_XOR: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_xor( \ + (TYPE *)(metric_info->dest), ONE, dest); \ \ + shmem_quiet(); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_xor( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + \ + shmem_quiet(); \ } \ end = perf_shmemx_wtime(); \ - break; \ - case OP_INC: \ - for(i = 0; i < num_itr; i++) { \ - if(i == metric_info->warmup) \ - start = perf_shmemx_wtime(); \ + } \ + break; \ + case OP_ADD: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_add( \ + (TYPE *)(metric_info->dest), ONE, dest); \ \ + shmem_quiet(); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_add( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + \ + shmem_quiet(); \ + } \ + end = perf_shmemx_wtime(); \ + } \ + break; \ + case OP_INC: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ for(j = 0; j < metric_info->window_size; j++) \ shmem_##NAME##_atomic_inc( \ (TYPE *)(metric_info->dest), dest); \ \ shmem_quiet(); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_inc( \ + (TYPE *)(metric_info->dest), dest); \ \ + shmem_quiet(); \ } \ end = perf_shmemx_wtime(); \ - break; \ - case OP_FADD: \ - for(i = 0; i < num_itr; i++) { \ - if(i == metric_info->warmup) \ - start = perf_shmemx_wtime(); \ - \ + } \ + break; \ + case OP_FETCH: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch( \ + (TYPE *)(metric_info->dest), dest); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch( \ + (TYPE *)(metric_info->dest), dest); \ + } \ + end = perf_shmemx_wtime(); \ + } \ + break; \ + case OP_FAND: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch_and( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch_and( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + } \ + end = perf_shmemx_wtime(); \ + } \ + break; \ + case OP_FOR: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch_or( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch_or( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + } \ + end = perf_shmemx_wtime(); \ + } \ + break; \ + case OP_FXOR: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch_xor( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch_xor( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + } \ + end = perf_shmemx_wtime(); \ + } \ + break; \ + case OP_FADD: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch_add( \ + (TYPE *)(metric_info->dest), ONE, dest); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ for(j = 0; j < metric_info->window_size; j++) \ shmem_##NAME##_atomic_fetch_add( \ (TYPE *)(metric_info->dest), ONE, dest); \ } \ end = perf_shmemx_wtime(); \ - break; \ - case OP_FINC: \ - for(i = 0; i < num_itr; i++) { \ - if(i == metric_info->warmup) \ - start = perf_shmemx_wtime(); \ - \ + } \ + break; \ + case OP_FINC: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_fetch_inc( \ + (TYPE *)(metric_info->dest), dest); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ for(j = 0; j < metric_info->window_size; j++) \ shmem_##NAME##_atomic_fetch_inc( \ (TYPE *)(metric_info->dest), dest); \ } \ end = perf_shmemx_wtime(); \ - break; \ - case OP_SWAP: \ - for(i = 0; i < num_itr; i++) { \ - if(i == metric_info->warmup) \ - start = perf_shmemx_wtime(); \ - \ + } \ + break; \ + case OP_SWAP: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_swap( \ + (TYPE *)(metric_info->src), ONE, dest); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ for(j = 0; j < metric_info->window_size; j++) \ shmem_##NAME##_atomic_swap( \ (TYPE *)(metric_info->src), ONE, dest); \ } \ end = perf_shmemx_wtime(); \ - break; \ - case OP_CSWAP: \ - for(i = 0; i < num_itr; i++) { \ - if(i == metric_info->warmup) \ - start = perf_shmemx_wtime(); \ - \ + } \ + break; \ + case OP_CSWAP: \ + if(snode) { \ + for(i = 0; i < metric_info->warmup; i++) { \ + for(j = 0; j < metric_info->window_size; j++) \ + shmem_##NAME##_atomic_compare_swap( \ + (TYPE *)(metric_info->src), dest, ONE, dest); \ + } \ + } \ + shmem_barrier_all(); \ + if(snode) { \ + start = perf_shmemx_wtime(); \ + for(i = 0; i < metric_info->trials; i++) { \ for(j = 0; j < metric_info->window_size; j++) \ shmem_##NAME##_atomic_compare_swap( \ (TYPE *)(metric_info->src), dest, ONE, dest); \ } \ end = perf_shmemx_wtime(); \ - break; \ - default: \ - fprintf(stderr, "Error %d not a valid op case \ + } \ + break; \ + default: \ + fprintf(stderr, "Error %d not a valid op case \ for atomics\n", op); \ - break; \ - } \ - calc_and_print_results(end, start, len, *metric_info); \ + break; \ + } \ + if(snode) { \ + calc_and_print_results(end, start, len, metric_info); \ } \ } while(0) #define NUM_INC 100 +static const char * atomic_op_names [] = { "fetch", "set", "cswap", "swap", "finc", "inc", + "fadd", "add", "fand", "and", "for", "or", + "fxor", "xor" }; -typedef enum { - OP_ADD, - OP_INC, - OP_FADD, - OP_FINC, - OP_SWAP, - OP_CSWAP, - SIZE_OF_OP -} atomic_op_type; - -static const char * op_names [] = { "add", "inc", "fadd", "finc", "swap", "cswap" }; -static inline void bw_set_metric_info_len(perf_metrics_t *metric_info) +static inline void bw_set_metric_info_len(perf_metrics_t * const metric_info) { - unsigned int atomic_sizes[ATOMICS_N_DTs] = {sizeof(int), sizeof(long), - sizeof(long long)}; - metric_info->cstyle = ATOMIC_COMM_STYLE; - metric_info->type = UNI_DIR; - int snode = streaming_node(*metric_info); - atomic_op_type op_type = OP_ADD; + unsigned int atomic_sizes[ATOMICS_N_DTs] = {sizeof(unsigned int), sizeof(unsigned long), + sizeof(unsigned long long)}; + metric_info->b_type = UNI_DIR; + int snode = streaming_node(metric_info); + atomic_op_type op_type = OP_FETCH; - for(op_type = OP_ADD; op_type < SIZE_OF_OP; op_type++) { - if(metric_info->my_node == 0) - printf("\nshmem_%s\n", op_names[op_type]); + for(op_type = OP_FETCH; op_type < SIZE_OF_OP; op_type++) { + if(metric_info->my_node == 0) { + printf("\nshmem_%s\n", atomic_op_names[op_type]); + printf("-----------\n"); + } metric_info->start_len = atomic_sizes[0]; metric_info->max_len = atomic_sizes[0]; @@ -163,25 +373,25 @@ static inline void bw_set_metric_info_len(perf_metrics_t *metric_info) shmem_barrier_all(); - uni_bw(atomic_sizes[0], metric_info, snode, int, int, op_type); + uni_bw(atomic_sizes[0], metric_info, snode, uint, unsigned int, op_type); metric_info->start_len = atomic_sizes[1]; metric_info->max_len = atomic_sizes[1]; shmem_barrier_all(); - uni_bw(atomic_sizes[1], metric_info, snode, long, long, op_type); + uni_bw(atomic_sizes[1], metric_info, snode, ulong, unsigned long, op_type); metric_info->start_len = atomic_sizes[2]; metric_info->max_len = atomic_sizes[2]; shmem_barrier_all(); - uni_bw(atomic_sizes[2], metric_info, snode, longlong, long long, op_type); + uni_bw(atomic_sizes[2], metric_info, snode, ulonglong, unsigned long long, op_type); } } -void uni_dir_bw(int len, perf_metrics_t *metric_info) +void uni_dir_bw(int len, perf_metrics_t * const metric_info) { bw_set_metric_info_len(metric_info); } diff --git a/test/performance/shmem_perf_suite/shmem_bw_get_perf.c b/test/performance/shmem_perf_suite/shmem_bw_get_perf.c index 9c4d832..105558b 100644 --- a/test/performance/shmem_perf_suite/shmem_bw_get_perf.c +++ b/test/performance/shmem_perf_suite/shmem_bw_get_perf.c @@ -33,7 +33,6 @@ ** ** -by default megabytes/second results ** -**NOTE: this test assumes correctness of reduction algorithm */ #include @@ -47,7 +46,7 @@ int main(int argc, char *argv[]) } /* end of main() */ void -uni_dir_bw(int len, perf_metrics_t *metric_info) +uni_dir_bw(int len, perf_metrics_t * const metric_info) { uni_bw_get(len, metric_info); } diff --git a/test/performance/shmem_perf_suite/shmem_bw_put_ctx_perf.c b/test/performance/shmem_perf_suite/shmem_bw_put_ctx_perf.c index c4faaee..598da06 100644 --- a/test/performance/shmem_perf_suite/shmem_bw_put_ctx_perf.c +++ b/test/performance/shmem_perf_suite/shmem_bw_put_ctx_perf.c @@ -31,7 +31,6 @@ ** ** -by default megabytes/second results ** -**NOTE: this test assumes correctness of reduction algorithm */ #define ENABLE_OPENMP @@ -47,7 +46,7 @@ int main(int argc, char *argv[]) } void -uni_dir_bw(int len, perf_metrics_t *metric_info) +uni_dir_bw(int len, perf_metrics_t * const metric_info) { - uni_bw_ctx(len, metric_info, !streaming_node(*metric_info)); + uni_bw_ctx(len, metric_info, streaming_node(metric_info)); } diff --git a/test/performance/shmem_perf_suite/shmem_bw_put_perf.c b/test/performance/shmem_perf_suite/shmem_bw_put_perf.c index 3cbab8f..ca51571 100644 --- a/test/performance/shmem_perf_suite/shmem_bw_put_perf.c +++ b/test/performance/shmem_perf_suite/shmem_bw_put_perf.c @@ -33,7 +33,6 @@ ** ** -by default megabytes/second results ** -**NOTE: this test assumes correctness of reduction algorithm */ #include #include @@ -46,7 +45,7 @@ int main(int argc, char *argv[]) } void -uni_dir_bw(int len, perf_metrics_t *metric_info) +uni_dir_bw(int len, perf_metrics_t * const metric_info) { uni_bw_put(len, metric_info); } diff --git a/test/performance/shmem_perf_suite/shmem_latency_get_ctx_perf.c b/test/performance/shmem_perf_suite/shmem_latency_get_ctx_perf.c new file mode 100644 index 0000000..6556f20 --- /dev/null +++ b/test/performance/shmem_perf_suite/shmem_latency_get_ctx_perf.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018 Intel Corporation. All rights reserved. + * This software is available to you under the BSD license below: + * + * * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +** +** This is a latency test for get. +** +** Features of Test: latency using contexts driven by +** multiple threads. +** +** -in micro seconds +** +*/ + +#define ENABLE_OPENMP + +#include +#include + +int main(int argc, char *argv[]) +{ + latency_main_ctx(argc, argv, STYLE_GET); + + return 0; +} + +void +streaming_latency(int len, perf_metrics_t * const metric_info) +{ + streaming_get_latency_ctx(len, metric_info, streaming_node(metric_info)); +} diff --git a/test/performance/shmem_perf_suite/shmem_latency_get_perf.c b/test/performance/shmem_perf_suite/shmem_latency_get_perf.c index a7ce4a8..523b19f 100644 --- a/test/performance/shmem_perf_suite/shmem_latency_get_perf.c +++ b/test/performance/shmem_perf_suite/shmem_latency_get_perf.c @@ -42,49 +42,65 @@ int main(int argc, char *argv[]) { - latency_main(argc, argv); + latency_main(argc, argv, STYLE_GET); return 0; } /* end of main() */ void -long_element_round_trip_latency(perf_metrics_t data) +long_element_round_trip_latency(perf_metrics_t * const data) { +#ifndef USE_NONBLOCKING_API long_element_round_trip_latency_get(data); +#endif } void -int_element_latency(perf_metrics_t data) +int_element_latency(perf_metrics_t * const data) { +#ifndef USE_NONBLOCKING_API int_g_latency(data); +#endif } void -streaming_latency(int len, perf_metrics_t *data) +streaming_latency(int len, perf_metrics_t * const metric_info) { double start = 0.0; double end = 0.0; unsigned long int i = 0; - static int print_once = 0; - if(!print_once && data->my_node == GET_IO_NODE) { - printf("\nStreaming results for %d trials each of length %d through %d in"\ - " powers of %d\n", data->trials, data->start_len, - data->max_len, data->inc); - print_results_header(); - print_once++; + int dest = partner_node(metric_info); + int receiver = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true; + static int check_once = 0; + + if (!check_once) { + /* check to see whether sender and receiver are the same process */ + if (dest == metric_info->my_node) { + fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n", + dest); + } + /* hostname validation for all sender and receiver processes */ + int status = check_hostname_validation(metric_info); + if (status != 0) return; + check_once++; } - if (data->my_node == 0) { + shmem_barrier_all(); + if (receiver) { - for (i = 0; i < data->trials + data->warmup; i++) { - if(i == data->warmup) + for (i = 0; i < metric_info->trials + metric_info->warmup; i++) { + if(i == metric_info->warmup) start = perf_shmemx_wtime(); - - shmem_getmem(data->dest, data->src, len, 1); +#ifdef USE_NONBLOCKING_API + shmem_getmem_nbi(metric_info->dest, metric_info->src, len, dest); + shmem_quiet(); +#else + shmem_getmem(metric_info->dest, metric_info->src, len, dest); +#endif } end = perf_shmemx_wtime(); - calc_and_print_results(start, end, len, *data); + calc_and_print_results(start, end, len, metric_info); } } /* latency/bw for one-way trip */ diff --git a/test/performance/shmem_perf_suite/shmem_latency_nb_get_perf.c b/test/performance/shmem_perf_suite/shmem_latency_nb_get_perf.c deleted file mode 100644 index 7346580..0000000 --- a/test/performance/shmem_perf_suite/shmem_latency_nb_get_perf.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2018 Intel Corporation. All rights reserved. - * This software is available to you under the BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* -** -** Notice: micro benchmark ~ two nodes only -** -** Features of Test: -** 1) small get latency test -** 2) getmem latency test to calculate latency of various sizes -** -*/ - -#include - -int main(int argc, char *argv[]) -{ - latency_main(argc, argv); - - return 0; -} /* end of main() */ - -/* NO-OP for non-blocking */ -void -long_element_round_trip_latency(perf_metrics_t data) {} - -void -int_element_latency(perf_metrics_t data) {} - -void -streaming_latency(int len, perf_metrics_t *data) -{ - double start = 0.0; - double end = 0.0; - unsigned long int i = 0; - static int print_once = 0; - if(!print_once && data->my_node == GET_IO_NODE) { - printf("\nStreaming results for %d trials each of length %d through %d in"\ - " powers of %d\n", data->trials, data->start_len, - data->max_len, data->inc); - print_results_header(); - print_once++; - } - - if (data->my_node == 0) { - - for (i = 0; i < data->trials + data->warmup; i++) { - if(i == data->warmup) - start = perf_shmemx_wtime(); - - shmem_getmem_nbi(data->dest, data->src, len, 1); - shmem_quiet(); - } - end = perf_shmemx_wtime(); - - calc_and_print_results(start, end, len, *data); - } -} /* latency/bw for one-way trip */ diff --git a/test/performance/shmem_perf_suite/shmem_latency_nb_put_perf.c b/test/performance/shmem_perf_suite/shmem_latency_nb_put_perf.c deleted file mode 100644 index c9f4c3d..0000000 --- a/test/performance/shmem_perf_suite/shmem_latency_nb_put_perf.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2018 Intel Corporation. All rights reserved. - * This software is available to you under the BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* -** -** Notice: micro benchmark ~ two nodes only -** -** Features of Test: -** 1) small put pingpong latency test -** 2) one sided latency test to calculate latency of various sizes -** to the network stack -** -*/ - -#include - -int main(int argc, char *argv[]) -{ - latency_main(argc, argv); - - return 0; -} /* end of main() */ - - -/* NO-OP for non-blocking */ -void -long_element_round_trip_latency(perf_metrics_t data) {} - -void -int_element_latency(perf_metrics_t data) {} - -void -streaming_latency(int len, perf_metrics_t *data) -{ - double start = 0.0; - double end = 0.0; - unsigned long int i = 0; - static int print_once = 0; - if(!print_once && data->my_node == PUT_IO_NODE) { - printf("\nStreaming results for %d trials each of length %d through %d in"\ - " powers of %d\n", data->trials, data->start_len, - data->max_len, data->inc); - print_results_header(); - print_once++; - } - - /*puts to zero to match gets validation scheme*/ - if (data->my_node == 1) { - - for (i = 0; i < data->trials + data->warmup; i++) { - if(i == data->warmup) - start = perf_shmemx_wtime(); - - shmem_putmem_nbi(data->dest, data->src, len, 0); - shmem_quiet(); - - } - end = perf_shmemx_wtime(); - - calc_and_print_results(start, end, len, *data); - } -} /* latency/bw for one-way trip */ diff --git a/test/performance/shmem_perf_suite/shmem_latency_put_ctx_perf.c b/test/performance/shmem_perf_suite/shmem_latency_put_ctx_perf.c new file mode 100644 index 0000000..ffc6219 --- /dev/null +++ b/test/performance/shmem_perf_suite/shmem_latency_put_ctx_perf.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018 Intel Corporation. All rights reserved. + * This software is available to you under the BSD license below: + * + * * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +** +** This is a latency test for put. +** +** Features of Test: latency using contexts driven by +** multiple threads. +** +** -in micro seconds +** +*/ + +#define ENABLE_OPENMP + +#include +#include + +int main(int argc, char *argv[]) +{ + latency_main_ctx(argc, argv, STYLE_PUT); + + return 0; +} + +void +streaming_latency(int len, perf_metrics_t * const metric_info) +{ + streaming_put_latency_ctx(len, metric_info, streaming_node(metric_info)); +} diff --git a/test/performance/shmem_perf_suite/shmem_latency_put_perf.c b/test/performance/shmem_perf_suite/shmem_latency_put_perf.c index 97b2bd2..5cadc21 100644 --- a/test/performance/shmem_perf_suite/shmem_latency_put_perf.c +++ b/test/performance/shmem_perf_suite/shmem_latency_put_perf.c @@ -42,52 +42,67 @@ int main(int argc, char *argv[]) { - latency_main(argc, argv); + latency_main(argc, argv, STYLE_PUT); return 0; } /* end of main() */ void -long_element_round_trip_latency(perf_metrics_t data) +long_element_round_trip_latency(perf_metrics_t * const data) { +#ifndef USE_NONBLOCKING_API long_element_round_trip_latency_put(data); +#endif } void -int_element_latency(perf_metrics_t data) +int_element_latency(perf_metrics_t * const data) { +#ifndef USE_NONBLOCKING_API int_p_latency(data); +#endif } void -streaming_latency(int len, perf_metrics_t *data) +streaming_latency(int len, perf_metrics_t * const metric_info) { double start = 0.0; double end = 0.0; unsigned long int i = 0; - static int print_once = 0; - if(!print_once && data->my_node == PUT_IO_NODE) { - printf("\nStreaming results for %d trials each of length %d through %d in"\ - " powers of %d\n", data->trials, data->start_len, - data->max_len, data->inc); - print_results_header(); - print_once++; + int dest = partner_node(metric_info); + int sender = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true; + static int check_once = 0; + + if (!check_once) { + /* check to see whether sender and receiver are the same process */ + if (dest == metric_info->my_node) { + fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n", + dest); + } + /* hostname validation for all sender and receiver processes */ + int status = check_hostname_validation(metric_info); + if (status != 0) return; + check_once++; } + shmem_barrier_all(); /*puts to zero to match gets validation scheme*/ - if (data->my_node == 1) { - - for (i = 0; i < data->trials + data->warmup; i++) { - if(i == data->warmup) + if (sender) { + for (i = 0; i < metric_info->trials + metric_info->warmup; i++) { + if(i == metric_info->warmup) start = perf_shmemx_wtime(); - shmem_putmem(data->dest, data->src, len, 0); +#ifdef USE_NONBLOCKING_API + shmem_putmem_nbi(metric_info->dest, metric_info->src, len, dest); +#else + shmem_putmem(metric_info->dest, metric_info->src, len, dest); +#endif shmem_quiet(); } end = perf_shmemx_wtime(); - calc_and_print_results(start, end, len, *data); + calc_and_print_results(start, end, len, metric_info); } } /* latency/bw for one-way trip */ diff --git a/test/performance/shmem_perf_suite/target_put.h b/test/performance/shmem_perf_suite/target_put.h index 223a457..cac4b62 100644 --- a/test/performance/shmem_perf_suite/target_put.h +++ b/test/performance/shmem_perf_suite/target_put.h @@ -25,37 +25,49 @@ * SOFTWARE. */ -int static inline get_size_of_side(perf_metrics_t my_info) { - if(my_info.my_node < my_info.midpt) - return my_info.szinitiator; +static inline int get_size_of_side(const perf_metrics_t * const my_info) { + if(my_info->my_node < my_info->midpt) + return my_info->szinitiator; else - return my_info.sztarget; + return my_info->sztarget; } -int static inline get_num_partners(perf_metrics_t my_info) { - int unused_PEs = 0, num_partners = 0, num_xtra_partners = 0; +static inline int get_size_of_other_side(const perf_metrics_t * const my_info) { + if(my_info->my_node < my_info->midpt) + return my_info->sztarget; + else + return my_info->szinitiator; +} + +static inline int get_num_partners(perf_metrics_t * const my_info, int snode) { + int unused_PEs = 0, num_partners = 0; int active_PEs = get_size_of_side(my_info); + int other_side = get_size_of_other_side(my_info); - if(active_PEs == my_info.midpt) + if(active_PEs >= other_side) return 1; - unused_PEs = my_info.midpt - active_PEs; - num_partners = my_info.midpt / active_PEs; - num_xtra_partners = unused_PEs % active_PEs; + num_partners = other_side / active_PEs; + unused_PEs = other_side % active_PEs; - if((my_info.my_node % my_info.midpt) < num_xtra_partners) - num_partners++; + if (snode) { + if((my_info->my_node % active_PEs) < unused_PEs) + num_partners++; + } else { + if(((my_info->my_node - my_info->midpt) % active_PEs) < unused_PEs) + num_partners++; + } return num_partners; } /* target only needs to know num of partners */ -int static inline *get_initiators_partners(perf_metrics_t my_info, int num_partners) { - int node_to_shadow = my_info.my_node; +static inline int *get_initiators_partners(const perf_metrics_t * const my_info, int num_partners) { + int node_to_shadow = my_info->my_node; int i = 0; int *partner_nodes = NULL; - assert(my_info.cstyle == COMM_PAIRWISE && !target_node(my_info)); + assert(my_info->cstyle == COMM_PAIRWISE && !target_node(my_info)); if(num_partners < 1) return partner_nodes; @@ -63,37 +75,40 @@ int static inline *get_initiators_partners(perf_metrics_t my_info, int num_partn assert(partner_nodes); for(i = 0; i < num_partners; i++) { - partner_nodes[i] = ((node_to_shadow % my_info.sztarget) + my_info.midpt); - node_to_shadow += my_info.szinitiator; + partner_nodes[i] = ((node_to_shadow % my_info->sztarget) + my_info->midpt); + node_to_shadow += my_info->szinitiator; } return partner_nodes; } -void static inline target_data_uni_bw(int len, perf_metrics_t metric_info) +static inline void target_data_uni_bw(int len, perf_metrics_t * const metric_info) { double start = 0.0, end = 0.0; int i = 0; - unsigned long int j = 0; - int snode = (metric_info.num_pes != 1)? streaming_node(metric_info) : true; - int num_partners = get_num_partners(metric_info); + unsigned long int j, k; + int snode = (metric_info->num_pes != 1)? streaming_node(metric_info) : true; + int num_partners = get_num_partners(metric_info, snode); static int completion_signal = 0; int *my_PE_partners = (snode ? get_initiators_partners(metric_info, num_partners): NULL); + metric_info->num_partners = num_partners; shmem_barrier_all(); if (target_node(metric_info)) { shmem_int_wait_until(&completion_signal, SHMEM_CMP_EQ, num_partners); } else if (snode) { for (i = 0; i < num_partners; i++) { - for(j = 0; j < metric_info.warmup; j++) { + for(j = 0; j < metric_info->warmup; j++) { + for(k = 0; k < metric_info->window_size; k++) { #ifdef USE_NONBLOCKING_API - shmem_putmem_nbi(metric_info.dest, metric_info.src, len, my_PE_partners[i]); + shmem_putmem_nbi(metric_info->dest, metric_info->src, len, my_PE_partners[i]); #else - shmem_putmem(metric_info.dest, metric_info.src, len, my_PE_partners[i]); + shmem_putmem(metric_info->dest, metric_info->src, len, my_PE_partners[i]); #endif + } + shmem_quiet(); } - shmem_quiet(); shmem_int_atomic_inc(&completion_signal, my_PE_partners[i]); } } @@ -106,35 +121,29 @@ void static inline target_data_uni_bw(int len, perf_metrics_t metric_info) shmem_int_wait_until(&completion_signal, SHMEM_CMP_EQ, num_partners); } else if (snode) { for (i = 0; i < num_partners; i++) { - for(j = 0; j < metric_info.trials; j++) { + for(j = 0; j < metric_info->trials; j++) { + for(k = 0; k < metric_info->window_size; k++) { #ifdef USE_NONBLOCKING_API - shmem_putmem_nbi(metric_info.dest, metric_info.src, len, my_PE_partners[i]); + shmem_putmem_nbi(metric_info->dest, metric_info->src, len, my_PE_partners[i]); #else - shmem_putmem(metric_info.dest, metric_info.src, len, my_PE_partners[i]); + shmem_putmem(metric_info->dest, metric_info->src, len, my_PE_partners[i]); #endif + } + shmem_quiet(); } - shmem_quiet(); shmem_int_atomic_inc(&completion_signal, my_PE_partners[i]); } } - shmem_barrier_all(); if (snode || target_node(metric_info)) { end = perf_shmemx_wtime(); calc_and_print_results(end, start, len, metric_info); } + completion_signal = 0; free(my_PE_partners); } -void static inline target_bw_itr(int len, perf_metrics_t *metric_info) +static inline void target_bw_itr(int len, perf_metrics_t * const metric_info) { - target_data_uni_bw(len, *metric_info); - - metric_info->start_len = TARGET_SZ_MAX; - len = TARGET_SZ_MAX; - - target_data_uni_bw(len, *metric_info); - - /* stopping upper layer from iterating, we are done */ - metric_info->max_len = TARGET_SZ_MIN; + target_data_uni_bw(len, metric_info); } diff --git a/test/performance/shmem_perf_suite/uni_dir.h b/test/performance/shmem_perf_suite/uni_dir.h index c47aef5..d238077 100644 --- a/test/performance/shmem_perf_suite/uni_dir.h +++ b/test/performance/shmem_perf_suite/uni_dir.h @@ -26,15 +26,20 @@ */ #include -void static inline uni_bw_put(int len, perf_metrics_t *metric_info) +static inline void uni_bw_put(int len, perf_metrics_t *metric_info) { double start = 0.0, end = 0.0; unsigned long int i = 0, j = 0; - int dest = partner_node(*metric_info); - int snode = (metric_info->num_pes != 1)? streaming_node(*metric_info) : true; + int dest = partner_node(metric_info); + int snode = (metric_info->num_pes != 1)? streaming_node(metric_info) : true; static int check_once = 0; static int fin = -1; + if(metric_info->target_data) { + target_bw_itr(len, metric_info); + return; + } + if (!check_once) { /* check to see whether sender and receiver are the same process */ if (dest == metric_info->my_node) { @@ -42,16 +47,11 @@ void static inline uni_bw_put(int len, perf_metrics_t *metric_info) dest); } /* hostname validation for all sender and receiver processes */ - int status = check_hostname_validation(*metric_info); + int status = check_hostname_validation(metric_info); if (status != 0) return; check_once++; } - if(metric_info->target_data) { - target_bw_itr(len, metric_info); - return; - } - shmem_barrier_all(); if (snode) { @@ -83,22 +83,27 @@ void static inline uni_bw_put(int len, perf_metrics_t *metric_info) shmem_int_p(&fin, 1, dest); shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 0); end = perf_shmemx_wtime(); - calc_and_print_results(end, start, len, *metric_info); + calc_and_print_results(end, start, len, metric_info); } else { shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 1); shmem_int_p(&fin, 0, dest); } } -void static inline uni_bw_get(int len, perf_metrics_t *metric_info) +static inline void uni_bw_get(int len, perf_metrics_t *metric_info) { double start = 0.0, end = 0.0; unsigned long int i = 0, j = 0; - int dest = partner_node(*metric_info); - int snode = (metric_info->num_pes != 1)? streaming_node(*metric_info) : true; + int dest = partner_node(metric_info); + int snode = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true; static int check_once = 0; static int fin = -1; + if(metric_info->target_data) { + target_bw_itr(len, metric_info); + return; + } + if (!check_once) { /* check to see whether sender and receiver are the same process */ if (dest == metric_info->my_node) { @@ -106,16 +111,11 @@ void static inline uni_bw_get(int len, perf_metrics_t *metric_info) dest); } /* hostname validation for all sender and receiver processes */ - int status = check_hostname_validation(*metric_info); + int status = check_hostname_validation(metric_info); if (status != 0) return; check_once++; } - if(metric_info->target_data) { - target_bw_itr(len, metric_info); - return; - } - shmem_barrier_all(); if (snode) { @@ -154,7 +154,7 @@ void static inline uni_bw_get(int len, perf_metrics_t *metric_info) shmem_int_p(&fin, 1, dest); shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 0); end = perf_shmemx_wtime(); - calc_and_print_results(end, start, len, *metric_info); + calc_and_print_results(end, start, len, metric_info); } else { shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 1); shmem_int_p(&fin, 0, dest); diff --git a/test/performance/shmem_perf_suite/uni_dir_ctx.h b/test/performance/shmem_perf_suite/uni_dir_ctx.h index fd46960..211906f 100644 --- a/test/performance/shmem_perf_suite/uni_dir_ctx.h +++ b/test/performance/shmem_perf_suite/uni_dir_ctx.h @@ -25,26 +25,22 @@ * SOFTWARE. */ - -void static inline uni_bw_ctx(int len, perf_metrics_t *metric_info, - int streaming_node) +static inline +void uni_bw_ctx(int len, perf_metrics_t *metric_info, int streaming_node) { double start = 0.0, end = 0.0; - int j = 0; - int dest = partner_node(*metric_info); - char *src = aligned_buffer_alloc(metric_info->nthreads * len); - char *dst = aligned_buffer_alloc(metric_info->nthreads * len); - assert(src && dst); + unsigned long int i, j; + int dest = partner_node(metric_info); static int check_once = 0; if (!check_once) { /* check to see whether sender and receiver are the same process */ if (dest == metric_info->my_node) { - fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n", - dest); + fprintf(stderr, "Warning: Sender and receiver are the same " + "process (%d)\n", dest); } /* hostname validation for all sender and receiver processes */ - int status = check_hostname_validation(*metric_info); + int status = check_hostname_validation(metric_info); if (status != 0) return; check_once++; } @@ -52,10 +48,9 @@ void static inline uni_bw_ctx(int len, perf_metrics_t *metric_info, shmem_barrier_all(); if (streaming_node) { -#pragma omp parallel default(none) firstprivate(len, dest) private(j) \ - shared(metric_info, src, dst, start, end) num_threads(metric_info->nthreads) +#pragma omp parallel default(none) firstprivate(len, dest) private(i, j) \ +shared(metric_info, start, end) num_threads(metric_info->nthreads) { - int i; const int thread_id = omp_get_thread_num(); shmem_ctx_t ctx; shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx); @@ -63,9 +58,11 @@ void static inline uni_bw_ctx(int len, perf_metrics_t *metric_info, for (i = 0; i < metric_info->warmup; i++) { for (j = 0; j < metric_info->window_size; j++) { #ifdef USE_NONBLOCKING_API - shmem_ctx_putmem_nbi(ctx, dst + thread_id * len, src + thread_id * len, len, dest); + shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); #else - shmem_ctx_putmem(ctx, dst + thread_id * len, src + thread_id * len, len, dest); + shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); #endif } shmem_ctx_quiet(ctx); @@ -76,10 +73,9 @@ void static inline uni_bw_ctx(int len, perf_metrics_t *metric_info, shmem_barrier_all(); if (streaming_node) { -#pragma omp parallel default(none) firstprivate(len, dest) private(j) \ - shared(metric_info, src, dst, start, end) num_threads(metric_info->nthreads) +#pragma omp parallel default(none) firstprivate(len, dest) private(i, j) \ +shared(metric_info, start, end) num_threads(metric_info->nthreads) { - int i; const int thread_id = omp_get_thread_num(); shmem_ctx_t ctx; shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx); @@ -93,9 +89,11 @@ void static inline uni_bw_ctx(int len, perf_metrics_t *metric_info, for (i = 0; i < metric_info->trials; i++) { for (j = 0; j < metric_info->window_size; j++) { #ifdef USE_NONBLOCKING_API - shmem_ctx_putmem_nbi(ctx, dst + thread_id * len, src + thread_id * len, len, dest); + shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); #else - shmem_ctx_putmem(ctx, dst + thread_id * len, src + thread_id * len, len, dest); + shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, + metric_info->src + thread_id * len, len, dest); #endif } shmem_ctx_quiet(ctx); @@ -107,12 +105,8 @@ void static inline uni_bw_ctx(int len, perf_metrics_t *metric_info, shmem_barrier_all(); if (streaming_node) { end = perf_shmemx_wtime(); - calc_and_print_results(end, start, len, *metric_info); + calc_and_print_results(end, start, len, metric_info); } shmem_barrier_all(); - - aligned_buffer_free(src); - aligned_buffer_free(dst); - } diff --git a/test/shmemx/Makefile.am b/test/shmemx/Makefile.am index 091a7b9..2133897 100644 --- a/test/shmemx/Makefile.am +++ b/test/shmemx/Makefile.am @@ -26,7 +26,8 @@ endif if HAVE_PTHREADS if SHMEMX_TESTS check_PROGRAMS += \ - gettid_register + gettid_register \ + perf_counter endif endif diff --git a/test/shmemx/gettid_register.c b/test/shmemx/gettid_register.c index 2e7c82d..4697dfd 100644 --- a/test/shmemx/gettid_register.c +++ b/test/shmemx/gettid_register.c @@ -46,7 +46,13 @@ pthread_key_t key; static uint64_t my_gettid(void) { uint64_t tid_val = 0; - tid_val = * (uint64_t*) pthread_getspecific(key); + void* ret = pthread_getspecific(key); + if (ret != NULL) + tid_val = * (uint64_t*) ret; + else { + printf("Calling pthread_getspecific(key) returned NULL\n"); + shmem_global_exit(3); + } return tid_val; } diff --git a/test/shmemx/perf_counter.c b/test/shmemx/perf_counter.c new file mode 100644 index 0000000..e6ee36f --- /dev/null +++ b/test/shmemx/perf_counter.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2018 Intel Corporation. All rights reserved. + * This software is available to you under the BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Single-threaded test for validation of performance counter APIs +*/ + +#include +#include +#include +#include + +#define ITER 100 +#define WINDOW 64 +#define LENGTH 1024 + +int me, npes; +char *src_array, *dest_array; +uint64_t c_put, c_get, p_put, p_get, target; + +static void collect(shmem_ctx_t ctx) { + shmemx_pcntr_get_completed_write(ctx, &c_put); + shmemx_pcntr_get_completed_read(ctx, &c_get); + shmemx_pcntr_get_completed_target(&target); + shmemx_pcntr_get_issued_write(ctx, &p_put); + shmemx_pcntr_get_issued_read(ctx, &p_get); +} + +static void put_and_progress_check(void) { + int i, j; + int partner = ((npes % 2 == 0) ? (me % 2 == 0 ? me + 1 : me - 1) : + (me % 2 != 0 ? me - 1 : + (me == npes - 1) ? me : me + 1)); + + shmem_ctx_t ctx; + shmemx_pcntr_t pcntr; + shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx); + + for (i = 0; i < ITER; i++) { + for (j = 0; j < WINDOW; j++) { + shmem_ctx_putmem_nbi(ctx, dest_array, src_array, LENGTH, partner); + collect(ctx); + } + shmem_ctx_quiet(ctx); + } + + shmemx_pcntr_get_all(ctx, &pcntr); + shmem_ctx_destroy(ctx); + + /* Report the counter values observed through get_all API after the loop + * completion. Except the target counter, other counter values should + * reflect the final expected value */ + printf("Value observed of the performance counters from combined API: \n" + "Completed Put = %10"PRIu64"\n" + "Completed Get = %10"PRIu64"\n" + "Issued Put = %10"PRIu64"\n" + "Issued Get = %10"PRIu64"\n" + "Target = %10"PRIu64"\n" + , pcntr.completed_put, pcntr.completed_get, pcntr.pending_put, + pcntr.pending_get, pcntr.target); + + return; +} + +int main(int argc, char **argv) { + + shmem_init(); + + me = shmem_my_pe(); + npes = shmem_n_pes(); + + src_array = shmem_malloc(LENGTH); + dest_array = shmem_malloc(LENGTH); + + if (me == 0) { + printf("Performance counter API test with %d PEs\n", npes); + } + + put_and_progress_check(); + shmem_barrier_all(); + + /* Report the counter values observed through single parameter APIs in + * the final iteration. The values reported here may be less than the actual + * final value as they are captured before the barrier one counter at a time + * */ + printf("Final value observed of the performance counters from individual APIs: \n" + "Completed Put = %10"PRIu64"\n" + "Completed Get = %10"PRIu64"\n" + "Issued Put = %10"PRIu64"\n" + "Issued Get = %10"PRIu64"\n" + "Target = %10"PRIu64"\n" + , c_put, c_get, p_put, p_get, target); + + shmem_free(dest_array); + shmem_free(src_array); + + shmem_finalize(); + return 0; +} diff --git a/test/unit/Makefile.am b/test/unit/Makefile.am index d7a7580..7649c6d 100644 --- a/test/unit/Makefile.am +++ b/test/unit/Makefile.am @@ -61,7 +61,6 @@ check_PROGRAMS = \ lfinc \ shmem_info \ query_thread \ - global_exit \ asym_alloc \ set_fetch \ alltoall \ @@ -99,6 +98,12 @@ check_PROGRAMS = \ many-ctx \ shmem_test +# Temporarily disabled: Global exit test tends to fail with MPI-PMI +if !USE_PMI_MPI +check_PROGRAMS += \ + global_exit +endif + if ENABLE_PROFILING check_PROGRAMS += \ rma_coverage_pshmem @@ -182,9 +187,9 @@ rma_coverage_pshmem_CFLAGS = -DTEST_PSHMEM query_thread_funneled_SOURCES = query_thread.c query_thread_funneled_CFLAGS = -DENABLE_THREADS -mt_a2a_SOURCES = mt_a2a.c pthread_barrier.h +mt_a2a_SOURCES = mt_a2a.c mt_a2a_LDFLAGS = $(AM_LDFLAGS) $(PTHREAD_LIBS) -mt_a2a_CFLAGS = -I$(top_srcdir)/test/unit $(PTHREAD_CFLAGS) +mt_a2a_CFLAGS = -I$(top_srcdir)/test/include $(PTHREAD_CFLAGS) mt_a2a_LDADD = $(LDADD) $(PTHREAD_CFLAGS) mt_contention_LDFLAGS = $(AM_LDFLAGS) $(PTHREAD_LIBS) @@ -197,11 +202,11 @@ mt_lock_test_CFLAGS = $(PTHREAD_CFLAGS) mt_lock_test_LDADD = $(LDADD) $(PTHREAD_CFLAGS) mt_membar_LDFLAGS = $(AM_LDFLAGS) $(PTHREAD_LIBS) -mt_membar_CFLAGS = -I$(top_srcdir)/test/unit $(PTHREAD_CFLAGS) +mt_membar_CFLAGS = -I$(top_srcdir)/test/include $(PTHREAD_CFLAGS) mt_membar_LDADD = $(LDADD) $(PTHREAD_CFLAGS) threading_LDFLAGS = $(AM_LDFLAGS) $(PTHREAD_LIBS) -threading_CFLAGS = -I$(top_srcdir)/test/unit $(PTHREAD_CFLAGS) +threading_CFLAGS = -I$(top_srcdir)/test/include $(PTHREAD_CFLAGS) threading_LDADD = $(LDADD) $(PTHREAD_CFLAGS) web_LDFLAGS = $(AM_LDFLAGS) $(PTHREAD_LIBS) diff --git a/test/unit/reduce_active_set.c b/test/unit/reduce_active_set.c index b307342..1a05788 100644 --- a/test/unit/reduce_active_set.c +++ b/test/unit/reduce_active_set.c @@ -58,7 +58,7 @@ int main(void) for (i = 0; i < SHMEM_REDUCE_SYNC_SIZE; i++) { max_psync[i] = SHMEM_SYNC_VALUE; - max_psync[i] = SHMEM_SYNC_VALUE; + min_psync[i] = SHMEM_SYNC_VALUE; } if (me == 0)