diff --git a/configure.ac b/configure.ac
index 16b31d5..3bf34c5 100755
--- a/configure.ac
+++ b/configure.ac
@@ -13,7 +13,7 @@
 
 dnl Init Autoconf/Automake/Libtool
 
-AC_INIT([Sandia OpenSHMEM Test Suite], [1.4.1], [https://github.com/Sandia-OpenSHMEM/SOS])
+AC_INIT([Sandia OpenSHMEM Test Suite], [1.4.2], [https://github.com/Sandia-OpenSHMEM/SOS])
 AC_PREREQ([2.60])
 AC_CONFIG_AUX_DIR([config])
 AC_CONFIG_MACRO_DIR([config])
@@ -76,7 +76,7 @@ AM_CONDITIONAL([HAVE_OPENMP], [test "$enable_threads" != "no" -a "$enable_openmp
 
 AC_ARG_ENABLE([lengthy-tests],
     [AC_HELP_STRING([--enable-lengthy-tests],
-                    [Enable long running tests in the test suite (default: disabled)])])
+                    [Execute long running tests as part of "make check" (default: disabled)])])
 AM_CONDITIONAL([ENABLE_LENGTHY_TESTS], [test "$enable_lengthy_tests" = "yes"])
 
 AC_ARG_ENABLE([fortran],
@@ -185,6 +185,7 @@ AM_CONDITIONAL([HAVE_CXX], [test "$enable_cxx" != "no" ])
 AM_CONDITIONAL([ENABLE_PROFILING], [test "$enable_profiling" = "yes" ])
 
 dnl make tests work in standalone mode
+AM_CONDITIONAL([USE_PMI_MPI], [false])
 AM_CONDITIONAL([USE_PMI_SIMPLE], [false])
 AM_CONDITIONAL([USE_PORTALS4], [false])
 AM_CONDITIONAL([HAVE_LONG_FORTRAN_HEADER], [false])
diff --git a/test/apps/Makefile.am b/test/apps/Makefile.am
index 7c8e91b..22dd555 100644
--- a/test/apps/Makefile.am
+++ b/test/apps/Makefile.am
@@ -54,5 +54,5 @@ LDADD += $(top_builddir)/pmi-simple/libpmi_simple.la
 endif
 
 mandelbrot_LDFLAGS = $(AM_LDFLAGS) $(PTHREAD_LIBS)
-mandelbrot_CFLAGS = -I$(top_srcdir)/test/unit $(PTHREAD_CFLAGS)
+mandelbrot_CFLAGS = -I$(top_srcdir)/test/include $(PTHREAD_CFLAGS)
 mandelbrot_LDADD = $(LDADD) $(PTHREAD_CFLAGS)
diff --git a/test/apps/gups.c b/test/apps/gups.c
index f7fc03e..7ed085f 100644
--- a/test/apps/gups.c
+++ b/test/apps/gups.c
@@ -178,7 +178,7 @@
 #define ZERO64B 0LL
 
 uint64_t TotalMemOpt = 8192;
-int NumUpdatesOpt = 0; /* FIXME: This option is ignored */
+uint64_t NumUpdatesOpt = 0;
 double SHMEMGUPs;
 double SHMEMRandomAccess_ErrorsFraction;
 double SHMEMRandomAccess_time;
@@ -324,9 +324,7 @@ SHMEMRandomAccess(void)
   double TotalMem;
   static int sAbort, rAbort;
 
-  uint64_t NumUpdates_Default; /* Number of updates to table (suggested: 4x number of table entries) */
-  uint64_t NumUpdates;  /* actual number of updates to table - may be smaller than
-                       * NumUpdates_Default due to execution time bounds */
+  uint64_t NumUpdates; /* total number of updates to table */
   uint64_t ProcNumUpdates; /* number of updates per processor */
 
   static long pSync_bcast[SHMEM_BCAST_SYNC_SIZE];
@@ -421,9 +419,13 @@ SHMEMRandomAccess(void)
       HPCC_PELock[i] = 0;
 
   /* Default number of global updates to table: 4x number of table entries */
-  NumUpdates_Default = 4 * TableSize;
-  ProcNumUpdates = 4 * LocalTableSize;
-  NumUpdates = NumUpdates_Default;
+  if (NumUpdatesOpt == 0) {
+     ProcNumUpdates = 4 * LocalTableSize;
+     NumUpdates = 4 * TableSize;
+  } else {
+     ProcNumUpdates = NumUpdatesOpt;
+     NumUpdates = NumUpdatesOpt * NumProcs;
+  }
 
   if (MyProc == 0) {
     fprintf( outFile, "Running on %d processors\n", NumProcs);
@@ -432,7 +434,7 @@ SHMEMRandomAccess(void)
     fprintf( outFile, "PE Main table size = (2^%" PRIu64 ")/%d  = %" PRIu64 " words/PE MAX\n",
              logTableSize, NumProcs, LocalTableSize);
 
-    fprintf( outFile, "Default number of updates (RECOMMENDED) = %" PRIu64 "\n", NumUpdates_Default);
+    fprintf( outFile, "Total number of updates = %" PRIu64 "\n", NumUpdates);
   }
 
   /* Initialize main table */
diff --git a/test/apps/mandelbrot.c b/test/apps/mandelbrot.c
index 0c2c112..09a98f7 100644
--- a/test/apps/mandelbrot.c
+++ b/test/apps/mandelbrot.c
@@ -193,8 +193,8 @@ static void *thread_worker(void *arg) {
     // Malloc local (non-symmetric) buffers
     pixels[0] = malloc(sizeof(int)*job_points);
     pixels[1] = malloc(sizeof(int)*job_points);
-    pe_mask = malloc(sizeof(int)*npes);
-    pe_ct_max = malloc(sizeof(int)*npes);
+    pe_mask = calloc(npes, sizeof(int));
+    pe_ct_max = calloc(npes, sizeof(int));
 
     if (NULL == pixels[0] || NULL == pixels[1] || NULL == pe_mask || NULL == pe_ct_max) {
         printf("%d, %d: Error, thread malloc failed\n", me, tid);
diff --git a/test/include/Makefile.am b/test/include/Makefile.am
index e9d484e..bf93767 100644
--- a/test/include/Makefile.am
+++ b/test/include/Makefile.am
@@ -12,4 +12,5 @@
 # distribution.
 
 noinst_HEADERS = \
-	uthash.h
+	uthash.h \
+	pthread_barrier.h
diff --git a/test/unit/pthread_barrier.h b/test/include/pthread_barrier.h
similarity index 100%
rename from test/unit/pthread_barrier.h
rename to test/include/pthread_barrier.h
diff --git a/test/performance/shmem_perf_suite/Makefile.am b/test/performance/shmem_perf_suite/Makefile.am
index 104a67e..ef9134f 100644
--- a/test/performance/shmem_perf_suite/Makefile.am
+++ b/test/performance/shmem_perf_suite/Makefile.am
@@ -9,7 +9,9 @@
 
 check_PROGRAMS = \
 	shmem_latency_put_perf \
+	shmem_latency_put_perf_nb \
 	shmem_latency_get_perf \
+	shmem_latency_get_perf_nb \
 	shmem_bw_put_perf \
 	shmem_bw_put_perf_nb \
 	shmem_bibw_put_perf \
@@ -18,8 +20,6 @@ check_PROGRAMS = \
 	shmem_bw_get_perf_nb \
 	shmem_bibw_get_perf \
 	shmem_bibw_get_perf_nb \
-	shmem_latency_nb_put_perf \
-	shmem_latency_nb_get_perf \
 	shmem_bw_atomics_perf \
 	shmem_bibw_atomics_perf
 
@@ -28,7 +28,11 @@ check_PROGRAMS += \
 	shmem_bw_put_ctx_perf \
 	shmem_bw_put_ctx_perf_nb \
 	shmem_bibw_put_ctx_perf \
-	shmem_bibw_put_ctx_perf_nb
+	shmem_bibw_put_ctx_perf_nb \
+	shmem_latency_put_ctx_perf \
+	shmem_latency_put_ctx_perf_nb \
+	shmem_latency_get_ctx_perf \
+	shmem_latency_get_ctx_perf_nb 
 endif
 
 noinst_HEADERS = \
@@ -41,7 +45,8 @@ noinst_HEADERS = \
 	uni_dir_ctx.h \
 	bi_dir.h \
 	bi_dir_ctx.h \
-	target_put.h
+	target_put.h \
+	latency_ctx.h
 
 if ENABLE_LENGTHY_TESTS
 TESTS = $(check_PROGRAMS)
@@ -65,6 +70,12 @@ if USE_PMI_SIMPLE
 LDADD += $(top_builddir)/pmi-simple/libpmi_simple.la
 endif
 
+shmem_latency_put_perf_nb_SOURCES = shmem_latency_put_perf.c
+shmem_latency_put_perf_nb_CFLAGS = -DUSE_NONBLOCKING_API
+
+shmem_latency_get_perf_nb_SOURCES = shmem_latency_get_perf.c
+shmem_latency_get_perf_nb_CFLAGS = -DUSE_NONBLOCKING_API
+
 shmem_bw_put_perf_nb_SOURCES = shmem_bw_put_perf.c
 shmem_bw_put_perf_nb_CFLAGS = -DUSE_NONBLOCKING_API
 
@@ -86,3 +97,15 @@ shmem_bibw_put_ctx_perf_CFLAGS = $(AM_OPENMP_CFLAGS)
 
 shmem_bibw_put_ctx_perf_nb_SOURCES = shmem_bibw_put_ctx_perf.c
 shmem_bibw_put_ctx_perf_nb_CFLAGS = $(AM_OPENMP_CFLAGS) -DUSE_NONBLOCKING_API
+
+shmem_latency_put_ctx_perf_CFLAGS = $(AM_OPENMP_CFLAGS)
+
+shmem_latency_put_ctx_perf_nb_SOURCES = shmem_latency_put_ctx_perf.c
+shmem_latency_put_ctx_perf_nb_CFLAGS = $(AM_OPENMP_CFLAGS) -DUSE_NONBLOCKING_API
+
+shmem_latency_get_ctx_perf_CFLAGS = $(AM_OPENMP_CFLAGS)
+
+shmem_latency_get_ctx_perf_nb_SOURCES = shmem_latency_get_ctx_perf.c
+shmem_latency_get_ctx_perf_nb_CFLAGS = $(AM_OPENMP_CFLAGS) -DUSE_NONBLOCKING_API
+
+AM_CPPFLAGS += -DENABLE_THREADS
diff --git a/test/performance/shmem_perf_suite/bi_dir.h b/test/performance/shmem_perf_suite/bi_dir.h
index d46437e..d857dfe 100644
--- a/test/performance/shmem_perf_suite/bi_dir.h
+++ b/test/performance/shmem_perf_suite/bi_dir.h
@@ -25,10 +25,10 @@
  * SOFTWARE.
  */
 
-void static inline bi_bw_put(int len, perf_metrics_t *metric_info)
+static inline void bi_bw_put(int len, perf_metrics_t * const metric_info)
 {
     double start = 0.0, end = 0.0;
-    int dest = partner_node(*metric_info);
+    int dest = partner_node(metric_info);
     unsigned long int i = 0, j = 0;
     static int check_once = 0;
     static int fin = -1;
@@ -40,7 +40,7 @@ void static inline bi_bw_put(int len, perf_metrics_t *metric_info)
                              dest);
         }
         /* hostname validation for all sender and receiver processes */
-        int status = check_hostname_validation(*metric_info);
+        int status = check_hostname_validation(metric_info);
         if (status != 0) return;
         check_once++;
     }
@@ -59,7 +59,7 @@ void static inline bi_bw_put(int len, perf_metrics_t *metric_info)
     }
 
     shmem_barrier_all();
-    if (streaming_node(*metric_info)) {
+    if (streaming_node(metric_info)) {
         start = perf_shmemx_wtime();
     }
 
@@ -74,11 +74,11 @@ void static inline bi_bw_put(int len, perf_metrics_t *metric_info)
         shmem_quiet();
     }
 
-    if (streaming_node(*metric_info)) {
+    if (streaming_node(metric_info)) {
         shmem_int_p(&fin, 1, dest);
         shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 0);
         end = perf_shmemx_wtime();
-        calc_and_print_results(end, start, len, *metric_info);
+        calc_and_print_results(end, start, len, metric_info);
     } else {
         shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 1);
         shmem_int_p(&fin, 0, dest);
@@ -86,10 +86,10 @@ void static inline bi_bw_put(int len, perf_metrics_t *metric_info)
 
 }
 
-void static inline bi_bw_get(int len, perf_metrics_t *metric_info)
+static inline void bi_bw_get(int len, perf_metrics_t * const metric_info)
 {
     double start = 0.0, end = 0.0;
-    int dest = partner_node(*metric_info);
+    int dest = partner_node(metric_info);
     unsigned long int i = 0, j = 0;
     static int check_once = 0;
     static int fin = -1;
@@ -101,7 +101,7 @@ void static inline bi_bw_get(int len, perf_metrics_t *metric_info)
                              dest);
         }
         /* hostname validation for all sender and receiver processes */
-        int status = check_hostname_validation(*metric_info);
+        int status = check_hostname_validation(metric_info);
         if (status != 0) return;
         check_once++;
     }
@@ -125,7 +125,7 @@ void static inline bi_bw_get(int len, perf_metrics_t *metric_info)
     }
 
     shmem_barrier_all();
-    if (streaming_node(*metric_info)) {
+    if (streaming_node(metric_info)) {
         start = perf_shmemx_wtime();
     }
 
@@ -145,11 +145,11 @@ void static inline bi_bw_get(int len, perf_metrics_t *metric_info)
 #endif
     } 
 
-    if (streaming_node(*metric_info)) {
+    if (streaming_node(metric_info)) {
         shmem_int_p(&fin, 1, dest);
         shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 0);
         end = perf_shmemx_wtime();
-        calc_and_print_results(end, start, len, *metric_info);
+        calc_and_print_results(end, start, len, metric_info);
     } else {
         shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 1);
         shmem_int_p(&fin, 0, dest);
diff --git a/test/performance/shmem_perf_suite/bi_dir_ctx.h b/test/performance/shmem_perf_suite/bi_dir_ctx.h
index 8b9fe10..07003c6 100644
--- a/test/performance/shmem_perf_suite/bi_dir_ctx.h
+++ b/test/performance/shmem_perf_suite/bi_dir_ctx.h
@@ -26,34 +26,30 @@
 */
 
 
-void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info)
+static inline void bi_bw_ctx (int len, perf_metrics_t *metric_info)
 {
     double start = 0.0, end = 0.0;
-    int dest = partner_node(*metric_info);
-    int j = 0;
-    char *src = aligned_buffer_alloc(metric_info->nthreads * len);
-    char *dst = aligned_buffer_alloc(metric_info->nthreads * len);
-    assert(src && dst);
+    int dest = partner_node(metric_info);
+    unsigned long int i, j;
     static int check_once = 0;
 
     if (!check_once) {
         /* check to see whether sender and receiver are the same process */
         if (dest == metric_info->my_node) {
-            fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n", 
-                             dest);
+            fprintf(stderr, "Warning: Sender and receiver are the same "
+                            "process (%d)\n", dest);
         }
         /* hostname validation for all sender and receiver processes */
-        int status = check_hostname_validation(*metric_info);
+        int status = check_hostname_validation(metric_info);
         if (status != 0) return;
         check_once++;
     }
 
     shmem_barrier_all();
 
-#pragma omp parallel default(none) firstprivate(len, dest) private(j) \
-    shared(metric_info, src, dst, start, end) num_threads(metric_info->nthreads)
+#pragma omp parallel default(none) firstprivate(len, dest) private(i, j) \
+shared(metric_info, start, end) num_threads(metric_info->nthreads)
     {
-        int i;
         const int thread_id = omp_get_thread_num();
         shmem_ctx_t ctx;
         shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx);
@@ -61,9 +57,11 @@ void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info)
         for (i = 0; i < metric_info->warmup; i++) {
             for(j = 0; j < metric_info->window_size; j++) {
 #ifdef USE_NONBLOCKING_API
-                shmem_ctx_putmem_nbi(ctx, dst + thread_id * len, src + thread_id * len, len, dest);
+                shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, 
+                                     metric_info->src + thread_id * len, len, dest);
 #else
-                shmem_ctx_putmem(ctx, dst + thread_id * len, src + thread_id * len, len, dest);
+                shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, 
+                                 metric_info->src + thread_id * len, len, dest);
 #endif
             }
             shmem_ctx_quiet(ctx);
@@ -72,11 +70,10 @@ void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info)
     }
 
     shmem_barrier_all();
-    if (streaming_node(*metric_info)) {
-#pragma omp parallel default(none) firstprivate(len, dest) private(j) \
-        shared(metric_info, src, dst, start, end) num_threads(metric_info->nthreads)
+    if (streaming_node(metric_info)) {
+#pragma omp parallel default(none) firstprivate(len, dest) private(i, j) \
+shared(metric_info, start, end) num_threads(metric_info->nthreads)
         {
-            int i;
             const int thread_id = omp_get_thread_num();
             shmem_ctx_t ctx;
             shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx);
@@ -89,9 +86,11 @@ void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info)
             for (i = 0; i < metric_info->trials; i++) {
                 for(j = 0; j < metric_info->window_size; j++) {
 #ifdef USE_NONBLOCKING_API
-                    shmem_ctx_putmem_nbi(ctx, dst + thread_id * len, src + thread_id * len, len, dest);
+                    shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, 
+                                         metric_info->src + thread_id * len, len, dest);
 #else
-                    shmem_ctx_putmem(ctx, dst + thread_id * len, src + thread_id * len, len, dest);
+                    shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, 
+                                     metric_info->src + thread_id * len, len, dest);
 #endif
                 }
                 shmem_ctx_quiet(ctx);
@@ -99,10 +98,9 @@ void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info)
             shmem_ctx_destroy(ctx);
         }
     } else {
-#pragma omp parallel default(none) firstprivate(len, dest) private(j) \
-        shared(metric_info, src, dst, start, end) num_threads(metric_info->nthreads)
+#pragma omp parallel default(none) firstprivate(len, dest) private(i, j) \
+shared(metric_info, start, end) num_threads(metric_info->nthreads)
         {
-            int i;
             const int thread_id = omp_get_thread_num();
             shmem_ctx_t ctx;
             shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx);
@@ -110,9 +108,11 @@ void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info)
             for (i = 0; i < metric_info->trials; i++) {
                 for(j = 0; j < metric_info->window_size; j++) {
 #ifdef USE_NONBLOCKING_API
-                    shmem_ctx_putmem_nbi(ctx, dst + thread_id * len, src + thread_id * len, len, dest);
+                    shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, 
+                                         metric_info->src + thread_id * len, len, dest);
 #else
-                    shmem_ctx_putmem(ctx, dst + thread_id * len, src + thread_id * len, len, dest);
+                    shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, 
+                                     metric_info->src + thread_id * len, len, dest);
 #endif
                 }
                 shmem_ctx_quiet(ctx);
@@ -122,14 +122,10 @@ void static inline bi_bw_ctx (int len, perf_metrics_t *metric_info)
     }
 
     shmem_barrier_all();
-    if (streaming_node(*metric_info)) {
+    if (streaming_node(metric_info)) {
         end = perf_shmemx_wtime();
-        calc_and_print_results(end, start, len, *metric_info);
+        calc_and_print_results(end, start, len, metric_info);
     }
 
     shmem_barrier_all();
-
-    aligned_buffer_free(src);
-    aligned_buffer_free(dst);
-
 }
diff --git a/test/performance/shmem_perf_suite/bw_common.h b/test/performance/shmem_perf_suite/bw_common.h
index 847c6ec..dfc4b32 100644
--- a/test/performance/shmem_perf_suite/bw_common.h
+++ b/test/performance/shmem_perf_suite/bw_common.h
@@ -31,431 +31,53 @@
 #include <omp.h>
 #endif
 
-#define MAX_MSG_SIZE (1<<23)
-#define START_LEN 1
-
-#define INC 2
-#define TRIALS 500
-#define WINDOW_SIZE 64
-#define WARMUP 50
-
-#define TRIALS_LARGE  100
-#define WINDOW_SIZE_LARGE 64
-#define WARMUP_LARGE  10
-#define LARGE_MESSAGE_SIZE  8192
-
-#define TARGET_SZ_MIN 8
-#define TARGET_SZ_MAX 4096
-
-/*atomics common */
-#define ATOMICS_N_DTs 3
-/*note: ignoring cswap/swap for now in verification */
-#define ATOMICS_N_OPs 4
-/*PE 0 is printing its latency, thus have it not be the INCAST PE*/
-#define INCAST_PE 1
-
-typedef enum {
-    UNI_DIR,
-    BI_DIR,
-} bw_type;
-
-typedef enum {
-    STYLE_PUT,
-    STYLE_GET,
-    STYLE_RMA,
-    STYLE_ATOMIC
-} bw_style;
-
-typedef enum {
-    FIRST_HALF,
-    SECOND_HALF,
-    FULL_SET
-} red_PE_set;
-
-typedef enum {
-    COMM_PAIRWISE,
-    COMM_INCAST
-} comm_style;
-
-typedef enum {
-    B,
-    KB,
-    MB
-} bw_units;
-
-typedef struct perf_metrics {
-    unsigned long int start_len, max_len;
-    unsigned long int size_inc, trials;
-    unsigned long int window_size, warmup;
-    int validate;
-    int target_data;
-    int my_node, num_pes, sztarget, szinitiator, midpt;
-    bw_units unit;
-    char *src, *dest;
-    const char *bw_type;
-    bw_type type;
-    comm_style cstyle;
-    bw_style bwstyle;
-    int thread_safety;
-    int nthreads;
-    int individual_report;
-} perf_metrics_t;
-
-long red_psync[SHMEM_REDUCE_SYNC_SIZE];
-long bar_psync[SHMEM_BARRIER_SYNC_SIZE];
+static const char * dt_names [] = { "uint", "ulong", "ulonglong" };
 
 /*default settings if no input is provided */
-void static data_set_defaults(perf_metrics_t * data) {
-    data->start_len = START_LEN;
-    data->max_len = MAX_MSG_SIZE;
-    data->size_inc = INC;
-    data->trials = TRIALS;
-    data->window_size = WINDOW_SIZE; /*back-to-back msg stream*/
-    data->warmup = WARMUP; /*number of initial iterations to skip*/
-    data->unit = MB;
-    data->validate = false;
-    data->target_data = false;
-    data->my_node = -1;
-    data->num_pes = -1;
-    data->midpt = -1;
-    data->sztarget = -1;
-    data->szinitiator = -1;
-    data->src = NULL;
-    data->dest = NULL;
-    data->cstyle = COMM_PAIRWISE;
-    data->bwstyle = STYLE_RMA;
-    data->thread_safety = SHMEM_THREAD_SINGLE;
-    data->nthreads = 1;
-    data->individual_report = -1;
+static 
+void init_metrics(perf_metrics_t *metric_info) {
+    metric_info->t_type = BW;
+    set_metric_defaults(metric_info);
+
+    metric_info->unit = MB;
+    metric_info->target_data = false;
+    metric_info->cstyle = COMM_PAIRWISE;
+    metric_info->opstyle = STYLE_RMA;
 }
 
-static int error_checking_init_target_usage(perf_metrics_t *metric_info) {
-    int error = false;
-    assert(metric_info->midpt > 0);
-
-    if(metric_info->sztarget != -1 && metric_info->szinitiator != -1)
-        error = true; /* can't use them together  */
-
-    if(metric_info->sztarget != -1) {
-         if(metric_info->sztarget < 1 || metric_info->sztarget > metric_info->midpt
-            || !metric_info->target_data)
-            error = true;
-    } else {
-        metric_info->sztarget = metric_info->midpt;
-    }
-
-    if(metric_info->szinitiator != -1) {
-        if(metric_info->szinitiator < 1 || metric_info->szinitiator > metric_info->midpt
-            || !metric_info->target_data)
-            error = true;
-    } else {
-        metric_info->szinitiator = metric_info->midpt;
-    }
-
-    if(error) {
-        fprintf(stderr, "invalid usage of command line arg -r/-l, use --help for info\n");
-        return -1;
-    }
-    return 0;
-}
-
-/* must use shmem_init beforehand */
-static int data_runtime_update(perf_metrics_t *data) {
-    data->my_node = shmem_my_pe();
-    data->num_pes = shmem_n_pes();
-    assert(data->num_pes);
-    data->midpt = data->num_pes/2;
-    return error_checking_init_target_usage(data);
-}
-
-static const char * dt_names [] = { "int", "long", "longlong" };
-
-void static bi_dir_data_init(perf_metrics_t * data) {
-    data->bw_type = "Bi-dir";
-    data->type = BI_DIR;
-}
-
-void static uni_dir_data_init(perf_metrics_t * data) {
-    data->bw_type = "Uni-dir";
-    data->type = UNI_DIR;
-}
-
-
-int static inline partner_node(perf_metrics_t my_info)
-{
-    if(my_info.num_pes == 1)
-        return 0;
-
-    if(my_info.cstyle == COMM_PAIRWISE) {
-        int pairs = my_info.midpt;
-
-        return (my_info.my_node < pairs ? (my_info.my_node + pairs) :
-            (my_info.my_node - pairs));
-    } else {
-        assert(my_info.cstyle == COMM_INCAST);
-        return INCAST_PE;
-    }
-}
-
-int static inline streaming_node(perf_metrics_t my_info)
-{
-    if(my_info.cstyle == COMM_PAIRWISE) {
-        return (my_info.my_node < my_info.szinitiator);
-    } else {
-        assert(my_info.cstyle == COMM_INCAST);
-        return true;
-    }
-}
-
-static int inline is_streaming_node(perf_metrics_t my_info, int node)
-{
-    if(my_info.cstyle == COMM_PAIRWISE) {
-        return (node < my_info.szinitiator);
+static 
+void update_bw_type(perf_metrics_t *data, int b_type) {
+    if (b_type == BI_DIR) {
+        data->bw_type_str = "Bi-dir";
+        data->b_type = BI_DIR;
     } else {
-        assert(my_info.cstyle == COMM_INCAST);
-        return true;
+        data->bw_type_str = "Uni-dir";
+        data->b_type = UNI_DIR;
     }
 }
 
-int static inline target_node(perf_metrics_t my_info)
-{
-    return (my_info.my_node >= my_info.midpt &&
-        (my_info.my_node < (my_info.midpt + my_info.sztarget)));
-}
-
-/* put/get bw use opposite streaming/validate nodes */
-red_PE_set static inline validation_set(perf_metrics_t my_info, int *nPEs)
-{
-    if(my_info.cstyle == COMM_PAIRWISE) {
-        if(streaming_node(my_info)) {
-            *nPEs = my_info.szinitiator;
-            return FIRST_HALF;
-        } else if(target_node(my_info)) {
-            *nPEs = my_info.sztarget;
-            return SECOND_HALF;
-        } else {
-            fprintf(stderr, "Warning: you are getting data from a node that "
-                "wasn't a part of the perf set \n ");
-	    return 0;
-        }
-    } else {
-        assert(my_info.cstyle == COMM_INCAST);
-        *nPEs = my_info.num_pes;
-        return FULL_SET;
-    }
-}
-
-/**************************************************************/
-/*                   Input Checking                           */
-/**************************************************************/
-
-static int command_line_arg_check(int argc, char *argv[],
-                                  perf_metrics_t *metric_info) {
-    int ch, error = false;
-    extern char *optarg;
-
-    /* check command line args */
-    while ((ch = getopt(argc, argv, "e:s:n:w:p:r:l:kbivtC:T:")) != EOF) {
-        switch (ch) {
-        case 's':
-            metric_info->start_len = strtoul(optarg, (char **)NULL, 0);
-            if ( metric_info->start_len < 1 ) metric_info->start_len = 1;
-            if(!is_pow_of_2(metric_info->start_len)) {
-                fprintf(stderr, "Error: start_length must be a power of two\n");
-                error = true;
-            }
-            if (metric_info->start_len > INT_MAX) {
-                fprintf(stderr, "Error: start_length is out of integer range\n");
-                error = true;
-            }
-            break;
-        case 'e':
-            metric_info->max_len = strtoul(optarg, (char **)NULL, 0);
-            if(!is_pow_of_2(metric_info->max_len)) {
-                fprintf(stderr, "Error: end_length must be a power of two\n");
-                error = true;
-            }
-            if(metric_info->max_len < metric_info->start_len) {
-                fprintf(stderr, "Error: end_length (%ld) must be >= "
-                        "start_length (%ld)\n", metric_info->max_len,
-                        metric_info->start_len);
-                error = true;
-            }
-            if (metric_info->max_len > INT_MAX) {
-                fprintf(stderr, "Error: end_length is out of integer range\n");
-                error = true;
-            }
-            break;
-        case 'n':
-            metric_info->trials = strtoul(optarg, (char **)NULL, 0);
-            if(metric_info->trials < (metric_info->warmup*2)) {
-                fprintf(stderr, "Error: trials (%ld) must be >= 2*warmup "
-                        "(%ld)\n", metric_info->trials, metric_info->warmup*2);
-                error = true;
-            }
-            break;
-        case 'p':
-            metric_info->warmup = strtoul(optarg, (char **)NULL, 0);
-            if(metric_info->warmup > (metric_info->trials/2)) {
-                fprintf(stderr, "Error: warmup (%ld) must be <= trials/2 "
-                        "(%ld)\n", metric_info->warmup, metric_info->trials/2);
-                error = true;
-            }
-            break;
-        case 'k':
-            metric_info->unit = KB;
-            break;
-        case 'b':
-            metric_info->unit = B;
-            break;
-        case 'v':
-            metric_info->validate = true;
-            if(metric_info->target_data) error = true;
-            break;
-        case 'w':
-            metric_info->window_size = strtoul(optarg, (char **)NULL, 0);
-            if(metric_info->target_data) error = true;
-            break;
-        case 't':
-            metric_info->target_data = true;
-            metric_info->window_size = 1;
-            if(metric_info->validate) error = true;
-            break;
-        case 'r':
-            metric_info->sztarget = strtoul(optarg, (char **)NULL, 0);
-            break;
-        case 'l':
-            metric_info->szinitiator = strtoul(optarg, (char **)NULL, 0);
-            break;
-        case 'C':
-            if (strcmp(optarg, "SINGLE") == 0) {
-                metric_info->thread_safety = SHMEM_THREAD_SINGLE;
-            } else if (strcmp(optarg, "FUNNELED") == 0) {
-                metric_info->thread_safety = SHMEM_THREAD_FUNNELED;
-            } else if (strcmp(optarg, "SERIALIZED") == 0) {
-                metric_info->thread_safety = SHMEM_THREAD_SERIALIZED;
-            } else if (strcmp(optarg, "MULTIPLE") == 0) {
-                metric_info->thread_safety = SHMEM_THREAD_MULTIPLE;
-            } else {
-                fprintf(stderr, "Invalid threading level: \"%s\"\n", optarg);
-                error = true;
-            }
-            break;
-        case 'T':
-            metric_info->nthreads = atoi(optarg);
-            break;
-        case 'i':
-            metric_info->individual_report = 1;
-            break;
-        default:
-            error = true;
-            break;
-        }
-    }
-
-    /* filling in 8/4KB chunks into array alloc'd to max_len */
-    if(metric_info->target_data) {
-        metric_info->start_len = TARGET_SZ_MIN;
-        if((metric_info->max_len <
-            ((metric_info->trials + metric_info->warmup) * TARGET_SZ_MIN)) ||
-            (metric_info->max_len <
-            ((metric_info->trials + metric_info->warmup) * TARGET_SZ_MAX))) {
-                error = true;
-            }
-    }
-
-    if (error) {
-        if (metric_info->my_node == 0) {
-            fprintf(stderr, "Usage: \n[-s start_length] [-e end_length] "
-                    ": lengths should be a power of two \n"
-                    "[-n trials (must be greater than 2*warmup (default: x => 100))] \n"
-                    "[-p warm-up (see trials for value restriction)] \n"
-                    "[-w window size - iterations between completion, cannot use with -t] \n"
-                    "[-k (kilobytes/second)] [-b (bytes/second)] \n"
-                    "[-v (validate data stream)] \n"
-                    "[-i (turn on individual bandwidth reporting)] \n"
-                    "[-t output data for target side (default is initiator,"
-                    " only use with put_bw),\n cannot be used in conjunction "
-                    "with validate, special sizes used, \ntrials"
-                    " + warmup * sizes (8/4KB) <= max length \n"
-                    "[-r number of nodes at target, use only with -t] \n"
-                    "[-l number of nodes at initiator, use only with -t, "
-                    "l/r cannot be used together] \n"
-                    "[-C thread-safety-config: SINGLE, FUNNELED, SERIALIZED, or MULTIPLE] \n"
-                    "[-T num-threads] \n");
-        }
-        return -1;
-    }
-    return 0;
-}
-
-static inline int only_even_PEs_check(int my_node, int num_pes) {
-    if (num_pes % 2 != 0) {
-        if (my_node == 0) {
-            fprintf(stderr, "Only even number of nodes can be used\n");
-        }
-        return 77;
-    } else
-        return 0;
-}
-
 /**************************************************************/
 /*                   Result Printing and Calc                 */
 /**************************************************************/
 
-static const char *thread_safety_str(perf_metrics_t *metric_info) {
-    if (metric_info->thread_safety == SHMEM_THREAD_SINGLE) {
-        return "SINGLE";
-    } else if (metric_info->thread_safety == SHMEM_THREAD_FUNNELED) {
-        return "FUNNELED";
-    } else if (metric_info->thread_safety == SHMEM_THREAD_SERIALIZED) {
-        return "SERIALIZED";
-    } else if (metric_info->thread_safety == SHMEM_THREAD_MULTIPLE) {
-        return "MULTIPLE";
-    } else {
-        fprintf(stderr, "Unexpected thread safety value: %d. Setting it to SINGLE\n", metric_info->thread_safety);
-        metric_info->thread_safety = SHMEM_THREAD_SINGLE;
-        return "SINGLE";
-    }
-}
+static 
+void print_atomic_header(perf_metrics_t * const metric_info) {
+    print_header(metric_info);
+    printf("\n\nBandwidth test type:    %10s\n", metric_info->bw_type_str);
 
-static void inline thread_safety_validation_check(perf_metrics_t *metric_info) {
-    if (metric_info->nthreads == 1)
-        return;
-    else {
-        if (metric_info->thread_safety != SHMEM_THREAD_MULTIPLE) {
-            if(metric_info->my_node == 0) {
-                fprintf(stderr, "Warning: argument \"-T %d\" is ignored because of the thread level specified." 
-                            " Switching to single thread with thread safety %s\n", metric_info->nthreads, 
-                            thread_safety_str(metric_info));
-            }
-            metric_info->nthreads = 1;
-        }
-        return;
-    }
-}
-
-void static print_atomic_results_header(perf_metrics_t metric_info) {
-    printf("\nSandia OpenSHMEM Performance Suite\n");
-    printf("==================================\n");
-    printf("Total Number of PEs:    %10d\n", metric_info.num_pes);
-    printf("Iteration count:        %10lu\n", metric_info.trials);
-    printf("Window size:            %10lu\n", metric_info.window_size);
-    printf("Bandwidth test type:    %10s\n", metric_info.bw_type);
-
-    if (metric_info.cstyle == COMM_INCAST) {
+    if (metric_info->cstyle == COMM_INCAST) {
         printf("Communication style:        INCAST\n");
     } else {
-        assert(metric_info.cstyle == COMM_PAIRWISE);
+        assert(metric_info->cstyle == COMM_PAIRWISE);
         printf("Communication style:      PAIRWISE\n");
     }
 
     printf("\nOperation%15sBandwidth%15sMessage Rate%15sLatency\n", 
             " ", " ", " ");
 
-    if (metric_info.unit == MB) {
+    if (metric_info->unit == MB) {
         printf("%19s in mbytes/sec"," ");
-    } else if (metric_info.unit == KB) {
+    } else if (metric_info->unit == KB) {
         printf("%19s in kbytes/sec", " ");
     } else {
         printf("%20s in bytes/sec", " ");
@@ -464,26 +86,17 @@ void static print_atomic_results_header(perf_metrics_t metric_info) {
     printf("%15s in Mops/sec%15s  in us\n", " ", " ");
 }
 
-void static print_results_header(perf_metrics_t metric_info) {
-    printf("\nSandia OpenSHMEM Performance Suite\n");
-    printf("==================================\n");
-    printf("Total Number of PEs:    %10d\n", metric_info.num_pes);
-    printf("Number of source PEs:   %10d\n", metric_info.szinitiator);
-    printf("Number of target PEs:   %10d\n", metric_info.sztarget);
-    printf("Iteration count:        %10lu\n", metric_info.trials);
-    printf("Window size:            %10lu\n", metric_info.window_size);
-    printf("Maximum message size:   %10lu\n", metric_info.max_len);
-    printf("Number of threads:      %10d\n", metric_info.nthreads);
-    printf("Thread safety:          %10s\n", thread_safety_str(&metric_info));
-    printf("Bandwidth test type:    %10s\n", metric_info.bw_type);
-
-    printf("\nMessage Size%15sBandwidth%15sMessage Rate\n", 
-           " ", " ");
+static 
+void print_bw_header(perf_metrics_t * const metric_info) {
+    print_header(metric_info);
+    printf("\n\nBandwidth test type:    %10s\n", metric_info->bw_type_str);
+
+    printf("\nMessage Size%15sBandwidth%15sMessage Rate\n", " ", " ");
 
     printf("%4sin bytes", " ");
-    if (metric_info.unit == MB) {
+    if (metric_info->unit == MB) {
         printf("%11sin mbytes/sec", " ");
-    } else if (metric_info.unit == KB) {
+    } else if (metric_info->unit == KB) {
         printf("%11sin kbytes/sec", " ");
     } else {
         printf("%12sin bytes/sec", " ");
@@ -492,83 +105,69 @@ void static print_results_header(perf_metrics_t metric_info) {
     printf("%16sin msgs/sec\n", " ");
 }
 
-void static print_data_results(double bw, double mr, perf_metrics_t data,
+static 
+void print_data_results(double bw, double mr, const perf_metrics_t * const data,
                             int len, double total_t) {
     static int atomic_type_index = 0;
 
-    if(data.target_data) {
-        if(data.my_node < data.midpt) {
-            printf("initiator:\n");
-        } else  {
-            printf("target:\n");
-        }
-    }
-
-    if (data.bwstyle == STYLE_ATOMIC) {
+    if (data->opstyle == STYLE_ATOMIC) {
         printf("%-10s", dt_names[atomic_type_index]);
         atomic_type_index = (atomic_type_index + 1) % ATOMICS_N_DTs;
     } else
         printf("%2s%10d", " ", len);
 
-    if(data.unit == KB) {
+    if(data->unit == KB) {
         bw = bw * 1.0e3;
-    } else if(data.unit == B) {
+    } else if(data->unit == B) {
         bw = bw * 1.0e6;
     }
 
-    if (data.bwstyle == STYLE_ATOMIC) {
-        printf("%13s%10.2f%15s%12.2f%12s%10.2f\n", " ", bw, " ", 
-                mr/1.0e6, " ", total_t/(data.trials * data.window_size));
+    if (data->opstyle == STYLE_ATOMIC) {
+        printf("%13s%10.2f%15s%12.2f%12s%10.2f", " ", bw, " ", 
+                mr/1.0e6, " ", total_t/(data->trials * data->window_size));
     } else
-        printf("%14s%10.2f%15s%12.2f\n", " ", bw, " ", mr);
-}
-
-
-/* reduction to collect performance results from PE set
-    then start_pe will print results --- assumes num_pes is even */
-void static inline PE_set_used_adjustments(int *nPEs, int *stride, int *start_pe,
-                                            perf_metrics_t my_info)
-{
-    red_PE_set PE_set = validation_set(my_info, nPEs);
+        printf("%14s%10.2f%15s%12.2f", " ", bw, " ", mr);
 
-    if(PE_set == FIRST_HALF || PE_set == FULL_SET) {
-        *start_pe = 0;
-    }
-    else {
-        assert(PE_set == SECOND_HALF);
-        *start_pe = my_info.midpt;
+    if(data->target_data) {
+        if(data->my_node < data->szinitiator) {
+            printf("%2sIniter", " ");
+        } else  {
+            printf("%2sTarget", " ");
+        }
     }
 
-    *stride = 0; /* back to back PEs */
+    printf("\n");
 }
 
-
-void static inline calc_and_print_results(double end_t, double start_t, int len,
-                            perf_metrics_t metric_info)
-{
+static inline 
+void calc_and_print_results(double end_t, double start_t, int len, 
+                            perf_metrics_t * const metric_info) {
     int stride = 0, start_pe = 0, nPEs = 0;
     static double pe_bw_sum, bw = 0.0; /*must be symmetric for reduction*/
     double pe_bw_avg = 0.0, pe_mr_avg = 0.0;
     int nred_elements = 1;
     static double pwrk[SHMEM_REDUCE_MIN_WRKDATA_SIZE];
-    static double pe_time_start, pe_time_end, end_time_max = 0.0, start_time_min = 0.0;
+    static double pe_time_start, pe_time_end, 
+                  end_time_max = 0.0, start_time_min = 0.0;
     double total_t = 0.0, total_t_max = 0.0;
     int multiplier = 1;
 
     PE_set_used_adjustments(&nPEs, &stride, &start_pe, metric_info);
 
     /* 2x as many messages at once for bi-directional */
-    if(metric_info.type == BI_DIR)
+    if(metric_info->b_type == BI_DIR)
         multiplier = 2;
 
     if (end_t > 0 && start_t > 0 && (end_t - start_t) > 0) {
         total_t = end_t - start_t;
 #ifdef ENABLE_OPENMP
-        bw = ((double) len * (double) multiplier / 1.0e6 * metric_info.window_size * metric_info.trials *
-                (double) metric_info.nthreads) / (total_t / 1.0e6);
+        bw = ((double) len * (double) metric_info->num_partners * (double) multiplier / 1.0e6 * 
+             metric_info->window_size * metric_info->trials *
+             (double) metric_info->nthreads) / (total_t / 1.0e6);
 #else
-        bw = ((double) len * (double) multiplier / 1.0e6 * metric_info.window_size * metric_info.trials) /
-                (total_t / 1.0e6);
+        bw = ((double) len * (double) metric_info->num_partners * (double) multiplier / 1.0e6 * 
+             metric_info->window_size * metric_info->trials) /
+             (total_t / 1.0e6);
 #endif
     } else {
         fprintf(stderr, "Incorrect time measured from bandwidth test: "
@@ -578,90 +177,102 @@ void static inline calc_and_print_results(double end_t, double start_t, int len,
     /* base case: will be overwritten by collective if num_pes > 2 */
     pe_bw_sum = bw;
 
-    if (metric_info.individual_report == 1) {
-        printf("Individual bandwith for PE %6d is %10.2f\n", 
-                metric_info.my_node, pe_bw_sum);
+    if (metric_info->individual_report == 1) {
+        if (metric_info->my_node < metric_info->midpt) {
+            printf("Individual bandwith for PE %6d (initer) is %10.2f\n", 
+                metric_info->my_node, pe_bw_sum);
+        } else {
+            printf("Individual bandwith for PE %6d (target) is %10.2f\n", 
+                metric_info->my_node, pe_bw_sum);
+        }
     }
     
     pe_time_start = start_t;
     pe_time_end = end_t;
     shmem_barrier(start_pe, stride, nPEs, bar_psync);
-    if (nPEs >= 2) {
-        shmem_double_min_to_all(&start_time_min, &pe_time_start, nred_elements,
+    if (metric_info->cstyle != COMM_INCAST) {  
+        if (nPEs >= 2) {
+            shmem_double_min_to_all(&start_time_min, &pe_time_start, nred_elements,
                                 start_pe, stride, nPEs, pwrk,
                                 red_psync);
-        shmem_barrier(start_pe, stride, nPEs, bar_psync);
-        shmem_double_max_to_all(&end_time_max, &pe_time_end, nred_elements, 
+            shmem_barrier(start_pe, stride, nPEs, bar_psync);
+            shmem_double_max_to_all(&end_time_max, &pe_time_end, nred_elements, 
                                 start_pe, stride, nPEs, pwrk,
                                 red_psync);
-    } else if (nPEs == 1) {
-        start_time_min = pe_time_start;
-        end_time_max = pe_time_end;
-    }
+        } else if (nPEs == 1) {
+            start_time_min = pe_time_start;
+            end_time_max = pe_time_end;
+        }
 
-    /* calculating bandwidth based on the highest time duration across all PEs */
-    if (end_time_max > 0 && start_time_min > 0 && 
-       (end_time_max - start_time_min) > 0) {
+        /* calculating bandwidth based on the highest time duration across all PEs */
+        if (end_time_max > 0 && start_time_min > 0 && 
+           (end_time_max - start_time_min) > 0) {
 
-        total_t_max = (end_time_max - start_time_min);
+            total_t_max = (end_time_max - start_time_min);
+            int total_transfers = MAX(metric_info->szinitiator, metric_info->sztarget);
 #ifdef ENABLE_OPENMP
-        bw = ((double) len * (double) multiplier * (double) metric_info.midpt / 1.0e6 * metric_info.window_size * 
-              metric_info.trials * (double) metric_info.nthreads) / 
-              (total_t_max / 1.0e6);
+            bw = ((double) len * (double) multiplier * (double) total_transfers / 
+                 1.0e6 * metric_info->window_size * metric_info->trials * 
+                 (double) metric_info->nthreads) / (total_t_max / 1.0e6);
 #else
-        bw = ((double) len * (double) multiplier * (double) metric_info.midpt / 1.0e6 * metric_info.window_size * 
-              metric_info.trials) / (total_t_max / 1.0e6);
+            bw = ((double) len * (double) multiplier * (double) total_transfers / 
+                 1.0e6 * metric_info->window_size * metric_info->trials) / 
+                 (total_t_max / 1.0e6);
 #endif
-    } else {
-        fprintf(stderr, "Incorrect time measured from bandwidth test: "
+        } else {
+            fprintf(stderr, "Incorrect time measured from bandwidth test: "
                         "start_min = %lf, end_max = %lf\n", 
                          start_time_min, end_time_max);
-    } 
-
-    pe_bw_sum = bw;
+        } 
+        pe_bw_sum = bw;
+    } else {
+        if (nPEs >= 2) {
+            shmem_double_sum_to_all(&pe_bw_sum, &bw, nred_elements,
+                                start_pe, stride, nPEs, pwrk,
+                                red_psync);
+        } else if (nPEs == 1) {
+            pe_bw_sum = bw;
+        }
+    }
 
     /* aggregate bw since bw op pairs are communicating simultaneously */
-    if(metric_info.my_node == start_pe) {
+    if(metric_info->my_node == start_pe) {
         pe_bw_avg = pe_bw_sum;
         pe_mr_avg = pe_bw_avg / (len / 1.0e6);
         print_data_results(pe_bw_avg, pe_mr_avg, metric_info, len, total_t);
     }
 }
 
-void static inline large_message_metric_chg(perf_metrics_t *metric_info, int len) {
-    if(len > LARGE_MESSAGE_SIZE) {
-        metric_info->window_size = WINDOW_SIZE_LARGE;
-        metric_info->trials = TRIALS_LARGE;
-        metric_info->warmup = WARMUP_LARGE;
-    }
-}
-
-static void validate_atomics(perf_metrics_t m_info) {
+static int validate_atomics(perf_metrics_t * const m_info) {
     int snode = streaming_node(m_info);
-    int * my_buf = (int *)m_info.dest;
-    bw_type tbw = m_info.type;
-    int expected_val = 0;
-    unsigned int ppe_exp_val = ((m_info.trials + m_info.warmup) * m_info.window_size
-                                * ATOMICS_N_DTs * ATOMICS_N_OPs) + m_info.my_node;
-
-    if(m_info.cstyle == COMM_INCAST) {
-        if(tbw == BI_DIR)
+    int * my_buf = (int *)m_info->dest;
+    bw_type tbw = m_info->b_type;
+    int expected_val = 0, errors = 0;
+    unsigned int ppe_exp_val = ((m_info->trials + m_info->warmup) * m_info->window_size
+                                * ATOMICS_N_DTs * ATOMICS_N_OPs) + m_info->my_node;
+
+    if (m_info->cstyle == COMM_INCAST) {
+        if (tbw == BI_DIR) 
             printf("WARNING: This use-case is not currently well defined\n");
 
-        if(m_info.my_node == 0) {
-            expected_val = ppe_exp_val * m_info.num_pes;
+        if (m_info->my_node == 0) {
+            expected_val = ppe_exp_val * m_info->num_pes;
         } else
-            expected_val = m_info.my_node;
+            expected_val = m_info->my_node;
     } else {
-        assert(m_info.cstyle == COMM_PAIRWISE);
+        assert(m_info->cstyle == COMM_PAIRWISE);
         expected_val = ppe_exp_val;
     }
 
-    if((!snode && tbw == UNI_DIR) || tbw == BI_DIR) {
-        if(my_buf[0] != expected_val)
-            printf("validation error for PE %d: %d != %d \n", m_info.my_node, my_buf[0],
+    if ((!snode && tbw == UNI_DIR) || tbw == BI_DIR) {
+        if(my_buf[0] != expected_val) {
+            printf("Validation error for PE %d: %d != %d \n", m_info->my_node, my_buf[0],
                     expected_val);
+            errors++;
+        }
     }
+
+    return errors;
 }
 
 /**************************************************************/
@@ -673,33 +284,37 @@ static void validate_atomics(perf_metrics_t m_info) {
  * NOTE: post function validation assumptions, data isn't flushed pre/post */
 extern void bi_dir_bw(int len, perf_metrics_t *metric_info);
 
-void static inline bi_dir_bw_test_and_output(perf_metrics_t metric_info) {
+static inline 
+void bi_dir_bw_test_and_output(perf_metrics_t * const metric_info) {
     int partner_pe = partner_node(metric_info);
     unsigned long int len;
 
-    if(metric_info.my_node == 0) {
-        if (metric_info.bwstyle == STYLE_ATOMIC)
-            print_atomic_results_header(metric_info);
+    if(metric_info->my_node == 0) {
+        if (metric_info->opstyle == STYLE_ATOMIC)
+            print_atomic_header(metric_info);
         else
-            print_results_header(metric_info);
+            print_bw_header(metric_info);
     }
 
-    for (len = metric_info.start_len; len <= metric_info.max_len;
-        len *= metric_info.size_inc) {
+    for (len = metric_info->start_len; len <= metric_info->max_len;
+        len *= metric_info->size_inc) {
 
-        large_message_metric_chg(&metric_info, len);
+        large_message_metric_chg(metric_info, len);
 
-        bi_dir_bw(len, &metric_info);
+        bi_dir_bw(len, metric_info);
     }
 
     shmem_barrier_all();
 
-    if(metric_info.validate) {
-        if(metric_info.bwstyle != STYLE_ATOMIC) {
-            validate_recv(metric_info.dest, metric_info.max_len, partner_pe);
+    if (metric_info->validate) {
+        int errors = -1;
+        if (metric_info->opstyle != STYLE_ATOMIC) {
+            errors = validate_recv(metric_info->dest, metric_info->max_len, partner_pe);
         } else {
-            validate_atomics(metric_info);
+            errors = validate_atomics(metric_info);
         }
+        if (errors >= 0) 
+            printf("Validation complete (%d errors)\n", errors);
     }
 }
 
@@ -707,39 +322,43 @@ void static inline bi_dir_bw_test_and_output(perf_metrics_t metric_info) {
 /*                   UNI-Directional BW                       */
 /**************************************************************/
 
-/*have one symmetric char array metric_info->buf of max_len to use for
+/* have one symmetric char array metric_info->buf of max_len to use for
  * calculation initalized with my_node number
  * NOTE: post function validation assumptions, data isn't flushed pre/post */
 extern void uni_dir_bw(int len, perf_metrics_t *metric_info);
 
-void static inline uni_dir_bw_test_and_output(perf_metrics_t metric_info) {
+static inline 
+void uni_dir_bw_test_and_output(perf_metrics_t * const metric_info) {
     int partner_pe = partner_node(metric_info);
     unsigned long int len = 0;
 
-    if(metric_info.my_node == 0) {
-        if (metric_info.bwstyle == STYLE_ATOMIC)
-            print_atomic_results_header(metric_info);
+    if(metric_info->my_node == 0) {
+        if (metric_info->opstyle == STYLE_ATOMIC)
+            print_atomic_header(metric_info);
         else
-            print_results_header(metric_info);
+            print_bw_header(metric_info);
     }
 
-    for (len = metric_info.start_len; len <= metric_info.max_len;
-        len *= metric_info.size_inc) {
+    for (len = metric_info->start_len; len <= metric_info->max_len;
+        len *= metric_info->size_inc) {
 
-        large_message_metric_chg(&metric_info, len);
+        large_message_metric_chg(metric_info, len);
 
-        uni_dir_bw(len, &metric_info);
+        uni_dir_bw(len, metric_info);
     }
 
     shmem_barrier_all();
 
-    if(metric_info.validate) {
-        if((streaming_node(metric_info) && metric_info.bwstyle == STYLE_GET) ||
-            (target_node(metric_info) && metric_info.bwstyle == STYLE_PUT)) {
-            validate_recv(metric_info.dest, metric_info.max_len, partner_pe);
-        } else if(metric_info.bwstyle == STYLE_ATOMIC) {
-            validate_atomics(metric_info);
+    if (metric_info->validate) {
+        int errors = -1;
+        if ((streaming_node(metric_info) && metric_info->opstyle == STYLE_GET) ||
+            (target_node(metric_info) && metric_info->opstyle == STYLE_PUT)) {
+            errors = validate_recv(metric_info->dest, metric_info->max_len, partner_pe);
+        } else if (metric_info->opstyle == STYLE_ATOMIC) {
+            errors = validate_atomics(metric_info);
         }
+        if (errors >= 0) 
+            printf("Validation complete (%d errors)\n", errors);
     }
 }
 
@@ -747,187 +366,132 @@ void static inline uni_dir_bw_test_and_output(perf_metrics_t metric_info) {
 /*                   INIT and teardown of resources           */
 /**************************************************************/
 
-/*create and init (with my_PE_num) two symmetric arrays on the heap */
-static inline int bw_init_data_stream(perf_metrics_t *metric_info,
-                                            int argc, char *argv[]) {
+/* create and init (with my_PE_num) two symmetric arrays on the heap */
+static inline 
+int bw_init_data_stream(perf_metrics_t * const metric_info,
+                        int argc, char *argv[]) {
 
-    int i = 0;
-    data_set_defaults(metric_info);
+    init_metrics(metric_info);
     int ret = command_line_arg_check(argc, argv, metric_info);
-    if (ret != 0) {
-        return -1;
-    }
 
 #ifndef VERSION_1_0
+#if defined(ENABLE_THREADS)
     int tl;
     shmem_init_thread(metric_info->thread_safety, &tl);
     if(tl != metric_info->thread_safety) {
         fprintf(stderr,"Could not initialize with requested thread "
                 "level %d: got %d\n", metric_info->thread_safety, tl);
-        return -2;
+        return -1;
     }
+#else
+    shmem_init();
+#endif
 #else
     start_pes(0);
 #endif
 
-    if (data_runtime_update(metric_info) == -1)
-        return -2;	
-    thread_safety_validation_check(metric_info);
-    metric_info->sztarget = metric_info->midpt;
-    metric_info->szinitiator = metric_info->midpt;
+    update_metrics(metric_info);
 
-    for(i = 0; i < SHMEM_REDUCE_SYNC_SIZE; i++)
-        red_psync[i] = SHMEM_SYNC_VALUE;
+    if (ret) {
+        if (metric_info->my_node == 0) {
+            print_usage(ret);
+        }
+        return -1;
+    } else {
+        if (metric_info->num_pes < 2) {
+            fprintf(stderr, "This test requires at least two processes.\n");
+            print_usage(1);
+            return -1;
+        }
+    }
 
-    for(i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++)
-        bar_psync[i] = SHMEM_SYNC_VALUE;
+    if (error_checking_init_target_usage(metric_info) == -1)
+        return -1;
+#if defined(ENABLE_THREADS)
+    thread_safety_validation_check(metric_info);
+#endif
+    init_psync_arrays();
 
-    if (only_even_PEs_check(metric_info->my_node, metric_info->num_pes) != 0) {
-        return -2;
+    if(only_even_PEs_check(metric_info->my_node, metric_info->num_pes) != 0) {
+        return -1;
     }
 
-    metric_info->src = aligned_buffer_alloc(metric_info->max_len);
-    init_array(metric_info->src, metric_info->max_len, metric_info->my_node);
+    metric_info->src = aligned_buffer_alloc(metric_info->max_len * metric_info->nthreads);
+    init_array(metric_info->src, metric_info->max_len * metric_info->nthreads, metric_info->my_node);
 
-    metric_info->dest = aligned_buffer_alloc(metric_info->max_len);
-    init_array(metric_info->dest, metric_info->max_len, metric_info->my_node);
+    metric_info->dest = aligned_buffer_alloc(metric_info->max_len * metric_info->nthreads);
+    init_array(metric_info->dest, metric_info->max_len * metric_info->nthreads, metric_info->my_node);
 
     return 0;
 }
 
 
-static inline int bi_dir_init(perf_metrics_t *metric_info, int argc,
-                                char *argv[]) {
+static inline 
+int bi_dir_init(perf_metrics_t * const metric_info, int argc,
+                char *argv[], op_style opstyle) {
     int ret = bw_init_data_stream(metric_info, argc, argv);
     if (ret == 0) {
-        bi_dir_data_init(metric_info);
+        metric_info->opstyle = opstyle;
+        update_bw_type(metric_info, BI_DIR);
         return 0;
     } else 
         return ret;
 }
 
-static inline int uni_dir_init(perf_metrics_t *metric_info, int argc,
-                                char *argv[], bw_style bwstyl) {
+static inline 
+int uni_dir_init(perf_metrics_t * const metric_info, int argc,
+                 char *argv[], op_style opstyle) {
     int ret = bw_init_data_stream(metric_info, argc, argv);
     if (ret == 0) {
         /* uni-dir validate needs to know if its a put or get */
-        metric_info->bwstyle = bwstyl;
-        uni_dir_data_init(metric_info);
+        metric_info->opstyle = opstyle;
+        update_bw_type(metric_info, UNI_DIR);
         return 0;
     } else 
         return ret;
 }
 
-void static inline bw_data_free(perf_metrics_t *metric_info) {
+static inline 
+void bw_data_free(const perf_metrics_t * const metric_info) {
     shmem_barrier_all();
 
     aligned_buffer_free(metric_info->src);
     aligned_buffer_free(metric_info->dest);
 }
 
-static void inline bw_finalize(void) {
+static inline 
+void bw_finalize(void) {
 #ifndef VERSION_1_0
     shmem_finalize();
 #endif
 }
 
-void static inline bi_dir_bw_main(int argc, char *argv[]) {
+static inline 
+void bi_dir_bw_main(int argc, char *argv[], op_style opstyle) {
 
     perf_metrics_t metric_info;
 
-    int ret = bi_dir_init(&metric_info, argc, argv);
+    int ret = bi_dir_init(&metric_info, argc, argv, opstyle);
 
     if (ret == 0) {
-        bi_dir_bw_test_and_output(metric_info);
+        bi_dir_bw_test_and_output(&metric_info);
         bw_data_free(&metric_info);
     }
 
-    if (ret != -1)
-        bw_finalize(); 
-} /*main() */
+    bw_finalize(); 
+} 
 
-void static inline uni_dir_bw_main(int argc, char *argv[], bw_style bwstyl) {
+static inline 
+void uni_dir_bw_main(int argc, char *argv[], op_style opstyle) {
 
     perf_metrics_t metric_info;
 
-    int ret = uni_dir_init(&metric_info, argc, argv, bwstyl);
+    int ret = uni_dir_init(&metric_info, argc, argv, opstyle);
 
     if (ret == 0) {
-        uni_dir_bw_test_and_output(metric_info);
+        uni_dir_bw_test_and_output(&metric_info);
         bw_data_free(&metric_info);
     }
 
-    if (ret != -1)
-        bw_finalize();
-} /*main() */
-
-static inline int check_hostname_validation(perf_metrics_t my_info) {
-
-    int hostname_status = -1;
-
-    /* hostname_size should be a length divisible by 4 */
-    int hostname_size = (MAX_HOSTNAME_LEN % 4 == 0) ? MAX_HOSTNAME_LEN : 
-                         MAX_HOSTNAME_LEN + (4 - MAX_HOSTNAME_LEN % 4);
-    int i, errors = 0;
-
-    /* pSync for fcollect of hostnames */
-    static long pSync_collect[SHMEM_COLLECT_SYNC_SIZE];
-    for (i = 0; i < SHMEM_COLLECT_SYNC_SIZE; i++)
-        pSync_collect[i] = SHMEM_SYNC_VALUE;
-
-    char *hostname = (char *) shmem_malloc (hostname_size * sizeof(char));
-    char *dest = (char *) shmem_malloc (my_info.num_pes * hostname_size * sizeof(char));
-
-    hostname_status = gethostname(hostname, hostname_size);
-    if (hostname_status != 0) {
-        fprintf(stderr, "gethostname failed (%d)\n", hostname_status);
-        return -1;
-    }
-    shmem_barrier_all();
-
-    /* nelems needs to be updated based on 32-bit API */
-    shmem_fcollect32(dest, hostname, hostname_size/4, 0, 0, my_info.num_pes, pSync_collect);
-
-    char *snode_name = NULL;
-    char *tnode_name = NULL;
-    for (i = 0; i < my_info.num_pes; i++) {
-        char *curr_name = &dest[i * hostname_size];
-
-        if (is_streaming_node(my_info, i)) {
-            if (snode_name == NULL) {
-                snode_name = curr_name;
-            }
-
-            if (strncmp(snode_name, curr_name, hostname_size) != 0) {
-                fprintf(stderr, "PE %d on %s is a streaming node " 
-                                "but not placed on %s\n", i, curr_name, snode_name);
-                errors++;
-            }
-        } else {
-            if (tnode_name == NULL) {
-                tnode_name = curr_name;
-            }
-
-            if (strncmp(tnode_name, curr_name, hostname_size) != 0) {
-                fprintf(stderr, "PE %d on %s is a target node "
-                                "but not placed on %s\n", i, curr_name, tnode_name);
-                errors++;
-            }
-        }
-    }
-
-    if (snode_name == NULL || tnode_name == NULL) {
-        fprintf(stderr, "Error: no streaming or target node\n");
-        return -1;
-    }
-
-    if (strncmp(snode_name, tnode_name, hostname_size) == 0) {
-        fprintf(stderr, "Warning: senders and receivers are running on the "
-                        "same node %s\n", snode_name);
-    }
-
-    shmem_free(dest);
-    shmem_free(hostname);
-
-    return errors;
-}
+    bw_finalize();
+} 
diff --git a/test/performance/shmem_perf_suite/common.h b/test/performance/shmem_perf_suite/common.h
index c48f66e..2f0b556 100644
--- a/test/performance/shmem_perf_suite/common.h
+++ b/test/performance/shmem_perf_suite/common.h
@@ -35,18 +35,186 @@
 #include <sys/time.h>
 #include <time.h>
 #include <stdint.h>
-
 #include <limits.h>
 #include <sys/param.h>
 
+/* hostname length to check for hostname errors */
 #ifdef MAXHOSTNAMELEN
 #define MAX_HOSTNAME_LEN MAXHOSTNAMELEN
 #else
 #define MAX_HOSTNAME_LEN HOST_NAME_MAX
 #endif
 
+#ifndef MAX
+#define MAX(A,B)   (((A)>(B)) ? (A) : (B))
+#endif
+
 #define ONE 1
 
+/* constants for experiments */
+#define MAX_MSG_SIZE (1<<23)
+#define START_LEN 1
+#define INC 2
+#define TRIALS 1000
+#define WINDOW_SIZE 64
+#define WARMUP 100
+
+/* constants for experiments with large message sizes */
+#define TRIALS_LARGE  100
+#define WINDOW_SIZE_LARGE 64
+#define WARMUP_LARGE  10
+#define LARGE_MESSAGE_SIZE  65536
+
+#define TARGET_SZ_MIN 8
+#define TARGET_SZ_MAX 4096
+
+/* atomics common */
+#define ATOMICS_N_DTs 3
+/* note: ignoring cswap/swap for now in verification */
+#define ATOMICS_N_OPs 4
+/* PE 0 is printing its latency, thus have it not be the INCAST PE*/
+#define INCAST_PE 1
+
+/* perf metrics structures */
+typedef enum {
+    LAT,
+    BW
+} test_type;
+
+typedef enum {
+    UNI_DIR,
+    BI_DIR
+} bw_type;
+
+typedef enum {
+    STYLE_PUT,
+    STYLE_GET,
+    STYLE_RMA,
+    STYLE_ATOMIC
+} op_style;
+
+typedef enum {
+    FIRST_HALF,
+    SECOND_HALF,
+    FULL_SET
+} red_PE_set;
+
+typedef enum {
+    COMM_PAIRWISE,
+    COMM_INCAST
+} comm_style;
+
+typedef enum {
+    B,
+    KB,
+    MB
+} bw_units;
+
+typedef enum {
+    OP_FETCH,
+    OP_SET,
+    OP_CSWAP,
+    OP_SWAP,
+    OP_FINC,
+    OP_INC,
+    OP_FADD,
+    OP_ADD,
+    OP_FAND,
+    OP_AND,
+    OP_FOR,
+    OP_OR,
+    OP_FXOR,
+    OP_XOR,
+    SIZE_OF_OP
+} atomic_op_type;
+
+
+typedef struct perf_metrics {
+    /* common parameters */
+    test_type t_type;
+    unsigned long int start_len, max_len;
+    unsigned long int size_inc, trials;
+    unsigned long int window_size, warmup;
+    int my_node, num_pes, sztarget, szinitiator, midpt;
+    char *src, *dest;
+    op_style opstyle;
+
+    /* parameters for threaded tests */
+    int nthreads;
+    int thread_safety;
+
+    /* parameters specific to bandwidth tests */
+    bw_units unit;
+    const char *bw_type_str;
+    bw_type b_type;
+    comm_style cstyle;
+    int target_data;
+    int num_partners;
+
+    /* parameters specific to latency tests */
+    long *target;
+
+    /* misc parameters */
+    int validate;
+    int individual_report;
+} perf_metrics_t;
+
+/* psync arrays used in metric calculation */
+long red_psync[SHMEM_REDUCE_SYNC_SIZE];
+long bar_psync[SHMEM_BARRIER_SYNC_SIZE];
+
+/* default settings with no input provided */
+static inline 
+void set_metric_defaults(perf_metrics_t *metric_info) {
+    metric_info->start_len = START_LEN;
+    metric_info->max_len = MAX_MSG_SIZE;
+    metric_info->size_inc = INC;
+    metric_info->trials = TRIALS;
+    metric_info->window_size = WINDOW_SIZE; /*back-to-back msg stream*/
+    metric_info->warmup = WARMUP; /*number of initial iterations to skip*/
+
+    metric_info->my_node = -1;
+    metric_info->num_pes = -1;
+    metric_info->midpt = -1;
+    metric_info->sztarget = -1;
+    metric_info->szinitiator = -1;
+
+    metric_info->src = NULL;
+    metric_info->dest = NULL;
+
+    metric_info->num_partners = 1;
+
+#if defined(ENABLE_THREADS)
+    metric_info->thread_safety = SHMEM_THREAD_SINGLE;
+#else
+    metric_info->thread_safety = 0;
+#endif
+    metric_info->nthreads = 1;
+
+    metric_info->validate = false;
+    metric_info->individual_report = -1;
+}
+
+/* update metrics after shmem init */
+static inline 
+void update_metrics(perf_metrics_t *metric_info) {
+    metric_info->my_node = shmem_my_pe();
+    metric_info->num_pes = shmem_n_pes();
+    assert(metric_info->num_pes);
+    metric_info->midpt = metric_info->num_pes / 2;
+}
+
+/* init psync arrays */
+static inline
+void init_psync_arrays(void) {
+    int i;
+    for(i = 0; i < SHMEM_REDUCE_SYNC_SIZE; i++)
+        red_psync[i] = SHMEM_SYNC_VALUE;
+
+    for(i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++)
+        bar_psync[i] = SHMEM_SYNC_VALUE;
+}
+
 /* return microseconds */
 double perf_shmemx_wtime(void);
 
@@ -107,7 +275,6 @@ static char * aligned_buffer_alloc(int len)
 
 static void aligned_buffer_free(char * ptr_aligned)
 {
-
     char * ptr_org;
     uintptr_t temp_p;
     size_t ptr_size = sizeof(uintptr_t);
@@ -123,47 +290,506 @@ static void aligned_buffer_free(char * ptr_aligned)
 #endif
 }
 
-int static inline is_divisible_by_4(int num)
-{
-    assert(num >= 0);
-    assert(sizeof(int) == 4);
-    return (!(num & 0x00000003));
+static inline 
+int is_divisible_by_4(int num) {
+    if (num < 0)
+        shmem_global_exit(1);
+    return (num % 4 == 0);
 }
 
 /*to be a power of 2 must only have 1 set bit*/
-int static inline is_pow_of_2(unsigned int num)
-{
+static inline 
+int is_pow_of_2(unsigned int num) {
     /*move first set bit all the way to right*/
     while(num && !((num >>=1 ) & 1));
 
     /*it will be 1 if its the only set bit*/
-    return ((num == 1 || num == 0)? true : false);
+    return ((num == 1 || num == 0) ? true : false);
 }
 
-void static init_array(char * const buf, int len, int my_pe_num)
-{
+static 
+void init_array(const char *buf, int len, int my_pe_num) {
     int i = 0;
     int array_size = len / sizeof(int);
-    int * ibuf = (int *)buf;
+    int *ibuf = (int *)buf;
 
     assert(is_divisible_by_4(len));
 
     for(i = 0; i < array_size; i++)
         ibuf[i] = my_pe_num;
-
 }
 
-void static inline validate_recv(char * buf, int len, int partner_pe)
-{
+static inline 
+int validate_recv(char *buf, int len, int partner_pe) {
     int i = 0;
     int array_size = len / sizeof(int);
-    int * ibuf = (int *)buf;
+    int *ibuf = (int *)buf;
+    int errors = 0;
 
     assert(is_divisible_by_4(len));
 
-    for(i = 0; i < array_size; i++) {
-        if(ibuf[i] != partner_pe)
-            printf("validation error at index %d: %d != %d \n", i, ibuf[i],
-                    partner_pe);
+    for (i = 0; i < array_size; i++) {
+        if (ibuf[i] != partner_pe) {
+            errors++;
+        }
+    }
+    if (errors > 0) {
+        printf("Validation error: stored_value = %d, expected value = %d\n", 
+                                  ibuf[0], partner_pe);
     }
+    return errors;
+}
+
+/**************************************************************/
+/*                   Input Checking                           */
+/**************************************************************/
+
+static
+int command_line_arg_check(int argc, char *argv[], perf_metrics_t * const metric_info) {
+    int ch, errors = 0;
+    extern char *optarg;
+
+    /* check command line args */
+    while ((ch = getopt(argc, argv, "e:s:n:w:p:r:l:kbivtC:T:")) != EOF) {
+        switch (ch) {
+        case 's':
+            metric_info->start_len = strtoul(optarg, (char **)NULL, 0);
+            if ( metric_info->start_len < 1 ) metric_info->start_len = 1;
+            if(!is_pow_of_2(metric_info->start_len)) {
+                fprintf(stderr, "Error: start_length must be a power of two\n");
+                errors++;
+            }
+            if (metric_info->start_len > INT_MAX) {
+                fprintf(stderr, "Error: start_length is out of integer range\n");
+                errors++;
+            }
+            break;
+        case 'e':
+            metric_info->max_len = strtoul(optarg, (char **)NULL, 0);
+            if(!is_pow_of_2(metric_info->max_len)) {
+                fprintf(stderr, "Error: end_length must be a power of two\n");
+                errors++;
+            }
+            if(metric_info->max_len < metric_info->start_len) {
+                fprintf(stderr, "Error: end_length (%ld) must be >= "
+                        "start_length (%ld)\n", metric_info->max_len,
+                        metric_info->start_len);
+                errors++;
+            }
+            if (metric_info->max_len > INT_MAX) {
+                fprintf(stderr, "Error: end_length is out of integer range\n");
+                errors++;
+            }
+            break;
+        case 'n':
+            metric_info->trials = strtoul(optarg, (char **)NULL, 0);
+            if(metric_info->trials < (metric_info->warmup * 2)) {
+                fprintf(stderr, "Error: trials (%ld) must be >= 2*warmup "
+                        "(%ld)\n", metric_info->trials, metric_info->warmup * 2);
+                errors++;
+            }
+            break;
+        case 'p':
+            metric_info->warmup = strtoul(optarg, (char **)NULL, 0);
+            if(metric_info->warmup > (metric_info->trials/2)) {
+                fprintf(stderr, "Error: warmup (%ld) must be <= trials/2 "
+                        "(%ld)\n", metric_info->warmup, metric_info->trials/2);
+                errors++;
+            }
+            break;
+        case 'k':
+            metric_info->unit = KB;
+            if (metric_info->t_type != BW)
+                errors++;
+            break;
+        case 'b':
+            metric_info->unit = B;
+            if (metric_info->t_type != BW)
+                errors++;
+            break;
+        case 'v':
+            metric_info->validate = true;
+            if(metric_info->t_type == BW && metric_info->target_data) 
+                errors++;
+            break;
+        case 'w':
+            metric_info->window_size = strtoul(optarg, (char **)NULL, 0);
+            if (metric_info->t_type != BW) {
+                errors++;
+            } else {
+                if (metric_info->target_data) {
+                    errors++;
+                }
+            }
+            break;
+        case 't':
+            metric_info->target_data = true;
+            if (metric_info->t_type != BW) {
+                errors++;
+            } else {
+                if (metric_info->validate) {
+                    errors++;
+                }
+            }
+            break;
+        case 'r':
+            metric_info->sztarget = strtoul(optarg, (char **)NULL, 0);
+            break;
+        case 'l':
+            metric_info->szinitiator = strtoul(optarg, (char **)NULL, 0);
+            break;
+        case 'C':
+#if defined(ENABLE_THREADS)
+            if (strcmp(optarg, "SINGLE") == 0) {
+                metric_info->thread_safety = SHMEM_THREAD_SINGLE;
+            } else if (strcmp(optarg, "FUNNELED") == 0) {
+                metric_info->thread_safety = SHMEM_THREAD_FUNNELED;
+            } else if (strcmp(optarg, "SERIALIZED") == 0) {
+                metric_info->thread_safety = SHMEM_THREAD_SERIALIZED;
+            } else if (strcmp(optarg, "MULTIPLE") == 0) {
+                metric_info->thread_safety = SHMEM_THREAD_MULTIPLE;
+            } else {
+                fprintf(stderr, "Invalid threading level: \"%s\"\n", optarg);
+                errors++;
+            }
+#else
+            fprintf(stderr, "Threading support disabled. "
+                            "Ignoring threading level: \"%s\"\n", optarg);
+            metric_info->thread_safety = 0;
+#endif
+            break;
+        case 'T':
+            metric_info->nthreads = atoi(optarg);
+            break;
+        case 'i':
+            metric_info->individual_report = 1;
+            break;
+        default:
+            errors++;
+            break;
+        }
+    }
+
+    return errors;
+}
+
+static inline
+void print_usage(int errors) {
+    fprintf(stderr, "\nNumber of errors in the command line: %d\n", errors);
+    fprintf(stderr, "\nUsage: <benchmark executable> [OPTION]\n" 
+           " -s START_MSG_SIZE           Smallest message size. Must be power of 2\n"
+           " -e END_MSG_SIZE             Largest message size. Must be power of 2\n"
+           " -p WARMUP                   Number of warmup iterations\n"
+           " -n TRIALS                   Number of trial iterations. Must be at\n"
+           "                             least twice of WARMUP\n"
+           " -w WINDOW_SIZE              Window size for streaming. Cannot be used\n"
+           "                             in conjunction with -t. Specific to band-\n"
+           "                             -width experiments\n"
+           " -k                          Setting bandwidth metric to kbytes/second\n"
+           " -b                          Setting bandwidth metric to bytes/second\n"
+           " -v                          Turning on validation of data\n"
+           " -i                          Turning on individual process reporting\n"
+           " -t                          Output data for target side (default is \n"
+           "                             initiator, only use with Put Bandwidth),\n"
+           "                             cannot be used in conjunction with \n"
+           "                             validate, special sizes used, trials + \n"
+           "                             warmup * sizes (8/4KB) <= max length \n"
+           " -r TARGET_SIZE              Number of target nodes, use only with -t;\n"
+           " -l SOURCE_SIZE              Number of initiator nodes, use only with\n"
+           "                             -t\n"
+           " -T THREADS                  Number of threads\n"
+           " -C THREAD_LEVEL             SHMEM thread level. Possible values: \n"
+           "                             SINGLE, FUNNELED, SERIALIZED, MULTIPLE \n"
+           );
+}
+
+
+#if defined(ENABLE_THREADS)
+static 
+const char *thread_safety_str(perf_metrics_t * const metric_info) {
+    if (metric_info->thread_safety == SHMEM_THREAD_SINGLE) {
+        return "SINGLE";
+    } else if (metric_info->thread_safety == SHMEM_THREAD_FUNNELED) {
+        return "FUNNELED";
+    } else if (metric_info->thread_safety == SHMEM_THREAD_SERIALIZED) {
+        return "SERIALIZED";
+    } else if (metric_info->thread_safety == SHMEM_THREAD_MULTIPLE) {
+        return "MULTIPLE";
+    } else {
+        fprintf(stderr, "Unexpected thread safety value: %d. " 
+                        "Setting it to SINGLE\n", metric_info->thread_safety);
+        metric_info->thread_safety = SHMEM_THREAD_SINGLE;
+        return "SINGLE";
+    }
+}
+
+static inline 
+void thread_safety_validation_check(perf_metrics_t * const metric_info) {
+    if (metric_info->nthreads == 1)
+        return;
+    else {
+        if (metric_info->thread_safety != SHMEM_THREAD_MULTIPLE) {
+            if(metric_info->my_node == 0) {
+                fprintf(stderr, "Warning: argument \"-T %d\" is ignored"
+                                " because of the thread level specified."
+                                " Switching to single thread with thread" 
+                                " safety %s\n", metric_info->nthreads,
+                                  thread_safety_str(metric_info));
+            }
+            metric_info->nthreads = 1;
+        }
+        return;
+    }
+}
+#endif
+
+static inline 
+int only_even_PEs_check(int my_node, int num_pes) {
+    if (num_pes % 2 != 0) {
+        if (my_node == 0) {
+            fprintf(stderr, "Only even number of processes can be used\n");
+        }
+        return 77;
+    } else
+        return 0;
+}
+
+
+/* Returns partner node; Assumes only one partner */
+static inline 
+int partner_node(const perf_metrics_t * const my_info)
+{
+    if (my_info->num_pes == 1)
+        return 0;
+
+    if (my_info->t_type == BW) {
+        if(my_info->cstyle == COMM_PAIRWISE) {
+            int pairs = my_info->midpt;
+
+            return (my_info->my_node < pairs ? (my_info->my_node + pairs) :
+                   (my_info->my_node - pairs));
+        } else {
+            assert(my_info->cstyle == COMM_INCAST);
+            return INCAST_PE;
+        }
+    } else {
+        int pairs = my_info->midpt;
+
+        return (my_info->my_node < pairs ? (my_info->my_node + pairs) :
+               (my_info->my_node - pairs));
+    }
+}
+
+static inline
+int streaming_node(const perf_metrics_t * const my_info)
+{
+    if(my_info->cstyle == COMM_PAIRWISE) {
+        return (my_info->my_node < my_info->szinitiator);
+    } else {
+        assert(my_info->cstyle == COMM_INCAST);
+        return true;
+    }
+}
+
+static inline
+int target_node(const perf_metrics_t * const my_info)
+{
+    return (my_info->my_node >= my_info->midpt &&
+        (my_info->my_node < (my_info->midpt + my_info->sztarget)));
+}
+
+static inline 
+int is_streaming_node(const perf_metrics_t * const my_info, int node)
+{
+    if (my_info->cstyle == COMM_PAIRWISE) {
+        return (node < my_info->szinitiator);
+    } else {
+        assert(my_info->cstyle == COMM_INCAST);
+        return true;
+    }
+}
+
+static inline
+int check_hostname_validation(const perf_metrics_t * const my_info) {
+
+    int hostname_status = -1;
+
+    /* hostname_size should be a length divisible by 4 */
+    int hostname_size = (MAX_HOSTNAME_LEN % 4 == 0) ? MAX_HOSTNAME_LEN :
+                         MAX_HOSTNAME_LEN + (4 - MAX_HOSTNAME_LEN % 4);
+    int i, errors = 0;
+
+    /* pSync for fcollect of hostnames */
+    static long pSync_collect[SHMEM_COLLECT_SYNC_SIZE];
+    for (i = 0; i < SHMEM_COLLECT_SYNC_SIZE; i++)
+        pSync_collect[i] = SHMEM_SYNC_VALUE;
+
+    char *hostname = (char *) shmem_malloc (hostname_size * sizeof(char));
+    char *dest = (char *) shmem_malloc (my_info->num_pes * hostname_size * 
+                                        sizeof(char));
+
+    if (hostname == NULL || dest == NULL) {
+        fprintf(stderr, "shmem_malloc failed to allocate for hostname strings\n");
+        return -1;
+    }
+
+    hostname_status = gethostname(hostname, hostname_size);
+    if (hostname_status != 0) {
+        fprintf(stderr, "gethostname failed (%d)\n", hostname_status);
+        return -1;
+    }
+    shmem_barrier_all();
+
+    /* nelems needs to be updated based on 32-bit API */
+    shmem_fcollect32(dest, hostname, hostname_size/4, 0, 0, my_info->num_pes, 
+                     pSync_collect);
+
+    char *snode_name = NULL;
+    char *tnode_name = NULL;
+    for (i = 0; i < my_info->num_pes; i++) {
+        char *curr_name = &dest[i * hostname_size];
+
+        if (is_streaming_node(my_info, i)) {
+            if (snode_name == NULL) {
+                snode_name = curr_name;
+            }
+
+            if (strncmp(snode_name, curr_name, hostname_size) != 0) {
+                fprintf(stderr, "PE %d on %s is a streaming node "
+                                "but not placed on %s\n", i, curr_name, 
+                                 snode_name);
+                errors++;
+            }
+        } else {
+            if (tnode_name == NULL) {
+                tnode_name = curr_name;
+            }
+
+            if (strncmp(tnode_name, curr_name, hostname_size) != 0) {
+                fprintf(stderr, "PE %d on %s is a target node "
+                                "but not placed on %s\n", i, curr_name, 
+                                 tnode_name);
+                errors++;
+            }
+        }
+    }
+
+    if (snode_name == NULL || tnode_name == NULL) {
+        fprintf(stderr, "Error: no streaming or target node\n");
+        return -1;
+    }
+
+    if (strncmp(snode_name, tnode_name, hostname_size) == 0) {
+        fprintf(stderr, "Warning: senders and receivers are running on the "
+                        "same node %s\n", snode_name);
+    }
+
+    shmem_free(dest);
+    shmem_free(hostname);
+
+    return errors;
+}
+
+static
+int error_checking_init_target_usage(perf_metrics_t * const metric_info) {
+    int error = false;
+    assert(metric_info->midpt > 0);
+
+    if (metric_info->sztarget != -1 && metric_info->szinitiator == -1) {
+        if (metric_info->sztarget < 1 ||
+            metric_info->sztarget > metric_info->midpt ||
+            !metric_info->target_data) {
+            error = true;
+        } else {
+            metric_info->szinitiator = metric_info->midpt;
+        }
+    } else if (metric_info->sztarget == -1 && metric_info->szinitiator != -1) {
+        if( metric_info->szinitiator < 1 ||
+            metric_info->szinitiator > metric_info->midpt ||
+            !metric_info->target_data) {
+            error = true;
+        } else {
+            metric_info->sztarget = metric_info->midpt;
+        }
+    } else if (metric_info->sztarget == -1 && metric_info->szinitiator == -1) {
+        metric_info->szinitiator = metric_info->midpt;
+        metric_info->sztarget = metric_info->midpt;
+    } else {
+        if (!metric_info->target_data) {
+            error = true;
+        }
+    }
+
+    if (error) {
+        fprintf(stderr, "Invalid usage of command line arg -r/-l\n");
+        return -1;
+    }
+    return 0;
+}
+
+static inline
+void large_message_metric_chg(perf_metrics_t * const metric_info, int len) {
+    if(len > LARGE_MESSAGE_SIZE) {
+        metric_info->window_size = WINDOW_SIZE_LARGE;
+        metric_info->trials = TRIALS_LARGE;
+        metric_info->warmup = WARMUP_LARGE;
+    }
+}
+
+/* put/get bw use opposite streaming/validate nodes */
+static inline
+red_PE_set validation_set(perf_metrics_t * const my_info, int *nPEs)
+{
+    if(my_info->cstyle == COMM_PAIRWISE) {
+        if(streaming_node(my_info)) {
+            *nPEs = my_info->szinitiator;
+            return FIRST_HALF;
+        } else if(target_node(my_info)) {
+            *nPEs = my_info->sztarget;
+            return SECOND_HALF;
+        } else {
+            fprintf(stderr, "Warning: you are getting data from a node that "
+                "wasn't a part of the perf set \n ");
+            return 0;
+        }
+    } else {
+        assert(my_info->cstyle == COMM_INCAST);
+        *nPEs = my_info->num_pes;
+        return FULL_SET;
+    }
+}
+
+/* reduction to collect performance results from PE set
+ * then start_pe will print results --- assumes num_pes is even */
+static inline
+void PE_set_used_adjustments(int *nPEs, int *stride, int *start_pe,
+                             perf_metrics_t * const my_info) {
+    red_PE_set PE_set = validation_set(my_info, nPEs);
+
+    if(PE_set == FIRST_HALF || PE_set == FULL_SET) {
+        *start_pe = 0;
+    }
+    else {
+        assert(PE_set == SECOND_HALF);
+        *start_pe = my_info->midpt;
+    }
+
+    *stride = 0; /* back to back PEs */
+}
+
+static
+void print_header(perf_metrics_t * const metric_info) {
+    printf("\n%20sSandia OpenSHMEM Performance Suite%20s\n", " ", " ");
+    printf("%20s==================================%20s\n", " ", " ");
+    printf("Total Number of PEs:    %10d%6sWindow size:            %10lu\n", 
+            metric_info->num_pes, " ", metric_info->window_size);
+    printf("Number of source PEs:   %10d%6sMaximum message size:   %10lu\n", 
+            metric_info->szinitiator, " ", metric_info->max_len);
+    printf("Number of target PEs:   %10d%6sNumber of threads:      %10d\n", 
+            metric_info->sztarget, " ", metric_info->nthreads);
+    printf("Iteration count:        %10lu%6s", metric_info->trials, " ");
+#if defined(ENABLE_THREADS)
+    printf("Thread safety:          %10s\n", thread_safety_str(metric_info));
+#endif
+    printf("\n");
 }
diff --git a/test/performance/shmem_perf_suite/int_element_latency.h b/test/performance/shmem_perf_suite/int_element_latency.h
index 45e857e..a32cab7 100644
--- a/test/performance/shmem_perf_suite/int_element_latency.h
+++ b/test/performance/shmem_perf_suite/int_element_latency.h
@@ -25,69 +25,101 @@
  * SOFTWARE.
  */
 
-void static inline
-int_p_latency(perf_metrics_t data)
+static inline
+void int_p_latency(perf_metrics_t * const metric_info)
 {
     double start = 0.0;
     double end = 0.0;
     unsigned int i = 0;
+    int dest = partner_node(metric_info);
+    int sender = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true;
+    static int check_once = 0;
+
+    if (!check_once) {
+        /* check to see whether sender and receiver are the same process */
+        if (dest == metric_info->my_node) {
+            fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n",
+                             dest);
+        }
+        /* hostname validation for all sender and receiver processes */
+        int status = check_hostname_validation(metric_info);
+        if (status != 0) return;
+        check_once++;
+    }
 
-    if (data.my_node == PUT_IO_NODE) {
-        printf("\nStream shmem_int_p results:\n");
-        print_results_header();
+    if (metric_info->my_node == 0) {
+        printf("\nshmem_int_p results:\n");
+        print_latency_header();
     }
+    shmem_barrier_all();
 
-    /*puts to zero to match gets validation scheme*/
-    if (data.my_node == PUT_IO_NODE) {
+    /* puts to zero to match gets validation scheme */
+    if (sender) {
 
-        for (i = 0; i < data.trials + data.warmup; i++) {
-            if(i == data.warmup)
+        for (i = 0; i < metric_info->trials + metric_info->warmup; i++) {
+            if(i == metric_info->warmup)
                 start = perf_shmemx_wtime();
 
-            shmem_int_p((int*) data.dest, data.my_node, 0);
+            shmem_int_p((int*) metric_info->dest, metric_info->my_node, dest);
             shmem_quiet();
 
         }
         end = perf_shmemx_wtime();
 
-        calc_and_print_results(start, end, sizeof(int), data);
+        calc_and_print_results(start, end, sizeof(int), metric_info);
     }
 
     shmem_barrier_all();
 
-    if((data.my_node == 0) && data.validate)
-        validate_recv(data.dest, sizeof(int), partner_node(data.my_node));
+    if(!sender && metric_info->validate)
+        validate_recv(metric_info->dest, sizeof(int), dest);
 
 } /* latency/bw for one-way trip */
 
-void static inline
-int_g_latency(perf_metrics_t data)
+static inline
+void int_g_latency(perf_metrics_t * const metric_info)
 {
     double start = 0.0;
     double end = 0.0;
     unsigned int i = 0;
     int rtnd = -1;
+    int dest = partner_node(metric_info);
+    int receiver = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true;
+    static int check_once = 0;
+
+    if (!check_once) {
+        /* check to see whether sender and receiver are the same process */
+        if (dest == metric_info->my_node) {
+            fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n",
+                             dest);
+        }
+        /* hostname validation for all sender and receiver processes */
+        int status = check_hostname_validation(metric_info);
+        if (status != 0) return;
+        check_once++;
+    }
 
-    if (data.my_node == GET_IO_NODE) {
-        printf("\nStream shmem_int_g results:\n");
-        print_results_header();
+    if (metric_info->my_node == 0) {
+        printf("\nshmem_int_g results:\n");
+        print_latency_header();
     }
+    shmem_barrier_all();
 
-    if (data.my_node == GET_IO_NODE) {
+    if (receiver) {
 
-        for (i = 0; i < data.trials + data.warmup; i++) {
-            if(i == data.warmup)
+        for (i = 0; i < metric_info->trials + metric_info->warmup; i++) {
+            if(i == metric_info->warmup)
                 start = perf_shmemx_wtime();
 
-            rtnd = shmem_int_g((int*) data.src, 1);
+            rtnd = shmem_int_g((int*) metric_info->src, dest);
         }
         end = perf_shmemx_wtime();
 
-        calc_and_print_results(start, end, sizeof(int), data);
+        calc_and_print_results(start, end, sizeof(int), metric_info);
     }
 
     shmem_barrier_all();
 
-    if((data.my_node == 0) && data.validate)
-        validate_recv((char*) &rtnd, sizeof(int), partner_node(data.my_node));
+    if(receiver && metric_info->validate)
+        validate_recv((char*) &rtnd, sizeof(int), dest);
 }
diff --git a/test/performance/shmem_perf_suite/latency_common.h b/test/performance/shmem_perf_suite/latency_common.h
index 482b47c..3ba7b78 100644
--- a/test/performance/shmem_perf_suite/latency_common.h
+++ b/test/performance/shmem_perf_suite/latency_common.h
@@ -26,116 +26,64 @@
  */
 
 #include <common.h>
+#ifdef ENABLE_OPENMP
+#include <omp.h>
+#endif
 
-#define PUT_IO_NODE 1
-#define GET_IO_NODE !PUT_IO_NODE
 #define INIT_VALUE 1
 
-#define MAX_MSG_SIZE (1<<23)
-#define START_LEN 1
-
-#define INC 2
-#define TRIALS 100
-#define WARMUP 10
-
-typedef struct perf_metrics {
-   unsigned int start_len, max_len;
-   unsigned int inc, trials;
-   unsigned int warmup;
-   int validate;
-   int my_node, npes;
-   long * target;
-   char * src, *dest;
-} perf_metrics_t;
-
-void static data_init(perf_metrics_t * data) {
-   data->start_len = START_LEN;
-   data->max_len = MAX_MSG_SIZE;
-   data->inc = INC;
-   data->trials = TRIALS;
-   data->warmup = WARMUP; /*number of initial iterations to skip*/
-   data->validate = false;
-   data->my_node = shmem_my_pe();
-   data->npes = shmem_n_pes();
-   data->target = NULL;
-   data->src = NULL;
-   data->dest = NULL;
-}
-
-void static inline print_results_header(void) {
-   printf("\nLength                  Latency                       \n");
-   printf("in bytes            in micro seconds              \n");
-}
-
-/*not storing results, only outputing it*/
-void static inline calc_and_print_results(double start, double end, int len,
-                                         perf_metrics_t data) {
-    double latency = 0.0;
-    latency = (end - start) / data.trials;
-
-    printf("%9d           %8.2f             \n", len, latency);
+static 
+void init_metrics(perf_metrics_t * const metric_info) {
+    metric_info->t_type = LAT;
+    set_metric_defaults(metric_info);
+    metric_info->target = NULL;
+    metric_info->cstyle = COMM_PAIRWISE;
+    metric_info->opstyle = STYLE_RMA;
 }
 
-int static inline partner_node(int my_node)
-{
-    return ((my_node % 2 == 0) ? (my_node + 1) : (my_node - 1));
+static inline 
+void print_latency_header(void) {
+    printf("\nMessage Size%15sLatency\n", " ");
+    printf("%4sin bytes%17sin us\n", " ", " ");
 }
 
-void static inline command_line_arg_check(int argc, char *argv[],
-                            perf_metrics_t *metric_info) {
-    int ch, error = false;
-    extern char *optarg;
-
-    /* check command line args */
-    while ((ch = getopt(argc, argv, "e:s:n:v")) != EOF) {
-        switch (ch) {
-        case 's':
-            metric_info->start_len = strtol(optarg, (char **)NULL, 0);
-            if ( metric_info->start_len < 1 ) metric_info->start_len = 1;
-            if(!is_pow_of_2(metric_info->start_len)) error = true;
-            break;
-        case 'e':
-            metric_info->max_len = strtol(optarg, (char **)NULL, 0);
-            if(!is_pow_of_2(metric_info->max_len)) error = true;
-            if(metric_info->max_len < metric_info->start_len) error = true;
-            break;
-        case 'n':
-            metric_info->trials = strtol(optarg, (char **)NULL, 0);
-            if(metric_info->trials <= (metric_info->warmup*2)) error = true;
-            break;
-        case 'v':
-            metric_info->validate = true;
-            break;
-        default:
-            error = true;
-            break;
-        }
+/* calculation and printing of the latency */
+static inline 
+void calc_and_print_results(double start, double end, int len,
+                            perf_metrics_t * const metric_info) {
+    int stride = 0, start_pe = 0, nPEs = 0;
+    int nred_elements = 1;
+    static double latency = 0.0, avg_latency = 0.0;
+    static double pwrk[SHMEM_REDUCE_MIN_WRKDATA_SIZE];
+    
+    PE_set_used_adjustments(&nPEs, &stride, &start_pe, metric_info);
+
+    if (end > 0 && start > 0 && (end - start) > 0) {
+        latency = (end - start) / metric_info->trials;
+    } else {
+        fprintf(stderr, "Incorrect time measured from latency test: "
+                        "start = %lf, end = %lf\n", start, end);
     }
 
-    if (error) {
-        if (metric_info->my_node == 0) {
-            fprintf(stderr, "Usage: [-s start_length] [-e end_length] "\
-                    ": lengths must be a power of two \n " \
-                    "[-n trials (must be greater than 20)] "\
-                    "[-v (validate results)]\n");
-        }
-#ifndef VERSION_1_0
-        shmem_finalize();
-#endif
-        exit (-1);
+    if (metric_info->individual_report == 1) {
+        printf("Individual latency for PE %6d is %10.2f\n",
+                metric_info->my_node, latency);
+    }
+    shmem_barrier(start_pe, stride, nPEs, bar_psync);
+
+    if (nPEs >= 2) {
+        shmem_double_sum_to_all(&avg_latency, &latency, 
+                                nred_elements, start_pe, stride,
+                                nPEs, pwrk, red_psync);
+        avg_latency /= nPEs;
+    } else {
+        avg_latency = latency;
     }
-}
 
-void static inline only_two_PEs_check(int my_node, int num_pes) {
-    if (num_pes != 2) {
-        if (my_node == 0) {
-            fprintf(stderr, "2-nodes only test\n");
-        }
-#ifndef VERSION_1_0
-        shmem_finalize();
-#endif
-        exit(77);
+    if (metric_info->my_node == start_pe) {
+        printf("%2s%10d%12s%10.2f\n", " ", len, " ", avg_latency);
     }
+
 }
 
 /**************************************************************/
@@ -144,31 +92,39 @@ void static inline only_two_PEs_check(int my_node, int num_pes) {
 
 /*have single symmetric long element "target" from perf_metrics_t
  *  that needs to be initialized in function*/
-extern void long_element_round_trip_latency(perf_metrics_t data);
+extern void long_element_round_trip_latency(perf_metrics_t *data);
 
-extern void int_element_latency(perf_metrics_t data);
+extern void int_element_latency(perf_metrics_t *data);
 
 /*have symmetric buffers src/dest from perf_metrics_t
  *  that has been initialized to my_node number */
 extern void streaming_latency(int len, perf_metrics_t *data);
 
-void static inline  multi_size_latency(perf_metrics_t data, char *argv[]) {
+static inline 
+void multi_size_latency(perf_metrics_t * const data, char *argv[]) {
     unsigned int len;
-    int partner_pe = partner_node(data.my_node);
+    int partner_pe = partner_node(data);
 
-    for (len = data.start_len; len <= data.max_len; len *= data.inc) {
-
-        shmem_barrier_all();
-
-        streaming_latency(len, &data);
+    if (data->my_node == 0) {
+        print_latency_header();
+    }
 
-        shmem_barrier_all();
+    for (len = data->start_len; len <= data->max_len; len *= data->size_inc) {
+        large_message_metric_chg(data, len);
+        streaming_latency(len, data);
     }
 
     shmem_barrier_all();
 
-    if((data.my_node == 0) && data.validate)
-        validate_recv(data.dest, data.max_len, partner_pe);
+    if (data->validate) {
+        int errors = -1;
+        if ((streaming_node(data) && data->opstyle == STYLE_GET) ||
+            (target_node(data) && data->opstyle == STYLE_PUT))
+            errors = validate_recv(data->dest, data->max_len, partner_pe);
+        
+        if (errors >= 0)
+            printf("Validation complete (%d errors)\n", errors);
+    }
 }
 
 
@@ -177,58 +133,124 @@ void static inline  multi_size_latency(perf_metrics_t data, char *argv[]) {
 /*                   INIT and teardown of resources           */
 /**************************************************************/
 
-void static inline latency_init_resources(int argc, char *argv[],
-                                          perf_metrics_t *data) {
+static inline 
+int latency_init_resources(int argc, char *argv[],
+                           perf_metrics_t * const metric_info) {
+    init_metrics(metric_info);
+    int ret = command_line_arg_check(argc, argv, metric_info);
+
 #ifndef VERSION_1_0
+#if defined(ENABLE_THREADS)
+    int tl;
+    shmem_init_thread(metric_info->thread_safety, &tl);
+    if(tl != metric_info->thread_safety) {
+        fprintf(stderr,"Could not initialize with requested thread "
+                "level %d: got %d\n", metric_info->thread_safety, tl);
+        return -1;
+    }
+#else
     shmem_init();
+#endif
 #else
     start_pes(0);
 #endif
 
-    data_init(data);
+    update_metrics(metric_info);
 
-    only_two_PEs_check(data->my_node, data->npes);
+    if (ret) {
+        if (metric_info->my_node == 0) {
+            print_usage(ret);
+        }
+        return -1;
+    } else {
+        if (metric_info->num_pes < 2) {
+            fprintf(stderr, "This test requires at least two processes.\n");
+            print_usage(1);
+            return -1;
+        }
+    }
 
-    command_line_arg_check(argc, argv, data);
+    if (error_checking_init_target_usage(metric_info) == -1)
+        return -1;
+#if defined(ENABLE_THREADS)
+    thread_safety_validation_check(metric_info);
+#endif
+    init_psync_arrays();
 
-    data->src = aligned_buffer_alloc(data->max_len);
-    init_array(data->src, data->max_len, data->my_node);
+    if(only_even_PEs_check(metric_info->my_node, metric_info->num_pes) != 0) {
+        return -1;
+    }
 
-    data->dest = aligned_buffer_alloc(data->max_len);
-    init_array(data->dest, data->max_len, data->my_node);
+    metric_info->src = aligned_buffer_alloc(metric_info->max_len);
+    init_array(metric_info->src, metric_info->max_len, metric_info->my_node);
+
+    metric_info->dest = aligned_buffer_alloc(metric_info->max_len);
+    init_array(metric_info->dest, metric_info->max_len, metric_info->my_node);
 
 #ifndef VERSION_1_0
-    data->target = shmem_malloc(sizeof(long));
+    metric_info->target = shmem_malloc(sizeof(long));
 #else
-    data->target = shmalloc(sizeof(long));
+    metric_info->target = shmalloc(sizeof(long));
 #endif
+
+    return 0;
 }
 
-void static inline latency_free_resources(perf_metrics_t *data) {
+static inline 
+void latency_free_resources(const perf_metrics_t * const metric_info) {
     shmem_barrier_all();
 
 #ifndef VERSION_1_0
-    shmem_free(data->target);
+    shmem_free(metric_info->target);
 #else
-    shfree(data->target);
+    shfree(metric_info->target);
 #endif
-    aligned_buffer_free(data->src);
-    aligned_buffer_free(data->dest);
+    aligned_buffer_free(metric_info->src);
+    aligned_buffer_free(metric_info->dest);
+}
+
+static inline 
+void latency_finalize(void) {
 #ifndef VERSION_1_0
     shmem_finalize();
 #endif
 }
 
-void static inline latency_main(int argc, char *argv[]) {
-    perf_metrics_t data;
+static inline 
+void latency_main(int argc, char *argv[], op_style opstyle) {
+    perf_metrics_t metric_info;
+
+    int ret = latency_init_resources(argc, argv, &metric_info);
+    metric_info.opstyle = opstyle;
+
+    if (ret == 0) {
+        if (metric_info.my_node == 0) {
+            print_header(&metric_info);
+        }
+        long_element_round_trip_latency(&metric_info);
+        int_element_latency(&metric_info);
+        multi_size_latency(&metric_info, argv);
+        latency_free_resources(&metric_info);
+    }
 
-    latency_init_resources(argc, argv, &data);
+    latency_finalize();
+}
 
-    long_element_round_trip_latency(data);
+static inline
+void latency_main_ctx(int argc, char *argv[], op_style opstyle) {
+    perf_metrics_t metric_info;
 
-    int_element_latency(data);
+    int ret = latency_init_resources(argc, argv, &metric_info);
+    metric_info.opstyle = opstyle;
 
-    multi_size_latency(data, argv);
+    if (ret == 0) {
+        if (metric_info.my_node == 0) {
+            print_header(&metric_info);
+        }
+        multi_size_latency(&metric_info, argv);
+        latency_free_resources(&metric_info);
+    }
 
-    latency_free_resources(&data);
+    latency_finalize();
 }
+
diff --git a/test/performance/shmem_perf_suite/latency_ctx.h b/test/performance/shmem_perf_suite/latency_ctx.h
new file mode 100644
index 0000000..115e9d2
--- /dev/null
+++ b/test/performance/shmem_perf_suite/latency_ctx.h
@@ -0,0 +1,190 @@
+/*
+*  Copyright (c) 2018 Intel Corporation. All rights reserved.
+*  This software is available to you under the BSD license below:
+*
+*      Redistribution and use in source and binary forms, with or
+*      without modification, are permitted provided that the following
+*      conditions are met:
+*
+*      - Redistributions of source code must retain the above
+*        copyright notice, this list of conditions and the following
+*        disclaimer.
+*
+*      - Redistributions in binary form must reproduce the above
+*        copyright notice, this list of conditions and the following
+*        disclaimer in the documentation and/or other materials
+*        provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+
+static inline 
+void streaming_put_latency_ctx(int len, perf_metrics_t *metric_info, int streaming_node)
+{
+    double start = 0.0, end = 0.0;
+    unsigned long int i;
+    int dest = partner_node(metric_info);
+    static int check_once = 0;
+
+    if (!check_once) {
+        /* check to see whether sender and receiver are the same process */
+        if (dest == metric_info->my_node) {
+            fprintf(stderr, "Warning: Sender and receiver are the same "
+                            "process (%d)\n", dest);
+        }
+        /* hostname validation for all sender and receiver processes */
+        int status = check_hostname_validation(metric_info);
+        if (status != 0) return;
+        check_once++;
+    }
+
+    shmem_barrier_all();
+
+    if (streaming_node) {
+#pragma omp parallel default(none) firstprivate(len, dest) private(i) \
+shared(metric_info, start, end) num_threads(metric_info->nthreads)
+        {
+            const int thread_id = omp_get_thread_num();
+            shmem_ctx_t ctx;
+            shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx);
+
+            for (i = 0; i < metric_info->warmup; i++) {
+#ifdef USE_NONBLOCKING_API
+                shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, 
+                                     metric_info->src + thread_id * len, len, dest);
+#else
+                shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, 
+                                 metric_info->src + thread_id * len, len, dest);
+#endif
+                shmem_ctx_quiet(ctx);
+            }
+            shmem_ctx_destroy(ctx);
+        }
+    }
+
+    shmem_barrier_all();
+    if (streaming_node) {
+#pragma omp parallel default(none) firstprivate(len, dest) private(i) \
+shared(metric_info, start, end) num_threads(metric_info->nthreads)
+        {
+            const int thread_id = omp_get_thread_num();
+            shmem_ctx_t ctx;
+            shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx);
+
+#pragma omp barrier
+#pragma omp master
+            {
+                start = perf_shmemx_wtime();
+            }
+
+            for (i = 0; i < metric_info->trials; i++) {
+#ifdef USE_NONBLOCKING_API
+                shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, 
+                                     metric_info->src + thread_id * len, len, dest);
+#else
+                shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, 
+                                 metric_info->src + thread_id * len, len, dest);
+#endif
+                shmem_ctx_quiet(ctx);
+            }
+            shmem_ctx_destroy(ctx);
+        }
+    }
+
+    shmem_barrier_all();
+    if (streaming_node) {
+        end = perf_shmemx_wtime();
+        calc_and_print_results(start, end, len, metric_info);
+    }
+
+    shmem_barrier_all();
+}
+
+static inline
+void streaming_get_latency_ctx(int len, perf_metrics_t *metric_info, int streaming_node)
+{
+    double start = 0.0, end = 0.0;
+    unsigned long int i;
+    int dest = partner_node(metric_info);
+    static int check_once = 0;
+
+    if (!check_once) {
+        /* check to see whether sender and receiver are the same process */
+        if (dest == metric_info->my_node) {
+            fprintf(stderr, "Warning: Sender and receiver are the same "
+                            "process (%d)\n", dest);
+        }
+        /* hostname validation for all sender and receiver processes */
+        int status = check_hostname_validation(metric_info);
+        if (status != 0) return;
+        check_once++;
+    }
+
+    shmem_barrier_all();
+
+    if (streaming_node) {
+#pragma omp parallel default(none) firstprivate(len, dest) private(i) \
+shared(metric_info, start, end) num_threads(metric_info->nthreads)
+        {
+            const int thread_id = omp_get_thread_num();
+            shmem_ctx_t ctx;
+            shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx);
+
+            for (i = 0; i < metric_info->warmup; i++) {
+#ifdef USE_NONBLOCKING_API
+                shmem_ctx_getmem_nbi(ctx, metric_info->dest + thread_id * len,
+                                     metric_info->src + thread_id * len, len, dest);
+                shmem_ctx_quiet(ctx);
+#else
+                shmem_ctx_getmem(ctx, metric_info->dest + thread_id * len,
+                                 metric_info->src + thread_id * len, len, dest);
+#endif
+            }
+            shmem_ctx_destroy(ctx);
+        }
+    }
+
+    shmem_barrier_all();
+    if (streaming_node) {
+#pragma omp parallel default(none) firstprivate(len, dest) private(i) \
+shared(metric_info, start, end) num_threads(metric_info->nthreads)
+        {
+            const int thread_id = omp_get_thread_num();
+            shmem_ctx_t ctx;
+            shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx);
+
+#pragma omp barrier
+#pragma omp master
+            {
+                start = perf_shmemx_wtime();
+            }
+
+            for (i = 0; i < metric_info->trials; i++) {
+#ifdef USE_NONBLOCKING_API
+                shmem_ctx_getmem_nbi(ctx, metric_info->dest + thread_id * len,
+                                     metric_info->src + thread_id * len, len, dest);
+                shmem_ctx_quiet(ctx);
+#else
+                shmem_ctx_getmem(ctx, metric_info->dest + thread_id * len,
+                                 metric_info->src + thread_id * len, len, dest);
+#endif
+            }
+            shmem_ctx_destroy(ctx);
+        }
+    }
+
+    shmem_barrier_all();
+    if (streaming_node) {
+        end = perf_shmemx_wtime();
+        calc_and_print_results(start, end, len, metric_info);
+    }
+
+    shmem_barrier_all();
+}
diff --git a/test/performance/shmem_perf_suite/round_t_latency.h b/test/performance/shmem_perf_suite/round_t_latency.h
index 6b3d86e..4910e2d 100644
--- a/test/performance/shmem_perf_suite/round_t_latency.h
+++ b/test/performance/shmem_perf_suite/round_t_latency.h
@@ -25,79 +25,103 @@
  * SOFTWARE.
  */
 
-void static inline
-long_element_round_trip_latency_get(perf_metrics_t data)
+static inline
+void long_element_round_trip_latency_get(perf_metrics_t * const metric_info)
 {
     double start = 0.0;
     double end = 0.0;
-    int dest = 1;
-    int partner_pe = partner_node(data.my_node);
-    *data.target = data.my_node;
+    int dest = partner_node(metric_info);
+    int receiver = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true;
+    *metric_info->target = metric_info->my_node;
+    static int check_once = 0;
+
+    if (!check_once) {
+        /* check to see whether sender and receiver are the same process */
+        if (dest == metric_info->my_node) {
+            fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n",
+                             dest);
+        }
+        /* hostname validation for all sender and receiver processes */
+        int status = check_hostname_validation(metric_info);
+        if (status != 0) return;
+        check_once++;
+    }
 
-    if (data.my_node == GET_IO_NODE) {
-        printf("\nshmem_long_g results:\n");
-        print_results_header();
+    if (metric_info->my_node == 0) {
+        printf("shmem_long_g results:\n");
+        print_latency_header();
     }
 
     shmem_barrier_all();
 
-    if (data.my_node == GET_IO_NODE) {
+    if (receiver) {
         unsigned int i;
-        for (i = 0; i < data.trials + data.warmup; i++) {
-            if(i == data.warmup)
+        for (i = 0; i < metric_info->trials + metric_info->warmup; i++) {
+            if(i == metric_info->warmup)
                 start = perf_shmemx_wtime();
 
-            *data.target = shmem_long_g(data.target, dest);
+            *metric_info->target = shmem_long_g(metric_info->target, dest);
         }
         end = perf_shmemx_wtime();
 
-        calc_and_print_results(start, end, sizeof(long), data);
+        calc_and_print_results(start, end, sizeof(long), metric_info);
 
-        if(data.validate) {
-            if(*data.target != partner_pe)
+        if(metric_info->validate) {
+            if(*metric_info->target != dest)
                 printf("validation error shmem_long_g target = %ld != %d\n",
-                        *data.target, partner_pe);
+                        *metric_info->target, dest);
         }
     }
 } /*gauge small get pathway round trip latency*/
 
-void static inline
-long_element_round_trip_latency_put(perf_metrics_t data)
+static inline
+void long_element_round_trip_latency_put(perf_metrics_t * const metric_info)
 {
     double start = 0.0;
     double end = 0.0;
     long tmp;
-    int dest = (data.my_node + 1) % data.npes;
+    int dest = partner_node(metric_info);
+    int sender = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true;
     unsigned int i;
-    tmp = *data.target = INIT_VALUE;
+    tmp = *metric_info->target = INIT_VALUE;
+    static int check_once = 0;
+
+    if (!check_once) {
+        /* check to see whether sender and receiver are the same process */
+        if (dest == metric_info->my_node) {
+            fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n",
+                             dest);
+        }
+        /* hostname validation for all sender and receiver processes */
+        int status = check_hostname_validation(metric_info);
+        if (status != 0) return;
+        check_once++;
+    }
 
-    if (data.my_node == PUT_IO_NODE) {
-        printf("\nPing-Pong shmem_long_p results:\n");
-        print_results_header();
+    if (metric_info->my_node == 0) {
+        printf("Ping-Pong shmem_long_p results:\n");
+        print_latency_header();
     }
 
     shmem_barrier_all();
 
-    if (data.my_node == PUT_IO_NODE) {
-        for (i = 0; i < data.trials + data.warmup; i++) {
-            if(i == data.warmup)
+    if (sender) {
+        for (i = 0; i < metric_info->trials + metric_info->warmup; i++) {
+            if(i == metric_info->warmup)
                 start = perf_shmemx_wtime();
 
-            shmem_long_p(data.target, ++tmp, dest);
-
-            shmem_long_wait_until(data.target, SHMEM_CMP_EQ, tmp);
+            shmem_long_p(metric_info->target, ++tmp, dest);
+            shmem_long_wait_until(metric_info->target, SHMEM_CMP_EQ, tmp);
         }
         end = perf_shmemx_wtime();
-
-        data.trials = data.trials*2; /*output half to get single round trip time*/
-        calc_and_print_results(start, end, sizeof(long), data);
+        metric_info->trials = metric_info->trials * 2; /*output half to get single round trip time*/
+        calc_and_print_results(start, end, sizeof(long), metric_info);
 
    } else {
-        for (i = 0; i < data.trials + data.warmup; i++) {
-            shmem_long_wait_until(data.target, SHMEM_CMP_EQ, ++tmp);
-
-            shmem_long_p(data.target, tmp, dest);
+        for (i = 0; i < metric_info->trials + metric_info->warmup; i++) {
+            shmem_long_wait_until(metric_info->target, SHMEM_CMP_EQ, ++tmp);
+            shmem_long_p(metric_info->target, tmp, dest);
         }
    }
 
-} /*gauge small put pathway round trip latency*/
+} /* gauge small put pathway round trip latency */
diff --git a/test/performance/shmem_perf_suite/shmem_bibw_atomics_perf.c b/test/performance/shmem_perf_suite/shmem_bibw_atomics_perf.c
index f287646..05cebd8 100644
--- a/test/performance/shmem_perf_suite/shmem_bibw_atomics_perf.c
+++ b/test/performance/shmem_perf_suite/shmem_bibw_atomics_perf.c
@@ -27,13 +27,12 @@
 
 /*
 **
-**  This is a bandwidth centric test for put: back-to-back message rate
+**  This is a bandwidth centric test for atomic operations
 **
 **  Features of Test: bi-directional bandwidth
 **
 **  -by default megabytes/second results
 **
-**NOTE: this test assumes correctness of reduction algorithm
 */
 
 #include <bw_common.h>
@@ -42,14 +41,85 @@
     do {                                                                       \
         double start = 0.0, end = 0.0;                                         \
         unsigned long int i = 0, j = 0, num_itr = metric_info->trials + metric_info->warmup; \
-        int dest = partner_node(*metric_info);                                 \
+        int dest = partner_node(metric_info);                                 \
         shmem_barrier_all();                                                   \
                                                                                \
         switch(op) {                                                       \
+            case OP_SET:                                                   \
+                for(i = 0; i < num_itr; i++) {                             \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
+                                                                           \
+                    for(j = 0; j < metric_info->window_size; j++)          \
+                        shmem_##NAME##_atomic_set(                         \
+                            (TYPE *)(metric_info->dest), ONE, dest);       \
+                                                                           \
+                    shmem_quiet();                                         \
+                }                                                          \
+                if(snode)                                                  \
+                    end = perf_shmemx_wtime();                             \
+            break;                                                         \
+             case OP_AND:                                                   \
+                for(i = 0; i < num_itr; i++) {                             \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
+                                                                           \
+                    for(j = 0; j < metric_info->window_size; j++)          \
+                        shmem_##NAME##_atomic_and(                         \
+                            (TYPE *)(metric_info->dest), ONE, dest);       \
+                                                                           \
+                    shmem_quiet();                                         \
+                }                                                          \
+                if(snode)                                                  \
+                    end = perf_shmemx_wtime();                             \
+            break;                                                         \
+            case OP_OR:                                                    \
+                for(i = 0; i < num_itr; i++) {                             \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
+                                                                           \
+                    for(j = 0; j < metric_info->window_size; j++)          \
+                        shmem_##NAME##_atomic_or(                          \
+                            (TYPE *)(metric_info->dest), ONE, dest);       \
+                                                                           \
+                    shmem_quiet();                                         \
+                }                                                          \
+                if(snode)                                                  \
+                    end = perf_shmemx_wtime();                             \
+            break;                                                         \
+            case OP_XOR:                                                   \
+                for(i = 0; i < num_itr; i++) {                             \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
+                                                                           \
+                    for(j = 0; j < metric_info->window_size; j++)          \
+                        shmem_##NAME##_atomic_xor(                         \
+                            (TYPE *)(metric_info->dest), ONE, dest);       \
+                                                                           \
+                    shmem_quiet();                                         \
+                }                                                          \
+                if(snode)                                                  \
+                    end = perf_shmemx_wtime();                             \
+            break;                                                         \
             case OP_ADD:                                                   \
                 for(i = 0; i < num_itr; i++) {                             \
-                    if(snode && i == metric_info->warmup)                  \
-                        start = perf_shmemx_wtime();                       \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
                                                                            \
                     for(j = 0; j < metric_info->window_size; j++)          \
                         shmem_##NAME##_atomic_add(                         \
@@ -62,8 +132,11 @@
             break;                                                         \
             case OP_INC:                                                   \
                 for(i = 0; i < num_itr; i++) {                             \
-                    if(snode && i == metric_info->warmup)                  \
-                        start = perf_shmemx_wtime();                       \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
                                                                            \
                     for(j = 0; j < metric_info->window_size; j++)          \
                         shmem_##NAME##_atomic_inc(                         \
@@ -74,10 +147,73 @@
                 if(snode)                                                  \
                     end = perf_shmemx_wtime();                             \
             break;                                                         \
+            case OP_FETCH:                                                 \
+                for(i = 0; i < num_itr; i++) {                             \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
+                                                                           \
+                    for(j = 0; j < metric_info->window_size; j++)          \
+                        shmem_##NAME##_atomic_fetch(                       \
+                            (TYPE *)(metric_info->dest), dest);            \
+                }                                                          \
+                if(snode)                                                  \
+                    end = perf_shmemx_wtime();                             \
+            break;                                                         \
+            case OP_FAND:                                                  \
+                for(i = 0; i < num_itr; i++) {                             \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
+                                                                           \
+                    for(j = 0; j < metric_info->window_size; j++)          \
+                        shmem_##NAME##_atomic_fetch_and(                   \
+                            (TYPE *)(metric_info->dest), ONE, dest);       \
+                }                                                          \
+                if(snode)                                                  \
+                    end = perf_shmemx_wtime();                             \
+            break;                                                         \
+            case OP_FOR:                                                   \
+                for(i = 0; i < num_itr; i++) {                             \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
+                                                                           \
+                    for(j = 0; j < metric_info->window_size; j++)          \
+                        shmem_##NAME##_atomic_fetch_or(                    \
+                            (TYPE *)(metric_info->dest), ONE, dest);       \
+                }                                                          \
+                if(snode)                                                  \
+                    end = perf_shmemx_wtime();                             \
+            break;                                                         \
+            case OP_FXOR:                                                  \
+                for(i = 0; i < num_itr; i++) {                             \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
+                                                                           \
+                    for(j = 0; j < metric_info->window_size; j++)          \
+                        shmem_##NAME##_atomic_fetch_xor(                   \
+                            (TYPE *)(metric_info->dest), ONE, dest);       \
+                }                                                          \
+                if(snode)                                                  \
+                    end = perf_shmemx_wtime();                             \
+            break;                                                         \
             case OP_FADD:                                                  \
                 for(i = 0; i < num_itr; i++) {                             \
-                    if(snode && i == metric_info->warmup)                  \
-                        start = perf_shmemx_wtime();                       \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
                                                                            \
                     for(j = 0; j < metric_info->window_size; j++)          \
                         shmem_##NAME##_atomic_fetch_add(                   \
@@ -88,8 +224,11 @@
             break;                                                         \
             case OP_FINC:                                                  \
                 for(i = 0; i < num_itr; i++) {                             \
-                    if(snode && i == metric_info->warmup)                  \
-                        start = perf_shmemx_wtime();                       \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
                                                                            \
                     for(j = 0; j < metric_info->window_size; j++)          \
                         shmem_##NAME##_atomic_fetch_inc(                   \
@@ -100,8 +239,11 @@
             break;                                                         \
             case OP_SWAP:                                                  \
                 for(i = 0; i < num_itr; i++) {                             \
-                    if(snode && i == metric_info->warmup)                  \
-                        start = perf_shmemx_wtime();                       \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
                                                                            \
                     for(j = 0; j < metric_info->window_size; j++)          \
                         shmem_##NAME##_atomic_swap(                        \
@@ -112,8 +254,11 @@
             break;                                                         \
             case OP_CSWAP:                                                 \
                 for(i = 0; i < num_itr; i++) {                             \
-                    if(snode && i == metric_info->warmup)                  \
-                        start = perf_shmemx_wtime();                       \
+                    if (i == metric_info->warmup) {                        \
+                        shmem_barrier_all();                               \
+                        if (snode)                                         \
+                            start = perf_shmemx_wtime();                   \
+                    }                                                      \
                                                                            \
                     for(j = 0; j < metric_info->window_size; j++)          \
                         shmem_##NAME##_atomic_compare_swap(                \
@@ -128,69 +273,62 @@
                 break;                                                     \
         }                                                                  \
             if(snode)                                                      \
-                calc_and_print_results(end, start, len, *metric_info);  \
+                calc_and_print_results(end, start, len, metric_info);     \
     } while(0)
 
 
 #define NUM_INC 100
 
-typedef enum {
-    OP_ADD,
-    OP_INC,
-    OP_FADD,
-    OP_FINC,
-    OP_SWAP,
-    OP_CSWAP,
-    SIZE_OF_OP
-} atomic_op_type;
+static const char * atomic_op_names [] = { "fetch", "set", "cswap", "swap", "finc", "inc",
+                                           "fadd", "add", "fand", "and", "for", "or",
+                                           "fxor", "xor" };
 
-static const char * op_names [] = { "add", "inc", "fadd", "finc", "swap", "cswap" };
 
-static inline void bw_set_metric_info_len(perf_metrics_t *metric_info)
+static inline void bw_set_metric_info_len(perf_metrics_t * const metric_info)
 {
-    unsigned int atomic_sizes[ATOMICS_N_DTs] = {sizeof(int), sizeof(long),
-                                        sizeof(long long)};
-    int snode = streaming_node(*metric_info);
-    atomic_op_type op_type = OP_ADD;
-    metric_info->type = BI_DIR;
-    metric_info->bwstyle = STYLE_ATOMIC;
-
-    for(op_type = OP_ADD; op_type < SIZE_OF_OP; op_type++) {
-        if(metric_info->my_node == 0 && op_type != OP_ADD)
-            printf("\nshmem_%s\n", op_names[op_type]);
+    unsigned int atomic_sizes[ATOMICS_N_DTs] = {sizeof(unsigned int), sizeof(unsigned long),
+                                        sizeof(unsigned long long)};
+    metric_info->b_type = BI_DIR;
+    int snode = streaming_node(metric_info);
+    atomic_op_type op_type = OP_FETCH;
 
+    for(op_type = OP_FETCH; op_type < SIZE_OF_OP; op_type++) {
+        if(metric_info->my_node == 0) {
+            printf("\nshmem_%s\n", atomic_op_names[op_type]);
+            printf("-----------\n");
+        }
         metric_info->start_len = atomic_sizes[0];
         metric_info->max_len = atomic_sizes[0];
         metric_info->size_inc = NUM_INC;
 
         shmem_barrier_all();
 
-        bi_bw(atomic_sizes[0], metric_info, snode, int, int, op_type);
+        bi_bw(atomic_sizes[0], metric_info, snode, uint, unsigned int, op_type);
 
         metric_info->start_len = atomic_sizes[1];
         metric_info->max_len = atomic_sizes[1];
 
         shmem_barrier_all();
 
-        bi_bw(atomic_sizes[1], metric_info, snode, long, long, op_type);
+        bi_bw(atomic_sizes[1], metric_info, snode, ulong, unsigned long, op_type);
 
         metric_info->start_len = atomic_sizes[2];
         metric_info->max_len = atomic_sizes[2];
 
         shmem_barrier_all();
 
-        bi_bw(atomic_sizes[2], metric_info, snode, longlong, long long, op_type);
+        bi_bw(atomic_sizes[2], metric_info, snode, ulonglong, unsigned long long, op_type);
     }
 }
 
-void bi_dir_bw(int len, perf_metrics_t *metric_info)
+void bi_dir_bw(int len, perf_metrics_t * const metric_info)
 {
     bw_set_metric_info_len(metric_info);
 }
 
 int main(int argc, char *argv[])
 {
-    bi_dir_bw_main(argc, argv);
+    bi_dir_bw_main(argc, argv, STYLE_ATOMIC);
 
     return 0;
 }
diff --git a/test/performance/shmem_perf_suite/shmem_bibw_get_perf.c b/test/performance/shmem_perf_suite/shmem_bibw_get_perf.c
index df82042..bdf0f17 100644
--- a/test/performance/shmem_perf_suite/shmem_bibw_get_perf.c
+++ b/test/performance/shmem_perf_suite/shmem_bibw_get_perf.c
@@ -33,7 +33,6 @@
 **
 **  -by default megabytes/second results
 **
-**NOTE: this test assumes correctness of reduction algorithm
 */
 
 #include <bw_common.h>
@@ -41,13 +40,13 @@
 
 int main(int argc, char *argv[])
 {
-    bi_dir_bw_main(argc,argv);
+    bi_dir_bw_main(argc, argv, STYLE_GET);
 
     return 0;
 }  /* end of main() */
 
 void
-bi_dir_bw(int len, perf_metrics_t *metric_info)
+bi_dir_bw(int len, perf_metrics_t * const metric_info)
 {
     bi_bw_get(len, metric_info);
 }
diff --git a/test/performance/shmem_perf_suite/shmem_bibw_put_ctx_perf.c b/test/performance/shmem_perf_suite/shmem_bibw_put_ctx_perf.c
index 4ddbed2..a6b6564 100644
--- a/test/performance/shmem_perf_suite/shmem_bibw_put_ctx_perf.c
+++ b/test/performance/shmem_perf_suite/shmem_bibw_put_ctx_perf.c
@@ -33,7 +33,6 @@
 **
 **  -by default megabytes/second results
 **
-**NOTE: this test assumes correctness of reduction algorithm
 */
 
 #define ENABLE_OPENMP
@@ -43,13 +42,13 @@
 
 int main(int argc, char *argv[])
 {
-    bi_dir_bw_main(argc, argv);
+    bi_dir_bw_main(argc, argv, STYLE_PUT);
 
     return 0;
 }
 
 void
-bi_dir_bw(int len, perf_metrics_t *metric_info)
+bi_dir_bw(int len, perf_metrics_t * const metric_info)
 {
     bi_bw_ctx(len, metric_info);
 }
diff --git a/test/performance/shmem_perf_suite/shmem_bibw_put_perf.c b/test/performance/shmem_perf_suite/shmem_bibw_put_perf.c
index 7f5589a..279536a 100644
--- a/test/performance/shmem_perf_suite/shmem_bibw_put_perf.c
+++ b/test/performance/shmem_perf_suite/shmem_bibw_put_perf.c
@@ -33,7 +33,6 @@
 **
 **  -by default megabytes/second results
 **
-**NOTE: this test assumes correctness of reduction algorithm
 */
 
 #include <bw_common.h>
@@ -41,13 +40,13 @@
 
 int main(int argc, char *argv[])
 {
-    bi_dir_bw_main(argc, argv);
+    bi_dir_bw_main(argc, argv, STYLE_PUT);
 
     return 0;
 }
 
 void
-bi_dir_bw(int len, perf_metrics_t *metric_info)
+bi_dir_bw(int len, perf_metrics_t * const metric_info)
 {
     bi_bw_put(len, metric_info);
 }
diff --git a/test/performance/shmem_perf_suite/shmem_bw_atomics_perf.c b/test/performance/shmem_perf_suite/shmem_bw_atomics_perf.c
index 79cd499..d48208b 100644
--- a/test/performance/shmem_perf_suite/shmem_bw_atomics_perf.c
+++ b/test/performance/shmem_perf_suite/shmem_bw_atomics_perf.c
@@ -27,135 +27,345 @@
 
 /*
 **
-**  This is a bandwidth centric test for put: back-to-back message rate
+**  This is a bandwidth centric test for atomic operations
 **
 **  Features of Test: uni-directional bandwidth
 **
 **  -by default megabytes/second results
 **
-**NOTE: this test assumes correctness of reduction algorithm
 */
 #include <bw_common.h>
 
 #define ATOMIC_COMM_STYLE COMM_INCAST
 
-#define uni_bw(len, metric_info, snode, NAME, TYPE, op)               \
+#define uni_bw(len, metric_info, snode, NAME, TYPE, op)                        \
     do {                                                                       \
         double start = 0.0, end = 0.0;                                         \
-        unsigned long int i = 0, j = 0, num_itr = metric_info->trials + metric_info->warmup; \
-        int dest = partner_node(*metric_info);                                 \
+        unsigned long int i = 0, j = 0;                                        \
+        int dest = partner_node(metric_info);                                 \
         shmem_barrier_all();                                                   \
                                                                                \
-        if(snode) {                                                   \
-            switch(op) {                                                       \
-                case OP_ADD:                                                   \
-                    for(i = 0; i < num_itr; i++) {                             \
-                        if(i == metric_info->warmup)                           \
-                            start = perf_shmemx_wtime();                       \
+        switch(op) {                                                           \
+            case OP_SET:                                                       \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_set(                         \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
                                                                                \
+                        shmem_quiet();                                         \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
                         for(j = 0; j < metric_info->window_size; j++)          \
-                            shmem_##NAME##_atomic_add(                         \
+                            shmem_##NAME##_atomic_set(                         \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
+                                                                               \
+                        shmem_quiet();                                         \
+                    }                                                          \
+                    end = perf_shmemx_wtime();                                 \
+                }                                                              \
+            break;                                                             \
+            case OP_AND:                                                       \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_and(                         \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
+                                                                               \
+                        shmem_quiet();                                         \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_and(                         \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
+                                                                               \
+                        shmem_quiet();                                         \
+                    }                                                          \
+                    end = perf_shmemx_wtime();                                 \
+                }                                                              \
+            break;                                                             \
+            case OP_OR:                                                        \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_or(                          \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
+                                                                               \
+                        shmem_quiet();                                         \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_or(                          \
                                 (TYPE *)(metric_info->dest), ONE, dest);       \
                                                                                \
                         shmem_quiet();                                         \
+                    }                                                          \
+                    end = perf_shmemx_wtime();                                 \
+                }                                                              \
+            break;                                                             \
+            case OP_XOR:                                                       \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_xor(                         \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
                                                                                \
+                        shmem_quiet();                                         \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_xor(                         \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
+                                                                               \
+                        shmem_quiet();                                         \
                     }                                                          \
                     end = perf_shmemx_wtime();                                 \
-                break;                                                         \
-                case OP_INC:                                                   \
-                    for(i = 0; i < num_itr; i++) {                             \
-                        if(i == metric_info->warmup)                           \
-                            start = perf_shmemx_wtime();                       \
+                }                                                              \
+            break;                                                             \
+            case OP_ADD:                                                       \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_add(                         \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
                                                                                \
+                        shmem_quiet();                                         \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_add(                         \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
+                                                                               \
+                        shmem_quiet();                                         \
+                    }                                                          \
+                    end = perf_shmemx_wtime();                                 \
+                }                                                              \
+            break;                                                             \
+            case OP_INC:                                                       \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
                         for(j = 0; j < metric_info->window_size; j++)          \
                             shmem_##NAME##_atomic_inc(                         \
                                 (TYPE *)(metric_info->dest), dest);            \
                                                                                \
                         shmem_quiet();                                         \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_inc(                         \
+                                (TYPE *)(metric_info->dest), dest);            \
                                                                                \
+                        shmem_quiet();                                         \
                     }                                                          \
                     end = perf_shmemx_wtime();                                 \
-                break;                                                         \
-                case OP_FADD:                                                  \
-                    for(i = 0; i < num_itr; i++) {                             \
-                        if(i == metric_info->warmup)                           \
-                            start = perf_shmemx_wtime();                       \
-                                                                               \
+                }                                                              \
+            break;                                                             \
+            case OP_FETCH:                                                     \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_fetch(                       \
+                                (TYPE *)(metric_info->dest), dest);            \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_fetch(                       \
+                                (TYPE *)(metric_info->dest), dest);            \
+                    }                                                          \
+                    end = perf_shmemx_wtime();                                 \
+                }                                                              \
+            break;                                                             \
+            case OP_FAND:                                                      \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_fetch_and(                   \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_fetch_and(                   \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
+                    }                                                          \
+                    end = perf_shmemx_wtime();                                 \
+                }                                                              \
+            break;                                                             \
+            case OP_FOR:                                                       \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_fetch_or(                    \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_fetch_or(                    \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
+                    }                                                          \
+                    end = perf_shmemx_wtime();                                 \
+                }                                                              \
+            break;                                                             \
+            case OP_FXOR:                                                      \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_fetch_xor(                   \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_fetch_xor(                   \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
+                    }                                                          \
+                    end = perf_shmemx_wtime();                                 \
+                }                                                              \
+            break;                                                             \
+            case OP_FADD:                                                      \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_fetch_add(                   \
+                                (TYPE *)(metric_info->dest), ONE, dest);       \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
                         for(j = 0; j < metric_info->window_size; j++)          \
                             shmem_##NAME##_atomic_fetch_add(                   \
                                 (TYPE *)(metric_info->dest), ONE, dest);       \
                     }                                                          \
                     end = perf_shmemx_wtime();                                 \
-                break;                                                         \
-                case OP_FINC:                                                  \
-                    for(i = 0; i < num_itr; i++) {                             \
-                        if(i == metric_info->warmup)                           \
-                            start = perf_shmemx_wtime();                       \
-                                                                               \
+                }                                                              \
+            break;                                                             \
+            case OP_FINC:                                                      \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_fetch_inc(                   \
+                                (TYPE *)(metric_info->dest), dest);            \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
                         for(j = 0; j < metric_info->window_size; j++)          \
                             shmem_##NAME##_atomic_fetch_inc(                   \
                                 (TYPE *)(metric_info->dest), dest);            \
                     }                                                          \
                     end = perf_shmemx_wtime();                                 \
-                break;                                                         \
-                case OP_SWAP:                                                  \
-                    for(i = 0; i < num_itr; i++) {                             \
-                        if(i == metric_info->warmup)                           \
-                            start = perf_shmemx_wtime();                       \
-                                                                               \
+                }                                                              \
+            break;                                                             \
+            case OP_SWAP:                                                      \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_swap(                        \
+                                (TYPE *)(metric_info->src), ONE, dest);        \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
                         for(j = 0; j < metric_info->window_size; j++)          \
                             shmem_##NAME##_atomic_swap(                        \
                                 (TYPE *)(metric_info->src), ONE, dest);        \
                     }                                                          \
                     end = perf_shmemx_wtime();                                 \
-                break;                                                         \
-                case OP_CSWAP:                                                 \
-                    for(i = 0; i < num_itr; i++) {                             \
-                        if(i == metric_info->warmup)                           \
-                            start = perf_shmemx_wtime();                       \
-                                                                               \
+                }                                                              \
+            break;                                                             \
+            case OP_CSWAP:                                                     \
+                if(snode) {                                                    \
+                    for(i = 0; i < metric_info->warmup; i++) {                 \
+                        for(j = 0; j < metric_info->window_size; j++)          \
+                            shmem_##NAME##_atomic_compare_swap(                \
+                                (TYPE *)(metric_info->src), dest, ONE, dest);  \
+                    }                                                          \
+                }                                                              \
+                shmem_barrier_all();                                           \
+                if(snode) {                                                    \
+                    start = perf_shmemx_wtime();                               \
+                    for(i = 0; i < metric_info->trials; i++) {                 \
                         for(j = 0; j < metric_info->window_size; j++)          \
                             shmem_##NAME##_atomic_compare_swap(                \
                                 (TYPE *)(metric_info->src), dest, ONE, dest);  \
                     }                                                          \
                     end = perf_shmemx_wtime();                                 \
-                break;                                                         \
-                default:                                                       \
-                    fprintf(stderr, "Error %d not a valid op case              \
+                }                                                              \
+            break;                                                             \
+            default:                                                           \
+                fprintf(stderr, "Error %d not a valid op case                  \
                                                 for atomics\n", op);           \
-                break;                                                         \
-            }                                                                  \
-            calc_and_print_results(end, start, len, *metric_info);          \
+            break;                                                             \
+        }                                                                      \
+        if(snode) {                                                            \
+            calc_and_print_results(end, start, len, metric_info);             \
         }                                                                      \
     } while(0)
 
 #define NUM_INC 100
 
+static const char * atomic_op_names [] = { "fetch", "set", "cswap", "swap", "finc", "inc",
+                                           "fadd", "add", "fand", "and", "for", "or",
+                                           "fxor", "xor" };
 
-typedef enum {
-    OP_ADD,
-    OP_INC,
-    OP_FADD,
-    OP_FINC,
-    OP_SWAP,
-    OP_CSWAP,
-    SIZE_OF_OP
-} atomic_op_type;
-
-static const char * op_names [] = { "add", "inc", "fadd", "finc", "swap", "cswap" };
 
-static inline void bw_set_metric_info_len(perf_metrics_t *metric_info)
+static inline void bw_set_metric_info_len(perf_metrics_t * const metric_info)
 {
-    unsigned int atomic_sizes[ATOMICS_N_DTs] = {sizeof(int), sizeof(long),
-                                        sizeof(long long)};
-    metric_info->cstyle = ATOMIC_COMM_STYLE;
-    metric_info->type = UNI_DIR;
-    int snode = streaming_node(*metric_info);
-    atomic_op_type op_type = OP_ADD;
+    unsigned int atomic_sizes[ATOMICS_N_DTs] = {sizeof(unsigned int), sizeof(unsigned long),
+                                        sizeof(unsigned long long)};
+    metric_info->b_type = UNI_DIR;
+    int snode = streaming_node(metric_info);
+    atomic_op_type op_type = OP_FETCH;
 
-    for(op_type = OP_ADD; op_type < SIZE_OF_OP; op_type++) {
-        if(metric_info->my_node == 0) 
-            printf("\nshmem_%s\n", op_names[op_type]);
+    for(op_type = OP_FETCH; op_type < SIZE_OF_OP; op_type++) {
+        if(metric_info->my_node == 0) { 
+            printf("\nshmem_%s\n", atomic_op_names[op_type]);
+            printf("-----------\n");
+        }
 
         metric_info->start_len = atomic_sizes[0];
         metric_info->max_len = atomic_sizes[0];
@@ -163,25 +373,25 @@ static inline void bw_set_metric_info_len(perf_metrics_t *metric_info)
 
         shmem_barrier_all();
 
-        uni_bw(atomic_sizes[0], metric_info, snode, int, int, op_type);
+        uni_bw(atomic_sizes[0], metric_info, snode, uint, unsigned int, op_type);
 
         metric_info->start_len = atomic_sizes[1];
         metric_info->max_len = atomic_sizes[1];
 
         shmem_barrier_all();
 
-        uni_bw(atomic_sizes[1], metric_info, snode, long, long, op_type);
+        uni_bw(atomic_sizes[1], metric_info, snode, ulong, unsigned long, op_type);
 
         metric_info->start_len = atomic_sizes[2];
         metric_info->max_len = atomic_sizes[2];
 
         shmem_barrier_all();
 
-        uni_bw(atomic_sizes[2], metric_info, snode, longlong, long long, op_type);
+        uni_bw(atomic_sizes[2], metric_info, snode, ulonglong, unsigned long long, op_type);
     }
 }
 
-void uni_dir_bw(int len, perf_metrics_t *metric_info)
+void uni_dir_bw(int len, perf_metrics_t * const metric_info)
 {
     bw_set_metric_info_len(metric_info);
 }
diff --git a/test/performance/shmem_perf_suite/shmem_bw_get_perf.c b/test/performance/shmem_perf_suite/shmem_bw_get_perf.c
index 9c4d832..105558b 100644
--- a/test/performance/shmem_perf_suite/shmem_bw_get_perf.c
+++ b/test/performance/shmem_perf_suite/shmem_bw_get_perf.c
@@ -33,7 +33,6 @@
 **
 **  -by default megabytes/second results
 **
-**NOTE: this test assumes correctness of reduction algorithm
 */
 
 #include <bw_common.h>
@@ -47,7 +46,7 @@ int main(int argc, char *argv[])
 }  /* end of main() */
 
 void
-uni_dir_bw(int len, perf_metrics_t *metric_info)
+uni_dir_bw(int len, perf_metrics_t * const metric_info)
 {
     uni_bw_get(len, metric_info);
 }
diff --git a/test/performance/shmem_perf_suite/shmem_bw_put_ctx_perf.c b/test/performance/shmem_perf_suite/shmem_bw_put_ctx_perf.c
index c4faaee..598da06 100644
--- a/test/performance/shmem_perf_suite/shmem_bw_put_ctx_perf.c
+++ b/test/performance/shmem_perf_suite/shmem_bw_put_ctx_perf.c
@@ -31,7 +31,6 @@
 **
 **  -by default megabytes/second results
 **
-**NOTE: this test assumes correctness of reduction algorithm
 */
 
 #define ENABLE_OPENMP
@@ -47,7 +46,7 @@ int main(int argc, char *argv[])
 }
 
 void
-uni_dir_bw(int len, perf_metrics_t *metric_info)
+uni_dir_bw(int len, perf_metrics_t * const metric_info)
 {
-    uni_bw_ctx(len, metric_info, !streaming_node(*metric_info));
+    uni_bw_ctx(len, metric_info, streaming_node(metric_info));
 }
diff --git a/test/performance/shmem_perf_suite/shmem_bw_put_perf.c b/test/performance/shmem_perf_suite/shmem_bw_put_perf.c
index 3cbab8f..ca51571 100644
--- a/test/performance/shmem_perf_suite/shmem_bw_put_perf.c
+++ b/test/performance/shmem_perf_suite/shmem_bw_put_perf.c
@@ -33,7 +33,6 @@
 **
 **  -by default megabytes/second results
 **
-**NOTE: this test assumes correctness of reduction algorithm
 */
 #include <bw_common.h>
 #include <uni_dir.h>
@@ -46,7 +45,7 @@ int main(int argc, char *argv[])
 }
 
 void
-uni_dir_bw(int len, perf_metrics_t *metric_info)
+uni_dir_bw(int len, perf_metrics_t * const metric_info)
 {
     uni_bw_put(len, metric_info);
 }
diff --git a/test/performance/shmem_perf_suite/shmem_latency_get_ctx_perf.c b/test/performance/shmem_perf_suite/shmem_latency_get_ctx_perf.c
new file mode 100644
index 0000000..6556f20
--- /dev/null
+++ b/test/performance/shmem_perf_suite/shmem_latency_get_ctx_perf.c
@@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2018 Intel Corporation. All rights reserved.
+ *  This software is available to you under the BSD license below:
+ *
+ * *	Redistribution and use in source and binary forms, with or
+ *	without modification, are permitted provided that the following
+ *	conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**
+**  This is a latency test for get.
+**
+**  Features of Test: latency using contexts driven by
+**  multiple threads.
+**
+**  -in micro seconds
+**
+*/
+
+#define ENABLE_OPENMP
+
+#include <latency_common.h>
+#include <latency_ctx.h>
+
+int main(int argc, char *argv[])
+{
+    latency_main_ctx(argc, argv, STYLE_GET);
+
+    return 0;
+}
+
+void
+streaming_latency(int len, perf_metrics_t * const metric_info)
+{
+    streaming_get_latency_ctx(len, metric_info, streaming_node(metric_info));
+}
diff --git a/test/performance/shmem_perf_suite/shmem_latency_get_perf.c b/test/performance/shmem_perf_suite/shmem_latency_get_perf.c
index a7ce4a8..523b19f 100644
--- a/test/performance/shmem_perf_suite/shmem_latency_get_perf.c
+++ b/test/performance/shmem_perf_suite/shmem_latency_get_perf.c
@@ -42,49 +42,65 @@
 int main(int argc, char *argv[])
 {
 
-    latency_main(argc, argv);
+    latency_main(argc, argv, STYLE_GET);
 
     return 0;
 }  /* end of main() */
 
 
 void
-long_element_round_trip_latency(perf_metrics_t data)
+long_element_round_trip_latency(perf_metrics_t * const data)
 {
+#ifndef USE_NONBLOCKING_API
     long_element_round_trip_latency_get(data);
+#endif
 }
 
 void
-int_element_latency(perf_metrics_t data)
+int_element_latency(perf_metrics_t * const data)
 {
+#ifndef USE_NONBLOCKING_API
     int_g_latency(data);
+#endif
 }
 
 void
-streaming_latency(int len, perf_metrics_t *data)
+streaming_latency(int len, perf_metrics_t * const metric_info)
 {
     double start = 0.0;
     double end = 0.0;
     unsigned long int i = 0;
-    static int print_once = 0;
-    if(!print_once && data->my_node == GET_IO_NODE) {
-        printf("\nStreaming results for %d trials each of length %d through %d in"\
-              " powers of %d\n", data->trials, data->start_len,
-              data->max_len, data->inc);
-        print_results_header();
-        print_once++;
+    int dest = partner_node(metric_info);
+    int receiver = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true;
+    static int check_once = 0;
+
+    if (!check_once) {
+        /* check to see whether sender and receiver are the same process */
+        if (dest == metric_info->my_node) {
+            fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n",
+                             dest);
+        }
+        /* hostname validation for all sender and receiver processes */
+        int status = check_hostname_validation(metric_info);
+        if (status != 0) return;
+        check_once++;
     }
 
-    if (data->my_node == 0) {
+    shmem_barrier_all();
+    if (receiver) {
 
-        for (i = 0; i < data->trials + data->warmup; i++) {
-            if(i == data->warmup)
+        for (i = 0; i < metric_info->trials + metric_info->warmup; i++) {
+            if(i == metric_info->warmup)
                 start = perf_shmemx_wtime();
-
-            shmem_getmem(data->dest, data->src, len, 1);
+#ifdef USE_NONBLOCKING_API
+            shmem_getmem_nbi(metric_info->dest, metric_info->src, len, dest);
+            shmem_quiet();
+#else
+            shmem_getmem(metric_info->dest, metric_info->src, len, dest);
+#endif
         }
         end = perf_shmemx_wtime();
 
-        calc_and_print_results(start, end, len, *data);
+        calc_and_print_results(start, end, len, metric_info);
     }
 } /* latency/bw for one-way trip */
diff --git a/test/performance/shmem_perf_suite/shmem_latency_nb_get_perf.c b/test/performance/shmem_perf_suite/shmem_latency_nb_get_perf.c
deleted file mode 100644
index 7346580..0000000
--- a/test/performance/shmem_perf_suite/shmem_latency_nb_get_perf.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- *  Copyright (c) 2018 Intel Corporation. All rights reserved.
- *  This software is available to you under the BSD license below:
- *
- *      Redistribution and use in source and binary forms, with or
- *      without modification, are permitted provided that the following
- *      conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
-**
-**  Notice: micro benchmark ~ two nodes only
-**
-**  Features of Test:
-**  1) small get latency test
-**  2) getmem latency test to calculate latency of various sizes
-**
-*/
-
-#include <latency_common.h>
-
-int main(int argc, char *argv[])
-{
-    latency_main(argc, argv);
-
-    return 0;
-}  /* end of main() */
-
-/* NO-OP for non-blocking */
-void
-long_element_round_trip_latency(perf_metrics_t data) {}
-
-void
-int_element_latency(perf_metrics_t data) {}
-
-void
-streaming_latency(int len, perf_metrics_t *data)
-{
-    double start = 0.0;
-    double end = 0.0;
-    unsigned long int i = 0;
-    static int print_once = 0;
-    if(!print_once && data->my_node == GET_IO_NODE) {
-        printf("\nStreaming results for %d trials each of length %d through %d in"\
-              " powers of %d\n", data->trials, data->start_len,
-              data->max_len, data->inc);
-        print_results_header();
-        print_once++;
-    }
-
-    if (data->my_node == 0) {
-
-        for (i = 0; i < data->trials + data->warmup; i++) {
-            if(i == data->warmup)
-                start = perf_shmemx_wtime();
-
-            shmem_getmem_nbi(data->dest, data->src, len, 1);
-            shmem_quiet();
-        }
-        end = perf_shmemx_wtime();
-
-        calc_and_print_results(start, end, len, *data);
-    }
-} /* latency/bw for one-way trip */
diff --git a/test/performance/shmem_perf_suite/shmem_latency_nb_put_perf.c b/test/performance/shmem_perf_suite/shmem_latency_nb_put_perf.c
deleted file mode 100644
index c9f4c3d..0000000
--- a/test/performance/shmem_perf_suite/shmem_latency_nb_put_perf.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- *  Copyright (c) 2018 Intel Corporation. All rights reserved.
- *  This software is available to you under the BSD license below:
- *
- *      Redistribution and use in source and binary forms, with or
- *      without modification, are permitted provided that the following
- *      conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
-**
-**  Notice: micro benchmark ~ two nodes only
-**
-**  Features of Test:
-**  1) small put pingpong latency test
-**  2) one sided latency test to calculate latency of various sizes
-**    to the network stack
-**
-*/
-
-#include <latency_common.h>
-
-int main(int argc, char *argv[])
-{
-    latency_main(argc, argv);
-
-    return 0;
-}  /* end of main() */
-
-
-/* NO-OP for non-blocking */
-void
-long_element_round_trip_latency(perf_metrics_t data) {}
-
-void
-int_element_latency(perf_metrics_t data) {}
-
-void
-streaming_latency(int len, perf_metrics_t *data)
-{
-    double start = 0.0;
-    double end = 0.0;
-    unsigned long int i = 0;
-    static int print_once = 0;
-    if(!print_once && data->my_node == PUT_IO_NODE) {
-        printf("\nStreaming results for %d trials each of length %d through %d in"\
-              " powers of %d\n", data->trials, data->start_len,
-              data->max_len, data->inc);
-        print_results_header();
-        print_once++;
-    }
-
-    /*puts to zero to match gets validation scheme*/
-    if (data->my_node == 1) {
-
-        for (i = 0; i < data->trials + data->warmup; i++) {
-            if(i == data->warmup)
-                start = perf_shmemx_wtime();
-
-            shmem_putmem_nbi(data->dest, data->src, len, 0);
-            shmem_quiet();
-
-        }
-        end = perf_shmemx_wtime();
-
-        calc_and_print_results(start, end, len, *data);
-    }
-} /* latency/bw for one-way trip */
diff --git a/test/performance/shmem_perf_suite/shmem_latency_put_ctx_perf.c b/test/performance/shmem_perf_suite/shmem_latency_put_ctx_perf.c
new file mode 100644
index 0000000..ffc6219
--- /dev/null
+++ b/test/performance/shmem_perf_suite/shmem_latency_put_ctx_perf.c
@@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2018 Intel Corporation. All rights reserved.
+ *  This software is available to you under the BSD license below:
+ *
+ * *	Redistribution and use in source and binary forms, with or
+ *	without modification, are permitted provided that the following
+ *	conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**
+**  This is a latency test for put.
+**
+**  Features of Test: latency using contexts driven by
+**  multiple threads.
+**
+**  -in micro seconds
+**
+*/
+
+#define ENABLE_OPENMP
+
+#include <latency_common.h>
+#include <latency_ctx.h>
+
+int main(int argc, char *argv[])
+{
+    latency_main_ctx(argc, argv, STYLE_PUT);
+
+    return 0;
+}
+
+void
+streaming_latency(int len, perf_metrics_t * const metric_info)
+{
+    streaming_put_latency_ctx(len, metric_info, streaming_node(metric_info));
+}
diff --git a/test/performance/shmem_perf_suite/shmem_latency_put_perf.c b/test/performance/shmem_perf_suite/shmem_latency_put_perf.c
index 97b2bd2..5cadc21 100644
--- a/test/performance/shmem_perf_suite/shmem_latency_put_perf.c
+++ b/test/performance/shmem_perf_suite/shmem_latency_put_perf.c
@@ -42,52 +42,67 @@
 
 int main(int argc, char *argv[])
 {
-    latency_main(argc, argv);
+    latency_main(argc, argv, STYLE_PUT);
 
     return 0;
 }  /* end of main() */
 
 
 void
-long_element_round_trip_latency(perf_metrics_t data)
+long_element_round_trip_latency(perf_metrics_t * const data)
 {
+#ifndef USE_NONBLOCKING_API
     long_element_round_trip_latency_put(data);
+#endif
 }
 
 void
-int_element_latency(perf_metrics_t data)
+int_element_latency(perf_metrics_t * const data)
 {
+#ifndef USE_NONBLOCKING_API
     int_p_latency(data);
+#endif
 }
 
 void
-streaming_latency(int len, perf_metrics_t *data)
+streaming_latency(int len, perf_metrics_t * const metric_info)
 {
     double start = 0.0;
     double end = 0.0;
     unsigned long int i = 0;
-    static int print_once = 0;
-    if(!print_once && data->my_node == PUT_IO_NODE) {
-        printf("\nStreaming results for %d trials each of length %d through %d in"\
-              " powers of %d\n", data->trials, data->start_len,
-              data->max_len, data->inc);
-        print_results_header();
-        print_once++;
+    int dest = partner_node(metric_info);
+    int sender = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true;
+    static int check_once = 0;
+
+    if (!check_once) {
+        /* check to see whether sender and receiver are the same process */
+        if (dest == metric_info->my_node) {
+            fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n",
+                             dest);
+        }
+        /* hostname validation for all sender and receiver processes */
+        int status = check_hostname_validation(metric_info);
+        if (status != 0) return;
+        check_once++;
     }
 
+    shmem_barrier_all();
     /*puts to zero to match gets validation scheme*/
-    if (data->my_node == 1) {
-
-        for (i = 0; i < data->trials + data->warmup; i++) {
-            if(i == data->warmup)
+    if (sender) {
+        for (i = 0; i < metric_info->trials + metric_info->warmup; i++) {
+            if(i == metric_info->warmup)
                 start = perf_shmemx_wtime();
 
-            shmem_putmem(data->dest, data->src, len, 0);
+#ifdef USE_NONBLOCKING_API
+            shmem_putmem_nbi(metric_info->dest, metric_info->src, len, dest);
+#else
+            shmem_putmem(metric_info->dest, metric_info->src, len, dest);
+#endif
             shmem_quiet();
 
         }
         end = perf_shmemx_wtime();
 
-        calc_and_print_results(start, end, len, *data);
+        calc_and_print_results(start, end, len, metric_info);
     }
 } /* latency/bw for one-way trip */
diff --git a/test/performance/shmem_perf_suite/target_put.h b/test/performance/shmem_perf_suite/target_put.h
index 223a457..cac4b62 100644
--- a/test/performance/shmem_perf_suite/target_put.h
+++ b/test/performance/shmem_perf_suite/target_put.h
@@ -25,37 +25,49 @@
  * SOFTWARE.
  */
 
-int static inline get_size_of_side(perf_metrics_t my_info) {
-    if(my_info.my_node < my_info.midpt)
-        return my_info.szinitiator;
+static inline int get_size_of_side(const perf_metrics_t * const my_info) {
+    if(my_info->my_node < my_info->midpt)
+        return my_info->szinitiator;
     else
-        return my_info.sztarget;
+        return my_info->sztarget;
 }
 
-int static inline get_num_partners(perf_metrics_t my_info) {
-    int unused_PEs = 0, num_partners = 0, num_xtra_partners = 0;
+static inline int get_size_of_other_side(const perf_metrics_t * const my_info) {
+    if(my_info->my_node < my_info->midpt)
+        return my_info->sztarget;
+    else
+        return my_info->szinitiator;
+}
+
+static inline int get_num_partners(perf_metrics_t * const my_info, int snode) {
+    int unused_PEs = 0, num_partners = 0;
     int active_PEs = get_size_of_side(my_info);
+    int other_side = get_size_of_other_side(my_info);
 
-    if(active_PEs == my_info.midpt)
+    if(active_PEs >= other_side) 
         return 1;
 
-    unused_PEs = my_info.midpt - active_PEs;
-    num_partners = my_info.midpt / active_PEs;
-    num_xtra_partners = unused_PEs % active_PEs;
+    num_partners = other_side / active_PEs;
+    unused_PEs = other_side % active_PEs;
 
-    if((my_info.my_node % my_info.midpt) < num_xtra_partners)
-        num_partners++;
+    if (snode) {
+        if((my_info->my_node % active_PEs) < unused_PEs)
+            num_partners++;
+    } else {
+        if(((my_info->my_node - my_info->midpt) % active_PEs) < unused_PEs)
+            num_partners++;
+    }
 
     return num_partners;
 }
 
 /* target only needs to know num of partners */
-int static inline *get_initiators_partners(perf_metrics_t my_info, int num_partners) {
-    int node_to_shadow = my_info.my_node;
+static inline int *get_initiators_partners(const perf_metrics_t * const my_info, int num_partners) {
+    int node_to_shadow = my_info->my_node;
     int i = 0;
     int *partner_nodes = NULL;
 
-    assert(my_info.cstyle == COMM_PAIRWISE && !target_node(my_info));
+    assert(my_info->cstyle == COMM_PAIRWISE && !target_node(my_info));
     if(num_partners < 1)
         return partner_nodes;
 
@@ -63,37 +75,40 @@ int static inline *get_initiators_partners(perf_metrics_t my_info, int num_partn
     assert(partner_nodes);
 
     for(i = 0; i < num_partners; i++) {
-        partner_nodes[i] = ((node_to_shadow % my_info.sztarget) + my_info.midpt);
-        node_to_shadow += my_info.szinitiator;
+        partner_nodes[i] = ((node_to_shadow % my_info->sztarget) + my_info->midpt);
+        node_to_shadow += my_info->szinitiator;
     }
 
     return partner_nodes;
 }
 
-void static inline target_data_uni_bw(int len, perf_metrics_t metric_info)
+static inline void target_data_uni_bw(int len, perf_metrics_t * const metric_info)
 {
     double start = 0.0, end = 0.0;
     int i = 0;
-    unsigned long int j = 0;
-    int snode = (metric_info.num_pes != 1)? streaming_node(metric_info) : true;
-    int num_partners = get_num_partners(metric_info);
+    unsigned long int j, k;
+    int snode = (metric_info->num_pes != 1)? streaming_node(metric_info) : true;
+    int num_partners = get_num_partners(metric_info, snode);
     static int completion_signal = 0;
     int *my_PE_partners = (snode ?
         get_initiators_partners(metric_info, num_partners): NULL);
 
+    metric_info->num_partners = num_partners;
     shmem_barrier_all();
     if (target_node(metric_info)) {
         shmem_int_wait_until(&completion_signal, SHMEM_CMP_EQ, num_partners);
     } else if (snode) {
         for (i = 0; i < num_partners; i++) {
-            for(j = 0; j < metric_info.warmup; j++) {
+            for(j = 0; j < metric_info->warmup; j++) {
+                for(k = 0; k < metric_info->window_size; k++) {
 #ifdef USE_NONBLOCKING_API
-                shmem_putmem_nbi(metric_info.dest, metric_info.src, len, my_PE_partners[i]);
+                    shmem_putmem_nbi(metric_info->dest, metric_info->src, len, my_PE_partners[i]);
 #else
-                shmem_putmem(metric_info.dest, metric_info.src, len, my_PE_partners[i]);
+                    shmem_putmem(metric_info->dest, metric_info->src, len, my_PE_partners[i]);
 #endif
+                }
+                shmem_quiet();
             }
-            shmem_quiet();
             shmem_int_atomic_inc(&completion_signal, my_PE_partners[i]);
         }
     }
@@ -106,35 +121,29 @@ void static inline target_data_uni_bw(int len, perf_metrics_t metric_info)
         shmem_int_wait_until(&completion_signal, SHMEM_CMP_EQ, num_partners);
     } else if (snode) {
         for (i = 0; i < num_partners; i++) {
-            for(j = 0; j < metric_info.trials; j++) {
+            for(j = 0; j < metric_info->trials; j++) {
+                for(k = 0; k < metric_info->window_size; k++) {
 #ifdef USE_NONBLOCKING_API
-                shmem_putmem_nbi(metric_info.dest, metric_info.src, len, my_PE_partners[i]);
+                    shmem_putmem_nbi(metric_info->dest, metric_info->src, len, my_PE_partners[i]);
 #else
-                shmem_putmem(metric_info.dest, metric_info.src, len, my_PE_partners[i]);
+                    shmem_putmem(metric_info->dest, metric_info->src, len, my_PE_partners[i]);
 #endif
+                }
+                shmem_quiet();
             }
-            shmem_quiet();
             shmem_int_atomic_inc(&completion_signal, my_PE_partners[i]);
         }
     }
 
-    shmem_barrier_all();
     if (snode || target_node(metric_info)) {
         end = perf_shmemx_wtime();
         calc_and_print_results(end, start, len, metric_info);
     }
+    completion_signal = 0;
     free(my_PE_partners);
 }
 
-void static inline target_bw_itr(int len, perf_metrics_t *metric_info)
+static inline void target_bw_itr(int len, perf_metrics_t * const metric_info)
 {
-    target_data_uni_bw(len, *metric_info);
-
-    metric_info->start_len = TARGET_SZ_MAX;
-    len = TARGET_SZ_MAX;
-
-    target_data_uni_bw(len, *metric_info);
-
-    /* stopping upper layer from iterating, we are done */
-    metric_info->max_len = TARGET_SZ_MIN;
+    target_data_uni_bw(len, metric_info);
 }
diff --git a/test/performance/shmem_perf_suite/uni_dir.h b/test/performance/shmem_perf_suite/uni_dir.h
index c47aef5..d238077 100644
--- a/test/performance/shmem_perf_suite/uni_dir.h
+++ b/test/performance/shmem_perf_suite/uni_dir.h
@@ -26,15 +26,20 @@
  */
 #include <target_put.h>
 
-void static inline uni_bw_put(int len, perf_metrics_t *metric_info)
+static inline void uni_bw_put(int len, perf_metrics_t *metric_info)
 {
     double start = 0.0, end = 0.0;
     unsigned long int i = 0, j = 0;
-    int dest = partner_node(*metric_info);
-    int snode = (metric_info->num_pes != 1)? streaming_node(*metric_info) : true;
+    int dest = partner_node(metric_info);
+    int snode = (metric_info->num_pes != 1)? streaming_node(metric_info) : true;
     static int check_once = 0;
     static int fin = -1;
 
+    if(metric_info->target_data) {
+        target_bw_itr(len, metric_info);
+        return;
+    }
+
     if (!check_once) {
         /* check to see whether sender and receiver are the same process */
         if (dest == metric_info->my_node) {
@@ -42,16 +47,11 @@ void static inline uni_bw_put(int len, perf_metrics_t *metric_info)
                              dest);
         }
         /* hostname validation for all sender and receiver processes */
-        int status = check_hostname_validation(*metric_info);
+        int status = check_hostname_validation(metric_info);
         if (status != 0) return;
         check_once++;
     }
 
-    if(metric_info->target_data) {
-        target_bw_itr(len, metric_info);
-        return;
-    }
-
     shmem_barrier_all();
 
     if (snode) {
@@ -83,22 +83,27 @@ void static inline uni_bw_put(int len, perf_metrics_t *metric_info)
         shmem_int_p(&fin, 1, dest);
         shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 0);
         end = perf_shmemx_wtime();
-        calc_and_print_results(end, start, len, *metric_info);
+        calc_and_print_results(end, start, len, metric_info);
     } else {
         shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 1);
         shmem_int_p(&fin, 0, dest);
     }
 }
 
-void static inline uni_bw_get(int len, perf_metrics_t *metric_info)
+static inline void uni_bw_get(int len, perf_metrics_t *metric_info)
 {
     double start = 0.0, end = 0.0;
     unsigned long int i = 0, j = 0;
-    int dest = partner_node(*metric_info);
-    int snode = (metric_info->num_pes != 1)? streaming_node(*metric_info) : true;
+    int dest = partner_node(metric_info);
+    int snode = (metric_info->num_pes != 1) ? streaming_node(metric_info) : true;
     static int check_once = 0;
     static int fin = -1;
 
+    if(metric_info->target_data) {
+        target_bw_itr(len, metric_info);
+        return;
+    }
+
     if (!check_once) {
         /* check to see whether sender and receiver are the same process */
         if (dest == metric_info->my_node) {
@@ -106,16 +111,11 @@ void static inline uni_bw_get(int len, perf_metrics_t *metric_info)
                              dest);
         }
         /* hostname validation for all sender and receiver processes */
-        int status = check_hostname_validation(*metric_info);
+        int status = check_hostname_validation(metric_info);
         if (status != 0) return;
         check_once++;
     }
 
-    if(metric_info->target_data) {
-        target_bw_itr(len, metric_info);
-        return;
-    }
-
     shmem_barrier_all();
 
     if (snode) {
@@ -154,7 +154,7 @@ void static inline uni_bw_get(int len, perf_metrics_t *metric_info)
         shmem_int_p(&fin, 1, dest);
         shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 0);
         end = perf_shmemx_wtime();
-        calc_and_print_results(end, start, len, *metric_info);
+        calc_and_print_results(end, start, len, metric_info);
     } else {
         shmem_int_wait_until(&fin, SHMEM_CMP_EQ, 1);
         shmem_int_p(&fin, 0, dest);
diff --git a/test/performance/shmem_perf_suite/uni_dir_ctx.h b/test/performance/shmem_perf_suite/uni_dir_ctx.h
index fd46960..211906f 100644
--- a/test/performance/shmem_perf_suite/uni_dir_ctx.h
+++ b/test/performance/shmem_perf_suite/uni_dir_ctx.h
@@ -25,26 +25,22 @@
 * SOFTWARE.
 */
 
-
-void static inline uni_bw_ctx(int len, perf_metrics_t *metric_info,
-        int streaming_node)
+static inline 
+void uni_bw_ctx(int len, perf_metrics_t *metric_info, int streaming_node)
 {
     double start = 0.0, end = 0.0;
-    int j = 0;
-    int dest = partner_node(*metric_info);
-    char *src = aligned_buffer_alloc(metric_info->nthreads * len);
-    char *dst = aligned_buffer_alloc(metric_info->nthreads * len);
-    assert(src && dst);
+    unsigned long int i, j;
+    int dest = partner_node(metric_info);
     static int check_once = 0;
 
     if (!check_once) {
         /* check to see whether sender and receiver are the same process */
         if (dest == metric_info->my_node) {
-            fprintf(stderr, "Warning: Sender and receiver are the same process (%d)\n", 
-                             dest);
+            fprintf(stderr, "Warning: Sender and receiver are the same "
+                            "process (%d)\n", dest);
         }
         /* hostname validation for all sender and receiver processes */
-        int status = check_hostname_validation(*metric_info);
+        int status = check_hostname_validation(metric_info);
         if (status != 0) return;
         check_once++;
     }
@@ -52,10 +48,9 @@ void static inline uni_bw_ctx(int len, perf_metrics_t *metric_info,
     shmem_barrier_all();
 
     if (streaming_node) {
-#pragma omp parallel default(none) firstprivate(len, dest) private(j) \
-	shared(metric_info, src, dst, start, end) num_threads(metric_info->nthreads)
+#pragma omp parallel default(none) firstprivate(len, dest) private(i, j) \
+shared(metric_info, start, end) num_threads(metric_info->nthreads)
         {
-            int i;
             const int thread_id = omp_get_thread_num();
             shmem_ctx_t ctx;
             shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx);
@@ -63,9 +58,11 @@ void static inline uni_bw_ctx(int len, perf_metrics_t *metric_info,
             for (i = 0; i < metric_info->warmup; i++) {
                 for (j = 0; j < metric_info->window_size; j++) {
 #ifdef USE_NONBLOCKING_API
-                    shmem_ctx_putmem_nbi(ctx, dst + thread_id * len, src + thread_id * len, len, dest);
+                    shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, 
+                                         metric_info->src + thread_id * len, len, dest);
 #else
-                    shmem_ctx_putmem(ctx, dst + thread_id * len, src + thread_id * len, len, dest);
+                    shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, 
+                                     metric_info->src + thread_id * len, len, dest);
 #endif
                 }
                 shmem_ctx_quiet(ctx);
@@ -76,10 +73,9 @@ void static inline uni_bw_ctx(int len, perf_metrics_t *metric_info,
 
     shmem_barrier_all();
     if (streaming_node) {
-#pragma omp parallel default(none) firstprivate(len, dest) private(j) \
-        shared(metric_info, src, dst, start, end) num_threads(metric_info->nthreads)
+#pragma omp parallel default(none) firstprivate(len, dest) private(i, j) \
+shared(metric_info, start, end) num_threads(metric_info->nthreads)
         {
-            int i;
             const int thread_id = omp_get_thread_num();
             shmem_ctx_t ctx;
             shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx);
@@ -93,9 +89,11 @@ void static inline uni_bw_ctx(int len, perf_metrics_t *metric_info,
             for (i = 0; i < metric_info->trials; i++) {
                 for (j = 0; j < metric_info->window_size; j++) {
 #ifdef USE_NONBLOCKING_API
-                    shmem_ctx_putmem_nbi(ctx, dst + thread_id * len, src + thread_id * len, len, dest);
+                    shmem_ctx_putmem_nbi(ctx, metric_info->dest + thread_id * len, 
+                                         metric_info->src + thread_id * len, len, dest);
 #else
-                    shmem_ctx_putmem(ctx, dst + thread_id * len, src + thread_id * len, len, dest);
+                    shmem_ctx_putmem(ctx, metric_info->dest + thread_id * len, 
+                                     metric_info->src + thread_id * len, len, dest);
 #endif
                 }
                 shmem_ctx_quiet(ctx);
@@ -107,12 +105,8 @@ void static inline uni_bw_ctx(int len, perf_metrics_t *metric_info,
     shmem_barrier_all();
     if (streaming_node) {
         end = perf_shmemx_wtime();
-        calc_and_print_results(end, start, len, *metric_info);
+        calc_and_print_results(end, start, len, metric_info);
     }
 
     shmem_barrier_all();
-
-    aligned_buffer_free(src);
-    aligned_buffer_free(dst);
-
 }
diff --git a/test/shmemx/Makefile.am b/test/shmemx/Makefile.am
index 091a7b9..2133897 100644
--- a/test/shmemx/Makefile.am
+++ b/test/shmemx/Makefile.am
@@ -26,7 +26,8 @@ endif
 if HAVE_PTHREADS
 if SHMEMX_TESTS
 check_PROGRAMS += \
-	gettid_register
+	gettid_register \
+	perf_counter
 endif
 endif
 
diff --git a/test/shmemx/gettid_register.c b/test/shmemx/gettid_register.c
index 2e7c82d..4697dfd 100644
--- a/test/shmemx/gettid_register.c
+++ b/test/shmemx/gettid_register.c
@@ -46,7 +46,13 @@ pthread_key_t key;
 static uint64_t my_gettid(void) {
     uint64_t tid_val = 0;
 
-    tid_val = * (uint64_t*) pthread_getspecific(key);
+    void* ret =  pthread_getspecific(key);
+    if (ret != NULL)
+        tid_val = * (uint64_t*) ret;
+    else  {
+        printf("Calling pthread_getspecific(key) returned NULL\n");
+        shmem_global_exit(3);
+    }
 
     return tid_val;
 }
diff --git a/test/shmemx/perf_counter.c b/test/shmemx/perf_counter.c
new file mode 100644
index 0000000..e6ee36f
--- /dev/null
+++ b/test/shmemx/perf_counter.c
@@ -0,0 +1,122 @@
+/*
+ *  Copyright (c) 2018 Intel Corporation. All rights reserved.
+ *  This software is available to you under the BSD license below:
+ *
+ *      Redistribution and use in source and binary forms, with or
+ *      without modification, are permitted provided that the following
+ *      conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Single-threaded test for validation of performance counter APIs
+*/
+
+#include <stdio.h>
+#include <inttypes.h>
+#include <shmem.h>
+#include <shmemx.h>
+
+#define ITER 100
+#define WINDOW 64
+#define LENGTH 1024
+
+int me, npes;
+char *src_array, *dest_array;
+uint64_t c_put, c_get, p_put, p_get, target;
+
+static void collect(shmem_ctx_t ctx) {
+    shmemx_pcntr_get_completed_write(ctx, &c_put);
+    shmemx_pcntr_get_completed_read(ctx, &c_get);
+    shmemx_pcntr_get_completed_target(&target);
+    shmemx_pcntr_get_issued_write(ctx, &p_put);
+    shmemx_pcntr_get_issued_read(ctx, &p_get);
+}
+
+static void put_and_progress_check(void) {
+    int i, j;
+    int partner = ((npes % 2 == 0) ? (me % 2 == 0 ? me + 1 : me - 1) :
+                                     (me % 2 != 0 ? me - 1 :
+                                     (me == npes - 1) ? me : me + 1));
+
+    shmem_ctx_t ctx;
+    shmemx_pcntr_t pcntr;
+    shmem_ctx_create(SHMEM_CTX_PRIVATE, &ctx);
+
+    for (i = 0; i < ITER; i++) {
+        for (j = 0; j < WINDOW; j++) {
+            shmem_ctx_putmem_nbi(ctx, dest_array, src_array, LENGTH, partner);
+            collect(ctx);
+        }
+        shmem_ctx_quiet(ctx);
+    }
+
+    shmemx_pcntr_get_all(ctx, &pcntr);
+    shmem_ctx_destroy(ctx);
+
+    /* Report the counter values observed through get_all API after the loop
+     * completion. Except the target counter, other counter values should
+     * reflect the final expected value */
+    printf("Value observed of the performance counters from combined API: \n"
+           "Completed Put = %10"PRIu64"\n"
+           "Completed Get = %10"PRIu64"\n"
+           "Issued Put    = %10"PRIu64"\n"
+           "Issued Get    = %10"PRIu64"\n"
+           "Target        = %10"PRIu64"\n"
+           , pcntr.completed_put, pcntr.completed_get, pcntr.pending_put,
+           pcntr.pending_get, pcntr.target);
+
+    return;
+}
+
+int main(int argc, char **argv) {
+
+    shmem_init();
+
+    me = shmem_my_pe();
+    npes = shmem_n_pes();
+
+    src_array = shmem_malloc(LENGTH);
+    dest_array = shmem_malloc(LENGTH);
+
+    if (me == 0) {
+        printf("Performance counter API test with %d PEs\n", npes);
+    }
+
+    put_and_progress_check();
+    shmem_barrier_all();
+
+    /* Report the counter values observed through single parameter APIs in
+     * the final iteration. The values reported here may be less than the actual
+     * final value as they are captured before the barrier one counter at a time
+     * */
+    printf("Final value observed of the performance counters from individual APIs: \n"
+           "Completed Put = %10"PRIu64"\n"
+           "Completed Get = %10"PRIu64"\n"
+           "Issued Put    = %10"PRIu64"\n"
+           "Issued Get    = %10"PRIu64"\n"
+           "Target        = %10"PRIu64"\n"
+           , c_put, c_get, p_put, p_get, target);
+
+    shmem_free(dest_array);
+    shmem_free(src_array);
+
+    shmem_finalize();
+    return 0;
+}
diff --git a/test/unit/Makefile.am b/test/unit/Makefile.am
index d7a7580..7649c6d 100644
--- a/test/unit/Makefile.am
+++ b/test/unit/Makefile.am
@@ -61,7 +61,6 @@ check_PROGRAMS = \
 	lfinc \
 	shmem_info \
 	query_thread \
-	global_exit \
 	asym_alloc \
 	set_fetch \
 	alltoall \
@@ -99,6 +98,12 @@ check_PROGRAMS = \
 	many-ctx \
 	shmem_test
 
+# Temporarily disabled: Global exit test tends to fail with MPI-PMI
+if !USE_PMI_MPI
+check_PROGRAMS += \
+	global_exit
+endif
+
 if ENABLE_PROFILING
 check_PROGRAMS += \
 	rma_coverage_pshmem
@@ -182,9 +187,9 @@ rma_coverage_pshmem_CFLAGS = -DTEST_PSHMEM
 query_thread_funneled_SOURCES = query_thread.c
 query_thread_funneled_CFLAGS = -DENABLE_THREADS
 
-mt_a2a_SOURCES = mt_a2a.c pthread_barrier.h
+mt_a2a_SOURCES = mt_a2a.c
 mt_a2a_LDFLAGS = $(AM_LDFLAGS) $(PTHREAD_LIBS)
-mt_a2a_CFLAGS = -I$(top_srcdir)/test/unit $(PTHREAD_CFLAGS)
+mt_a2a_CFLAGS = -I$(top_srcdir)/test/include $(PTHREAD_CFLAGS)
 mt_a2a_LDADD = $(LDADD) $(PTHREAD_CFLAGS)
 
 mt_contention_LDFLAGS = $(AM_LDFLAGS) $(PTHREAD_LIBS)
@@ -197,11 +202,11 @@ mt_lock_test_CFLAGS = $(PTHREAD_CFLAGS)
 mt_lock_test_LDADD = $(LDADD) $(PTHREAD_CFLAGS)
 
 mt_membar_LDFLAGS = $(AM_LDFLAGS) $(PTHREAD_LIBS)
-mt_membar_CFLAGS = -I$(top_srcdir)/test/unit $(PTHREAD_CFLAGS)
+mt_membar_CFLAGS = -I$(top_srcdir)/test/include $(PTHREAD_CFLAGS)
 mt_membar_LDADD = $(LDADD) $(PTHREAD_CFLAGS)
 
 threading_LDFLAGS = $(AM_LDFLAGS) $(PTHREAD_LIBS)
-threading_CFLAGS = -I$(top_srcdir)/test/unit $(PTHREAD_CFLAGS)
+threading_CFLAGS = -I$(top_srcdir)/test/include $(PTHREAD_CFLAGS)
 threading_LDADD = $(LDADD) $(PTHREAD_CFLAGS)
 
 web_LDFLAGS = $(AM_LDFLAGS) $(PTHREAD_LIBS)
diff --git a/test/unit/reduce_active_set.c b/test/unit/reduce_active_set.c
index b307342..1a05788 100644
--- a/test/unit/reduce_active_set.c
+++ b/test/unit/reduce_active_set.c
@@ -58,7 +58,7 @@ int main(void)
 
     for (i = 0; i < SHMEM_REDUCE_SYNC_SIZE; i++) {
         max_psync[i] = SHMEM_SYNC_VALUE;
-        max_psync[i] = SHMEM_SYNC_VALUE;
+        min_psync[i] = SHMEM_SYNC_VALUE;
     }
 
     if (me == 0)