From 87cc55c1a5c70a034cd9d2838a4552c7715fde3a Mon Sep 17 00:00:00 2001
From: ksang <kaitoy@qq.com>
Date: Tue, 16 May 2017 17:34:18 +0800
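Subject: [PATCH 1/3] remove duplicate definition in tests

Comment out the no-op profiling stubs in tests/test_utils.h (the
`#ifndef USE_PROF` ... `#endif` block that defines `struct prof`,
prof_enabled(), prof_disable(), prof_reset(), etc.) by wrapping the
whole block in /* ... */.

Also strip trailing whitespace in gds_wmb().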
---
 tests/test_utils.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_utils.h b/tests/test_utils.h
index 618ffac..707ef26 100644
--- a/tests/test_utils.h
+++ b/tests/test_utils.h
@@ -1,5 +1,5 @@
 #pragma once
-
+/*
 #ifndef USE_PROF
 struct prof { };
 
@@ -14,7 +14,7 @@ static inline int  prof_enabled(struct prof *p) { return 0; }
 static inline void prof_disable(struct prof *p) {}
 static inline void prof_reset(struct prof *p) {}
 #endif
-
+*/
 typedef int64_t gds_us_t;
 static inline gds_us_t gds_get_time_us()
 {
@@ -43,9 +43,9 @@ static void gds_cpu_relax(void)
 }
 
 static void gds_wmb(void) __attribute__((unused)) ;
-static void gds_wmb(void) 
+static void gds_wmb(void)
 {
-	asm volatile("sync") ; 
+	asm volatile("sync") ;
 }
 #else
 #error "platform not supported"

From b6b2313f73840b8f02908e1ce9c2c9437a532368 Mon Sep 17 00:00:00 2001
From: ksang <kaitoy@qq.com>
Date: Thu, 18 May 2017 18:07:28 +0800
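Subject: [PATCH 2/3] make tests work with RoCE

If ibv_create_ah() fails with the original attributes, retry with a
global routing header enabled (is_global = 1, hop_limit = 1,
sgid_index = 0) in gds_kernel_latency and gds_kernel_loopback_latency.

In pp_connect_ctx() the destination GID for the retry is obtained with
ibv_query_gid() on the local port at GID index 0; in main() of
gds_kernel_latency.c it is taken from my_dest.gid.

Also strip trailing whitespace in both files.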
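---
NOTE(review): the retry path in main() of gds_kernel_latency.c sets
ah_attr.grh.dgid to my_dest.gid (the local GID) although rem_dest has
already been selected from all_dest[]; confirm whether rem_dest->gid was
intended for the two-process case. The lines added in that hunk are also
indented with a mix of spaces and tabs, unlike the surrounding code.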
 tests/gds_kernel_latency.c          | 56 ++++++++++++++++++++---------
 tests/gds_kernel_loopback_latency.c | 42 ++++++++++++++--------
 2 files changed, 67 insertions(+), 31 deletions(-)

diff --git a/tests/gds_kernel_latency.c b/tests/gds_kernel_latency.c
index 7641280..6929641 100644
--- a/tests/gds_kernel_latency.c
+++ b/tests/gds_kernel_latency.c
@@ -1,9 +1,9 @@
 /*
  * GPUDirect Async latency benchmark
- * 
+ *
  *
  * based on OFED libibverbs ud_pingpong test.
- * minimally changed to use MPI for bootstrapping, 
+ * minimally changed to use MPI for bootstrapping,
  */
 /*
  * Copyright (c) 2005 Topspin Communications.  All rights reserved.
@@ -193,8 +193,22 @@ static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn,
 
 	ctx->ah = ibv_create_ah(ctx->pd, &ah_attr);
 	if (!ctx->ah) {
-		fprintf(stderr, "Failed to create AH\n");
-		return 1;
+		union ibv_gid dgid;
+		if (ibv_query_gid(ctx->context, port, 0, &dgid)) {
+			fprintf(stderr, "Failed to query interface gid\n");
+			return 1;
+		}
+
+		ah_attr.is_global = 1;
+		ah_attr.grh.hop_limit = 1;
+		ah_attr.grh.dgid = dgid;
+		ah_attr.grh.sgid_index = 0;
+
+		ctx->ah = ibv_create_ah(ctx->pd, &ah_attr);
+		if (!ctx->ah) {
+			fprintf(stderr, "Failed to create AH\n");
+			return 1;
+		}
 	}
 
 	return 0;
@@ -249,7 +263,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
 
         ctx->rx_flag =  memalign(page_size, alloc_size);
         if (!ctx->rx_flag) {
-                fprintf(stderr, "Couldn't allocate rx_flag buf\n");  
+                fprintf(stderr, "Couldn't allocate rx_flag buf\n");
                 goto clean_ctx;
         }
 
@@ -367,8 +381,8 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
 
 clean_buffer:
 	if (ctx->gpu_id >= 0)
-		gpu_free(ctx->buf); 
-	else 
+		gpu_free(ctx->buf);
+	else
 		free(ctx->buf);
 
 clean_ctx:
@@ -408,8 +422,8 @@ int pp_close_ctx(struct pingpong_context *ctx)
 	}
 
 	if (ctx->gpu_id >= 0)
-		gpu_free(ctx->buf); 
-	else 
+		gpu_free(ctx->buf);
+	else
 		free(ctx->buf);
 
 	if (ctx->gpu_id >= 0)
@@ -509,7 +523,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
 
         posted_recv = pp_post_recv(ctx, n_posts);
         if (posted_recv < 0) {
-                fprintf(stderr,"ERROR: can't post recv (%d) n_posts=%d is_client=%d\n", 
+                fprintf(stderr,"ERROR: can't post recv (%d) n_posts=%d is_client=%d\n",
                         posted_recv, n_posts, is_client);
                 exit(EXIT_FAILURE);
                 return 0;
@@ -518,7 +532,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
                 if (!posted_recv)
                         return 0;
         }
-        
+
         PROF(&prof, prof_idx++);
 
 	for (i = 0; i < posted_recv; ++i) {
@@ -630,7 +644,7 @@ int main(int argc, char *argv[])
         MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &comm_size));
         MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &my_rank));
 
-        if (comm_size != 2) { 
+        if (comm_size != 2) {
                 fprintf(stderr, "this test requires exactly two processes \n");
                 MPI_Abort(MPI_COMM_WORLD, -1);
         }
@@ -785,7 +799,7 @@ int main(int argc, char *argv[])
         MPI_CHECK(MPI_Get_processor_name(hostnames[my_rank], &name_len));
         assert(name_len < MPI_MAX_PROCESSOR_NAME);
 
-        MPI_CHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, 
+        MPI_CHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                 hostnames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, MPI_COMM_WORLD));
 
         if (my_rank == 1) {
@@ -815,7 +829,7 @@ int main(int argc, char *argv[])
 
         if (!ib_devname) {
                 // old env var, for compatibility
-                const char *value = getenv("USE_IB_HCA"); 
+                const char *value = getenv("USE_IB_HCA");
                 if (value != NULL) {
                         printf("[%d] USE_IB_HCA: <%s>\n", my_rank, value);
                         ib_devname = value;
@@ -876,7 +890,7 @@ int main(int argc, char *argv[])
 
 	struct pingpong_dest all_dest[4] = {{0,}};
         all_dest[my_rank] = my_dest;
-        MPI_CHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, 
+        MPI_CHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                 all_dest, sizeof(all_dest[0]), MPI_CHAR, MPI_COMM_WORLD));
         rem_dest = &all_dest[my_rank?0:1];
 	inet_ntop(AF_INET6, &rem_dest->gid, gid, sizeof gid);
@@ -918,8 +932,16 @@ int main(int argc, char *argv[])
 
                 ctx->ah = ibv_create_ah(ctx->pd, &ah_attr);
                 if (!ctx->ah) {
-                        fprintf(stderr, "Failed to create AH\n");
-                        return 1;
+            		ah_attr.is_global = 1;
+            		ah_attr.grh.hop_limit = 1;
+            		ah_attr.grh.dgid = my_dest.gid;
+            		ah_attr.grh.sgid_index = 0;
+
+            		ctx->ah = ibv_create_ah(ctx->pd, &ah_attr);
+            		if (!ctx->ah) {
+            			fprintf(stderr, "Failed to create AH\n");
+            			return 1;
+            		}
                 }
 
         }
diff --git a/tests/gds_kernel_loopback_latency.c b/tests/gds_kernel_loopback_latency.c
index 34ab332..f67fbbd 100644
--- a/tests/gds_kernel_loopback_latency.c
+++ b/tests/gds_kernel_loopback_latency.c
@@ -1,6 +1,6 @@
 /*
  * GPUDirect Async loopback latency benchmark
- * 
+ *
  *
  * based on OFED libibverbs ud_pingpong test.
  */
@@ -177,8 +177,22 @@ static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn,
 
 	ctx->ah = ibv_create_ah(ctx->pd, &ah_attr);
 	if (!ctx->ah) {
-		fprintf(stderr, "Failed to create AH\n");
-		return 1;
+		union ibv_gid dgid;
+		if (ibv_query_gid(ctx->context, port, 0, &dgid)) {
+			fprintf(stderr, "Failed to query interface gid\n");
+			return 1;
+		}
+
+		ah_attr.is_global = 1;
+		ah_attr.grh.hop_limit = 1;
+		ah_attr.grh.dgid = dgid;
+		ah_attr.grh.sgid_index = 0;
+
+		ctx->ah = ibv_create_ah(ctx->pd, &ah_attr);
+		if (!ctx->ah) {
+			fprintf(stderr, "Failed to create AH\n");
+			return 1;
+		}
 	}
 
 	return 0;
@@ -198,7 +212,7 @@ static struct pingpong_dest *pp_client_exch_dest(const char *servername, int por
 	int sockfd = -1;
 	struct pingpong_dest *rem_dest = NULL;
 	char gid[33];
-	
+
 	fprintf(stderr, "%04x:%06x:%06x:%s\n", my_dest->lid, my_dest->qpn,
                 my_dest->psn, (char *)&my_dest->gid);
 	rem_dest = malloc(sizeof *rem_dest);
@@ -267,7 +281,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
 
         ctx->rx_flag =  memalign(page_size, alloc_size);
         if (!ctx->rx_flag) {
-                fprintf(stderr, "Couldn't allocate rx_flag buf\n");  
+                fprintf(stderr, "Couldn't allocate rx_flag buf\n");
                 goto clean_ctx;
         }
 
@@ -341,7 +355,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
                 },
                 .qp_type = IBV_QPT_UD,
         };
-	
+
 	//why?
         if (my_rank == 1) {
                 printf("sleeping 2s\n");
@@ -395,8 +409,8 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
 
 clean_buffer:
 	if (ctx->gpumem)
-		gpu_free(ctx->buf); 
-	else 
+		gpu_free(ctx->buf);
+	else
 		free(ctx->buf);
 
 clean_ctx:
@@ -436,8 +450,8 @@ int pp_close_ctx(struct pingpong_context *ctx)
 	}
 
 	if (ctx->gpumem)
-		gpu_free(ctx->buf); 
-	else 
+		gpu_free(ctx->buf);
+	else
 		free(ctx->buf);
 
 	if (ctx->gpu_id >= 0)
@@ -539,7 +553,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
 
         posted_recv = pp_post_recv(ctx, n_posts);
         if (posted_recv < 0) {
-                fprintf(stderr,"ERROR: can't post recv (%d) n_posts=%d is_client=%d\n", 
+                fprintf(stderr,"ERROR: can't post recv (%d) n_posts=%d is_client=%d\n",
                         posted_recv, n_posts, is_client);
                 exit(EXIT_FAILURE);
                 return 0;
@@ -548,7 +562,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
                 if (!posted_recv)
                         return 0;
         }
-        
+
         PROF(&prof, prof_idx++);
 
 	for (i = 0; i < posted_recv; ++i) {
@@ -897,7 +911,7 @@ int main(int argc, char *argv[])
 	inet_ntop(AF_INET6, &my_dest.gid, gid, sizeof gid);
 	printf("  local address:  LID 0x%04x, QPN 0x%06x, PSN 0x%06x: GID %s\n",
 	       my_dest.lid, my_dest.qpn, my_dest.psn, gid);
-	
+
         rem_dest = pp_client_exch_dest(servername, port, &my_dest);
 
 	if (!rem_dest) {
@@ -1153,7 +1167,7 @@ int main(int argc, char *argv[])
         prof_destroy(&prof);
 
 	//ibv_ack_cq_events(ctx->cq, num_cq_events);
-	
+
 
 	return 0;
 

From 83457e356b1ecc73645d8fa4904d0ac92e35aff8 Mon Sep 17 00:00:00 2001
From: Ying Zhi <zhi.ying@emc.com>
Date: Fri, 19 May 2017 13:38:24 +0800
Subject: [PATCH 3/3] Revert "remove duplicate definition in tests"

This reverts commit 87cc55c1a5c70a034cd9d2838a4552c7715fde3a.
---
 tests/test_utils.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_utils.h b/tests/test_utils.h
index 707ef26..618ffac 100644
--- a/tests/test_utils.h
+++ b/tests/test_utils.h
@@ -1,5 +1,5 @@
 #pragma once
-/*
+
 #ifndef USE_PROF
 struct prof { };
 
@@ -14,7 +14,7 @@ static inline int  prof_enabled(struct prof *p) { return 0; }
 static inline void prof_disable(struct prof *p) {}
 static inline void prof_reset(struct prof *p) {}
 #endif
-*/
+
 typedef int64_t gds_us_t;
 static inline gds_us_t gds_get_time_us()
 {
@@ -43,9 +43,9 @@ static void gds_cpu_relax(void)
 }
 
 static void gds_wmb(void) __attribute__((unused)) ;
-static void gds_wmb(void)
+static void gds_wmb(void) 
 {
-	asm volatile("sync") ;
+	asm volatile("sync") ; 
 }
 #else
 #error "platform not supported"