Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RoCE support for tests #13

Open
wants to merge 3 commits into
base: devel
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 39 additions & 17 deletions tests/gds_kernel_latency.c
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
/*
* GPUDirect Async latency benchmark
*
*
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please address the whitespace issues in a separate pull request?

*
* based on OFED libibverbs ud_pingpong test.
* minimally changed to use MPI for bootstrapping,
* minimally changed to use MPI for bootstrapping,
*/
/*
* Copyright (c) 2005 Topspin Communications. All rights reserved.
Expand Down Expand Up @@ -193,8 +193,22 @@ static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn,

ctx->ah = ibv_create_ah(ctx->pd, &ah_attr);
if (!ctx->ah) {
fprintf(stderr, "Failed to create AH\n");
return 1;
union ibv_gid dgid;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could add a comment describing this fallback code,
e.g. "this code is required for RoCE v1 and/or v2 because ..."

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ferasd could you please review this change?
For context, it is required to run this test in RoCE mode.

if (ibv_query_gid(ctx->context, port, 0, &dgid)) {
fprintf(stderr, "Failed to query interface gid\n");
return 1;
}

ah_attr.is_global = 1;
ah_attr.grh.hop_limit = 1;
ah_attr.grh.dgid = dgid;
ah_attr.grh.sgid_index = 0;

ctx->ah = ibv_create_ah(ctx->pd, &ah_attr);
if (!ctx->ah) {
fprintf(stderr, "Failed to create AH\n");
return 1;
}
}

return 0;
Expand Down Expand Up @@ -249,7 +263,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,

ctx->rx_flag = memalign(page_size, alloc_size);
if (!ctx->rx_flag) {
fprintf(stderr, "Couldn't allocate rx_flag buf\n");
fprintf(stderr, "Couldn't allocate rx_flag buf\n");
goto clean_ctx;
}

Expand Down Expand Up @@ -367,8 +381,8 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,

clean_buffer:
if (ctx->gpu_id >= 0)
gpu_free(ctx->buf);
else
gpu_free(ctx->buf);
else
free(ctx->buf);

clean_ctx:
Expand Down Expand Up @@ -408,8 +422,8 @@ int pp_close_ctx(struct pingpong_context *ctx)
}

if (ctx->gpu_id >= 0)
gpu_free(ctx->buf);
else
gpu_free(ctx->buf);
else
free(ctx->buf);

if (ctx->gpu_id >= 0)
Expand Down Expand Up @@ -509,7 +523,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin

posted_recv = pp_post_recv(ctx, n_posts);
if (posted_recv < 0) {
fprintf(stderr,"ERROR: can't post recv (%d) n_posts=%d is_client=%d\n",
fprintf(stderr,"ERROR: can't post recv (%d) n_posts=%d is_client=%d\n",
posted_recv, n_posts, is_client);
exit(EXIT_FAILURE);
return 0;
Expand All @@ -518,7 +532,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
if (!posted_recv)
return 0;
}

PROF(&prof, prof_idx++);

for (i = 0; i < posted_recv; ++i) {
Expand Down Expand Up @@ -630,7 +644,7 @@ int main(int argc, char *argv[])
MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &comm_size));
MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &my_rank));

if (comm_size != 2) {
if (comm_size != 2) {
fprintf(stderr, "this test requires exactly two processes \n");
MPI_Abort(MPI_COMM_WORLD, -1);
}
Expand Down Expand Up @@ -785,7 +799,7 @@ int main(int argc, char *argv[])
MPI_CHECK(MPI_Get_processor_name(hostnames[my_rank], &name_len));
assert(name_len < MPI_MAX_PROCESSOR_NAME);

MPI_CHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
MPI_CHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
hostnames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, MPI_COMM_WORLD));

if (my_rank == 1) {
Expand Down Expand Up @@ -815,7 +829,7 @@ int main(int argc, char *argv[])

if (!ib_devname) {
// old env var, for compatibility
const char *value = getenv("USE_IB_HCA");
const char *value = getenv("USE_IB_HCA");
if (value != NULL) {
printf("[%d] USE_IB_HCA: <%s>\n", my_rank, value);
ib_devname = value;
Expand Down Expand Up @@ -876,7 +890,7 @@ int main(int argc, char *argv[])

struct pingpong_dest all_dest[4] = {{0,}};
all_dest[my_rank] = my_dest;
MPI_CHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
MPI_CHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
all_dest, sizeof(all_dest[0]), MPI_CHAR, MPI_COMM_WORLD));
rem_dest = &all_dest[my_rank?0:1];
inet_ntop(AF_INET6, &rem_dest->gid, gid, sizeof gid);
Expand Down Expand Up @@ -918,8 +932,16 @@ int main(int argc, char *argv[])

ctx->ah = ibv_create_ah(ctx->pd, &ah_attr);
if (!ctx->ah) {
fprintf(stderr, "Failed to create AH\n");
return 1;
ah_attr.is_global = 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As before, please add a comment here.

ah_attr.grh.hop_limit = 1;
ah_attr.grh.dgid = my_dest.gid;
ah_attr.grh.sgid_index = 0;

ctx->ah = ibv_create_ah(ctx->pd, &ah_attr);
if (!ctx->ah) {
fprintf(stderr, "Failed to create AH\n");
return 1;
}
}

}
Expand Down
42 changes: 28 additions & 14 deletions tests/gds_kernel_loopback_latency.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* GPUDirect Async loopback latency benchmark
*
*
*
* based on OFED libibverbs ud_pingpong test.
*/
Expand Down Expand Up @@ -177,8 +177,22 @@ static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn,

ctx->ah = ibv_create_ah(ctx->pd, &ah_attr);
if (!ctx->ah) {
fprintf(stderr, "Failed to create AH\n");
return 1;
union ibv_gid dgid;
if (ibv_query_gid(ctx->context, port, 0, &dgid)) {
fprintf(stderr, "Failed to query interface gid\n");
return 1;
}

ah_attr.is_global = 1;
ah_attr.grh.hop_limit = 1;
ah_attr.grh.dgid = dgid;
ah_attr.grh.sgid_index = 0;

ctx->ah = ibv_create_ah(ctx->pd, &ah_attr);
if (!ctx->ah) {
fprintf(stderr, "Failed to create AH\n");
return 1;
}
}

return 0;
Expand All @@ -198,7 +212,7 @@ static struct pingpong_dest *pp_client_exch_dest(const char *servername, int por
int sockfd = -1;
struct pingpong_dest *rem_dest = NULL;
char gid[33];

fprintf(stderr, "%04x:%06x:%06x:%s\n", my_dest->lid, my_dest->qpn,
my_dest->psn, (char *)&my_dest->gid);
rem_dest = malloc(sizeof *rem_dest);
Expand Down Expand Up @@ -267,7 +281,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,

ctx->rx_flag = memalign(page_size, alloc_size);
if (!ctx->rx_flag) {
fprintf(stderr, "Couldn't allocate rx_flag buf\n");
fprintf(stderr, "Couldn't allocate rx_flag buf\n");
goto clean_ctx;
}

Expand Down Expand Up @@ -341,7 +355,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
},
.qp_type = IBV_QPT_UD,
};

//why?
if (my_rank == 1) {
printf("sleeping 2s\n");
Expand Down Expand Up @@ -395,8 +409,8 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,

clean_buffer:
if (ctx->gpumem)
gpu_free(ctx->buf);
else
gpu_free(ctx->buf);
else
free(ctx->buf);

clean_ctx:
Expand Down Expand Up @@ -436,8 +450,8 @@ int pp_close_ctx(struct pingpong_context *ctx)
}

if (ctx->gpumem)
gpu_free(ctx->buf);
else
gpu_free(ctx->buf);
else
free(ctx->buf);

if (ctx->gpu_id >= 0)
Expand Down Expand Up @@ -539,7 +553,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin

posted_recv = pp_post_recv(ctx, n_posts);
if (posted_recv < 0) {
fprintf(stderr,"ERROR: can't post recv (%d) n_posts=%d is_client=%d\n",
fprintf(stderr,"ERROR: can't post recv (%d) n_posts=%d is_client=%d\n",
posted_recv, n_posts, is_client);
exit(EXIT_FAILURE);
return 0;
Expand All @@ -548,7 +562,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
if (!posted_recv)
return 0;
}

PROF(&prof, prof_idx++);

for (i = 0; i < posted_recv; ++i) {
Expand Down Expand Up @@ -897,7 +911,7 @@ int main(int argc, char *argv[])
inet_ntop(AF_INET6, &my_dest.gid, gid, sizeof gid);
printf(" local address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x: GID %s\n",
my_dest.lid, my_dest.qpn, my_dest.psn, gid);

rem_dest = pp_client_exch_dest(servername, port, &my_dest);

if (!rem_dest) {
Expand Down Expand Up @@ -1153,7 +1167,7 @@ int main(int argc, char *argv[])
prof_destroy(&prof);

//ibv_ack_cq_events(ctx->cq, num_cq_events);


return 0;

Expand Down