Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix gatherv overflow when the combined packed mesh size exceeds 2^31. #117

Merged
merged 10 commits into from
Aug 6, 2024
77 changes: 54 additions & 23 deletions src/mergemesh_pmmg.c
Original file line number Diff line number Diff line change
Expand Up @@ -1098,10 +1098,10 @@ int PMMG_gather_parmesh( PMMG_pParMesh parmesh,
int **rcv_next_node_comm,
PMMG_pExt_comm **rcv_ext_node_comm ) {

size_t pack_size_tot,next_disp;
int *rcv_pack_size,ier,ier_glob,k,*displs,ier_pack;
size_t pack_size_tot,next_disp,*displs,buf_idx;
int *rcv_pack_size,ier,ier_glob,k,ier_pack;
int nprocs,root,pack_size;
char *rcv_buffer,*buffer,*ptr;
char *rcv_buffer,*ptr_to_free,*buffer;

nprocs = parmesh->nprocs;
root = parmesh->info.root;
Expand All @@ -1120,7 +1120,7 @@ int PMMG_gather_parmesh( PMMG_pParMesh parmesh,
/** 1: Memory alloc */
if ( parmesh->myrank == root ) {
PMMG_MALLOC( parmesh, rcv_pack_size ,nprocs,int,"rcv_pack_size",ier=0);
PMMG_MALLOC( parmesh, displs ,nprocs,int,"displs for gatherv",ier=0);
PMMG_MALLOC( parmesh, displs ,nprocs,size_t,"displs for gatherv",ier=0);
PMMG_CALLOC( parmesh, (*rcv_grps) ,nprocs,PMMG_Grp,"rcv_grps",ier=0);
PMMG_MALLOC( parmesh, (*rcv_int_node_comm) ,nprocs,PMMG_Int_comm,"rcv_int_comm" ,ier=0);
PMMG_MALLOC( parmesh, (*rcv_next_node_comm),nprocs,int,"rcv_next_comm" ,ier=0);
Expand All @@ -1144,26 +1144,33 @@ int PMMG_gather_parmesh( PMMG_pParMesh parmesh,
if ( parmesh->myrank == root ) {
displs[0] = 0;
for ( k=1; k<nprocs; ++k ) {
assert ( displs[k-1] <= INT_MAX - rcv_pack_size[k-1] && "INT_MAX overflow");
next_disp = displs[k-1] + rcv_pack_size[k-1];
if(next_disp>INT_MAX){
/* The displacements argument to MPI_Gatherv() is an array of int
* (signed) so the number of elements must be smaller than 2^31.
* To get around this we must pack more data in a single element
* or use multiple messages.
*/
fprintf(stderr, " ## Error: too many elements for MPI_Gatherv()\n");
MPI_Abort(parmesh->comm, 1); /* error detected only on root */
}
displs[k] = next_disp;
}

/* On root, we will gather all the meshes in rcv_buffer so we have to
* compute the total pack size */
pack_size_tot = (size_t)(displs[nprocs-1])+(size_t)(rcv_pack_size[nprocs-1]);
assert ( pack_size_tot < SIZE_MAX && "SIZE_MAX overflow" );
PMMG_MALLOC( parmesh,rcv_buffer,pack_size_tot,char,"rcv_buffer",ier=0);

/* root will write directly in the suitable position of rcv_buffer */
buf_idx = displs[root];
}
else {
/* on ranks other than root we just need to store the local mesh so buffer
* will be of size pack_size */
pack_size_tot = pack_size;
/* we will write the mesh at the starting position */
buf_idx = 0;
}

PMMG_MALLOC( parmesh,rcv_buffer,pack_size_tot,char,"rcv_buffer",ier=0);

/* Parmesh compression */
PMMG_MALLOC ( parmesh,buffer,pack_size,char,"buffer to send",ier=0 );
buffer = &rcv_buffer[buf_idx];

/* Save the originally allocated address to avoid errors at deallocation */
ptr_to_free = rcv_buffer;

#ifndef NDEBUG
/* Remark: in release mode, a non allocated buffer used in gatherv creates a
Expand All @@ -1174,17 +1181,40 @@ int PMMG_gather_parmesh( PMMG_pParMesh parmesh,
}
#endif

ptr = buffer;
/* /!\ mpipack_parmesh and mpiunpack_parmesh are modifying the buffer pointer
* making it not valid for realloc / unalloc */

/* Save the address of buffer because it will be set to the end of the char array
by the \a PMMG_mpipack_parmesh function */
char *buffer_to_send = buffer;
ier_pack = PMMG_mpipack_parmesh ( parmesh ,&buffer );

/* Do not use \a buffer pointer after this call: it points toward the end of
* the packed array which is useless */
buffer = NULL;

assert ( ier_pack );

/* Gather the packed parmeshes */
ier = MG_MIN ( ier, ier_pack );
MPI_CHECK( MPI_Gatherv ( ptr,pack_size,MPI_CHAR,
rcv_buffer,rcv_pack_size,
displs,MPI_CHAR,root,parmesh->comm ),ier=0 );

PMMG_DEL_MEM(parmesh,ptr,char,"buffer to send");
/* Here the gatherv call has been replaced by a send/recv to avoid errors when
* displacements overflow the INT_MAX value */
if (parmesh->myrank == root) {
int i;
for ( i = 0; i < nprocs; ++i ) {
if ( i != root ) {
MPI_CHECK(
MPI_Recv(rcv_buffer + displs[i], rcv_pack_size[i], MPI_CHAR, i,
MPI_MERGEMESH_TAG, parmesh->comm, MPI_STATUS_IGNORE),
ier = 0);
}
}
} else {
MPI_CHECK(
MPI_Send(buffer_to_send, pack_size, MPI_CHAR, root, MPI_MERGEMESH_TAG,parmesh->comm),
ier = 0);
}

/** 4: Unpack parmeshes */
#ifndef NDEBUG
Expand All @@ -1195,7 +1225,6 @@ int PMMG_gather_parmesh( PMMG_pParMesh parmesh,
#endif

if ( parmesh->myrank == root ) {
ptr = rcv_buffer;
for ( k=0; k<nprocs; ++k ) {
ier_pack = PMMG_mpiunpack_parmesh ( parmesh,(*rcv_grps),k,(*rcv_int_node_comm)+k,
(*rcv_next_node_comm)+k,(*rcv_ext_node_comm)+k,
Expand All @@ -1208,7 +1237,9 @@ int PMMG_gather_parmesh( PMMG_pParMesh parmesh,
/* Free temporary arrays */
PMMG_DEL_MEM(parmesh,rcv_pack_size,int,"rcv_pack_size");
PMMG_DEL_MEM(parmesh,displs,int,"displs");
PMMG_DEL_MEM(parmesh,ptr ,char,"rcv_buffer");
/* the address of rcv_buffer is modified by packing/unpacking, so the initially
* allocated address stored in ptr_to_free must be passed to the unalloc macro */
PMMG_DEL_MEM(parmesh,ptr_to_free,char,"rcv_buffer");

return ier;
}
Expand Down
2 changes: 1 addition & 1 deletion src/mpi_pmmg.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
#define MPI_TRANSFER_GRP_TAG 8000
#define MPI_COMMUNICATORS_REF_TAG 9000
#define MPI_ANALYS_TAG 10000

#define MPI_MERGEMESH_TAG 11000

#define MPI_CHECK(func_call,on_failure) do { \
int mpi_ret_val; \
Expand Down
2 changes: 2 additions & 0 deletions src/mpipack_pmmg.c
Original file line number Diff line number Diff line change
Expand Up @@ -1132,6 +1132,8 @@ int PMMG_mpipack_grp ( PMMG_pGrp grp,char **buffer ) {
* buffer pointer at the end of the written area. The parmesh groups must have
* been merged before entering this function.
*
* \remark the \a buffer pointer is modified (shifted); thus, after this
* function, it cannot be used for deallocation anymore.
*/
int PMMG_mpipack_parmesh ( PMMG_pParMesh parmesh ,char **buffer ) {
PMMG_pGrp grp;
Expand Down
3 changes: 3 additions & 0 deletions src/mpiunpack_pmmg.c
Original file line number Diff line number Diff line change
Expand Up @@ -1112,6 +1112,9 @@ int PMMG_mpiunpack_grp ( PMMG_pParMesh parmesh,PMMG_pGrp listgrp,int igrp,char *
* pointer toward a buffer of type "x". Then we can get the variable value by
* dereferencing the address of the buffer.
*
* \remark the \a buffer pointer is modified (shifted); thus, after this
* function, it cannot be used for deallocation anymore.
*
*/
int PMMG_mpiunpack_parmesh ( PMMG_pParMesh parmesh,PMMG_pGrp listgrp,int igrp,
PMMG_pInt_comm int_node_comm,
Expand Down