Skip to content

Commit

Permalink
Merge pull request #117 from MmgTools/wissambouymedj-feature/fix-cent…
Browse files Browse the repository at this point in the history
…ralized-output

Fix gatherv overflow when the combined packed mesh size exceeds 2^31.
  • Loading branch information
Algiane authored Aug 6, 2024
2 parents babaf07 + 6dac366 commit 5cbeb4d
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 24 deletions.
77 changes: 54 additions & 23 deletions src/mergemesh_pmmg.c
Original file line number Diff line number Diff line change
Expand Up @@ -1098,10 +1098,10 @@ int PMMG_gather_parmesh( PMMG_pParMesh parmesh,
int **rcv_next_node_comm,
PMMG_pExt_comm **rcv_ext_node_comm ) {

size_t pack_size_tot,next_disp;
int *rcv_pack_size,ier,ier_glob,k,*displs,ier_pack;
size_t pack_size_tot,next_disp,*displs,buf_idx;
int *rcv_pack_size,ier,ier_glob,k,ier_pack;
int nprocs,root,pack_size;
char *rcv_buffer,*buffer,*ptr;
char *rcv_buffer,*ptr_to_free,*buffer;

nprocs = parmesh->nprocs;
root = parmesh->info.root;
Expand All @@ -1120,7 +1120,7 @@ int PMMG_gather_parmesh( PMMG_pParMesh parmesh,
/** 1: Memory alloc */
if ( parmesh->myrank == root ) {
PMMG_MALLOC( parmesh, rcv_pack_size ,nprocs,int,"rcv_pack_size",ier=0);
PMMG_MALLOC( parmesh, displs ,nprocs,int,"displs for gatherv",ier=0);
PMMG_MALLOC( parmesh, displs ,nprocs,size_t,"displs for gatherv",ier=0);
PMMG_CALLOC( parmesh, (*rcv_grps) ,nprocs,PMMG_Grp,"rcv_grps",ier=0);
PMMG_MALLOC( parmesh, (*rcv_int_node_comm) ,nprocs,PMMG_Int_comm,"rcv_int_comm" ,ier=0);
PMMG_MALLOC( parmesh, (*rcv_next_node_comm),nprocs,int,"rcv_next_comm" ,ier=0);
Expand All @@ -1144,26 +1144,33 @@ int PMMG_gather_parmesh( PMMG_pParMesh parmesh,
if ( parmesh->myrank == root ) {
displs[0] = 0;
for ( k=1; k<nprocs; ++k ) {
assert ( displs[k-1] <= INT_MAX - rcv_pack_size[k-1] && "INT_MAX overflow");
next_disp = displs[k-1] + rcv_pack_size[k-1];
if(next_disp>INT_MAX){
/* The displacements argument to MPI_Gatherv() is an array of int
* (signed) so the number of elements must be smaller than 2^31.
* To get around this we must pack more data in a single element
* or use multiple messages.
*/
fprintf(stderr, " ## Error: too many elements for MPI_Gatherv()\n");
MPI_Abort(parmesh->comm, 1); /* error detected only on root */
}
displs[k] = next_disp;
}

/* On root, we will gather all the meshes in rcv_buffer so we have to
* compute the total pack size */
pack_size_tot = (size_t)(displs[nprocs-1])+(size_t)(rcv_pack_size[nprocs-1]);
assert ( pack_size_tot < SIZE_MAX && "SIZE_MAX overflow" );
PMMG_MALLOC( parmesh,rcv_buffer,pack_size_tot,char,"rcv_buffer",ier=0);

/* root will write directly in the suitable position of rcv_buffer */
buf_idx = displs[root];
}
else {
/* on ranks other than root we just need to store the local mesh so buffer
* will be of size pack_size */
pack_size_tot = pack_size;
/* we will write the mesh at the starting position */
buf_idx = 0;
}

PMMG_MALLOC( parmesh,rcv_buffer,pack_size_tot,char,"rcv_buffer",ier=0);

/* Parmesh compression */
PMMG_MALLOC ( parmesh,buffer,pack_size,char,"buffer to send",ier=0 );
buffer = &rcv_buffer[buf_idx];

/* Save input allocated address to avoid errors at unalloc */
ptr_to_free = rcv_buffer;

#ifndef NDEBUG
/* Remark: in release mode, a non allocated buffer used in gatherv creates a
Expand All @@ -1174,17 +1181,40 @@ int PMMG_gather_parmesh( PMMG_pParMesh parmesh,
}
#endif

ptr = buffer;
/* /!\ mpipack_parmesh and mpiunpack_parmesh are modifying the buffer pointer
* making it not valid for realloc / unalloc */

/* Save the address of buffer because it will be moved to the end of the char
array by the \a PMMG_mpipack_parmesh function */
char *buffer_to_send = buffer;
ier_pack = PMMG_mpipack_parmesh ( parmesh ,&buffer );

/* Do not use \a buffer pointer after this call: it points toward the end of
* the packed array which is useless */
buffer = NULL;

assert ( ier_pack );

/* Gather the packed parmeshes */
ier = MG_MIN ( ier, ier_pack );
MPI_CHECK( MPI_Gatherv ( ptr,pack_size,MPI_CHAR,
rcv_buffer,rcv_pack_size,
displs,MPI_CHAR,root,parmesh->comm ),ier=0 );

PMMG_DEL_MEM(parmesh,ptr,char,"buffer to send");
/* Here the gatherv call has been replaced by a send/recv to avoid errors when
* displacements overflow the INT_MAX value */
if (parmesh->myrank == root) {
int i;
for ( i = 0; i < nprocs; ++i ) {
if ( i != root ) {
MPI_CHECK(
MPI_Recv(rcv_buffer + displs[i], rcv_pack_size[i], MPI_CHAR, i,
MPI_MERGEMESH_TAG, parmesh->comm, MPI_STATUS_IGNORE),
ier = 0);
}
}
} else {
MPI_CHECK(
MPI_Send(buffer_to_send, pack_size, MPI_CHAR, root, MPI_MERGEMESH_TAG,parmesh->comm),
ier = 0);
}

/** 4: Unpack parmeshes */
#ifndef NDEBUG
Expand All @@ -1195,7 +1225,6 @@ int PMMG_gather_parmesh( PMMG_pParMesh parmesh,
#endif

if ( parmesh->myrank == root ) {
ptr = rcv_buffer;
for ( k=0; k<nprocs; ++k ) {
ier_pack = PMMG_mpiunpack_parmesh ( parmesh,(*rcv_grps),k,(*rcv_int_node_comm)+k,
(*rcv_next_node_comm)+k,(*rcv_ext_node_comm)+k,
Expand All @@ -1208,7 +1237,9 @@ int PMMG_gather_parmesh( PMMG_pParMesh parmesh,
/* Free temporary arrays */
PMMG_DEL_MEM(parmesh,rcv_pack_size,int,"rcv_pack_size");
PMMG_DEL_MEM(parmesh,displs,int,"displs");
PMMG_DEL_MEM(parmesh,ptr ,char,"rcv_buffer");
/* the address of rcv_buffer is modified by packing/unpacking so it is needed
* to pass the initially allocated address, stored in ptr_to_free, to the unalloc macro */
PMMG_DEL_MEM(parmesh,ptr_to_free,char,"rcv_buffer");

return ier;
}
Expand Down
2 changes: 1 addition & 1 deletion src/mpi_pmmg.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
#define MPI_TRANSFER_GRP_TAG 8000
#define MPI_COMMUNICATORS_REF_TAG 9000
#define MPI_ANALYS_TAG 10000

#define MPI_MERGEMESH_TAG 11000

#define MPI_CHECK(func_call,on_failure) do { \
int mpi_ret_val; \
Expand Down
2 changes: 2 additions & 0 deletions src/mpipack_pmmg.c
Original file line number Diff line number Diff line change
Expand Up @@ -1132,6 +1132,8 @@ int PMMG_mpipack_grp ( PMMG_pGrp grp,char **buffer ) {
* buffer pointer at the end of the written area. The parmesh groups must have
* been merged before entering this function.
*
* \remark the \a buffer pointer is modified (shifted) thus, after this
* function, it cannot be used for deallocation anymore
*/
int PMMG_mpipack_parmesh ( PMMG_pParMesh parmesh ,char **buffer ) {
PMMG_pGrp grp;
Expand Down
3 changes: 3 additions & 0 deletions src/mpiunpack_pmmg.c
Original file line number Diff line number Diff line change
Expand Up @@ -1112,6 +1112,9 @@ int PMMG_mpiunpack_grp ( PMMG_pParMesh parmesh,PMMG_pGrp listgrp,int igrp,char *
* pointer toward a buffer of type "x". Then we can get the variable value by
* dereferencing the address of the buffer.
*
* \remark the \a buffer pointer is modified (shifted) thus, after this
* function, it cannot be used for deallocation anymore
*
*/
int PMMG_mpiunpack_parmesh ( PMMG_pParMesh parmesh,PMMG_pGrp listgrp,int igrp,
PMMG_pInt_comm int_node_comm,
Expand Down

0 comments on commit 5cbeb4d

Please sign in to comment.