Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: #48: vt and magistrate support #49

Draft
wants to merge 17 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,12 @@ set_property(TARGET resilience PROPERTY CXX_STANDARD ${Kokkos_CXX_STANDARD})
target_link_libraries(resilience PUBLIC Kokkos::kokkos)

option(KR_ENABLE_VELOC "use VeloC backend for automatic checkpointing" ON)
option(KR_ENABLE_STDFILE "use StdFile backend for automatic checkpointing" ON)
option(KR_ENABLE_STDFILE "use StdFile backend for automatic checkpointing" OFF)

option(KR_ENABLE_MAGISTRATE "use Magistrate for serializing and deserializing" OFF)
option(KR_ENABLE_RESILIENT_EXEC "enable resilient execution spaces" OFF)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I just noticed this is wrong; see KR_ENABLE_EXEC_SPACES below. I think I messed this up somewhere in the rebase.


option(KR_ENABLE_VT "use VT for backend coordination" OFF)

include(CMakeDependentOption)

Expand All @@ -55,6 +60,14 @@ if (KR_ENABLE_VELOC)
endif()
endif()

if (KR_ENABLE_VT)
find_package(vt REQUIRED)
target_link_libraries(resilience PUBLIC vt::runtime::vt)
target_compile_definitions(resilience PUBLIC KR_ENABLE_VT)

set(KR_ENABLE_MAGISTRATE ON)
endif()

# StdFile backend
if (KR_ENABLE_STDFILE)
target_compile_definitions(resilience PUBLIC KR_ENABLE_STDFILE)
Expand All @@ -70,6 +83,12 @@ if (KR_ENABLE_TRACING)
target_compile_definitions(resilience PUBLIC KR_ENABLE_TRACING)
endif()

if (KR_ENABLE_MAGISTRATE)
find_package(checkpoint REQUIRED)
target_link_libraries(resilience PUBLIC vt::lib::checkpoint)
target_compile_definitions(resilience PUBLIC KR_ENABLE_MAGISTRATE)
endif()

option( KR_ENABLE_STDIO "use stdio for manual checkpoint" OFF )
option( KR_ENABLE_HDF5 "add HDF5 support" OFF )
option( KR_ENABLE_HDF5_PARALLEL "use parallel version of HDF5" OFF )
Expand Down
8 changes: 7 additions & 1 deletion cmake/resilienceConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ SET(KR_ENABLE_HDF5 @KR_ENABLE_HDF5@)
SET(KR_ENABLE_VELOC @KR_ENABLE_VELOC@)

# VeloC needs to add a cmake config...
LIST(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/Modules/")
LIST(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/../Modules/" "${CMAKE_CURRENT_LIST_DIR}/../cmake/Modules/")
message(STATUS "Module path: ${CMAKE_MODULE_PATH}")

find_dependency(Kokkos REQUIRED NO_CMAKE_PACKAGE_REGISTRY HINTS @Kokkos_DIR@)
Expand All @@ -25,5 +25,11 @@ if (@KR_ENABLE_HDF5@)
find_dependency(HDF5 REQUIRED)
endif()

if (@KR_ENABLE_MAGISTRATE@)
set(CHECKPOINT_DIR @CHECKPOINT_DIR@)
find_dependency(checkpoint REQUIRED)
set(KR_ENABLE_MAGISTRATE @KR_ENABLE_MAGISTRATE@)
endif()

set(Boost_DIR @Boost_DIR@)
find_dependency(Boost REQUIRED)
19 changes: 14 additions & 5 deletions examples/SimpleCheckpoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@

#include <mpi.h>
#include <Kokkos_Core.hpp>
#include <resilience/Context.hpp>
#include <resilience/veloc/VelocBackend.hpp>
#include <resilience/AutomaticCheckpoint.hpp>
#include <resilience/Resilience.hpp>

using chkpt_view = Kokkos::Experimental::SubscribableViewHooks<KokkosResilience::DynamicViewHooksSubscriber>;

int
main( int argc, char **argv )
Expand All @@ -60,15 +60,24 @@ main( int argc, char **argv )
auto ctx = KokkosResilience::make_context( MPI_COMM_WORLD, "config.json" );

int dim0 = 5, dim1 = 5;
auto view = Kokkos::View< double ** >( "test_view", dim0, dim1 );
auto view = Kokkos::View< double **, chkpt_view>( "test_view", dim0, dim1 );

KokkosResilience::checkpoint( *ctx, "test_checkpoint", 0, [view, dim0, dim1]() {
Kokkos::parallel_for( dim0, KOKKOS_LAMBDA( int i ) {
for ( int j = 0; j < dim1; ++j )
view( i, j ) = 3.0;
} );
} );
});

for(int i = 0; i < dim0; i++){
for(int j = 0; j < dim1; j++){
if(view(i,j) != 3.0) {
fprintf(stderr, "Error: view(%d,%d) = %f, not %f\n", i, j, view(i,j), 3.0);
exit(1);
}
}
}
printf("Success!\n");
}
Kokkos::finalize();

Expand Down
27 changes: 21 additions & 6 deletions examples/SimpleFileCheckpoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,27 +46,42 @@
#endif

#include <Kokkos_Core.hpp>
#include <resilience/Context.hpp>
#include <resilience/stdfile/StdFileBackend.hpp>
#include <resilience/AutomaticCheckpoint.hpp>
#include <resilience/Resilience.hpp>
#include <mpi.h>

using chkpt_view = Kokkos::Experimental::SubscribableViewHooks<KokkosResilience::DynamicViewHooksSubscriber>;

int
main( int argc, char **argv )
{
MPI_Init( &argc, &argv );

Kokkos::initialize( argc, argv );
{
auto ctx = KokkosResilience::make_context( "checkpoint.data", "config_file.json" );
auto ctx = KokkosResilience::make_context( MPI_COMM_WORLD, "config_file.json" );

int dim0 = 5, dim1 = 5;
auto view = Kokkos::View< double ** >( "test_view", dim0, dim1 );
auto view = Kokkos::View< double **, chkpt_view>( "test_view", dim0, dim1 );

KokkosResilience::checkpoint( *ctx, "test_checkpoint", 0, [view, dim0, dim1]() {
Kokkos::parallel_for( dim0, KOKKOS_LAMBDA( int i ) {
for ( int j = 0; j < dim1; ++j )
view( i, j ) = 3.0;
} );
} );
}, [](int){return true;} );

for(int i = 0; i < dim0; i++){
for(int j = 0; j < dim1; j++){
if(view(i,j) != 3.0) {
fprintf(stderr, "Error: view(%d,%d) = %f, not %f\n", i, j, view(i,j), 3.0);
exit(1);
}
}
}
printf("Success!\n");

}
Kokkos::finalize();

MPI_Finalize();
}
2 changes: 1 addition & 1 deletion examples/benchmark_multiviews.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) {
wtime = MPI_Wtime();
std::size_t i = 1 + KokkosResilience::latest_version(*ctx, "test_kokkos");

while(i < nsteps) {
while(i < nsteps ) {

KokkosResilience::checkpoint(*ctx, "test_kokkos", i, [=]() { // Nic, tell me what should I put for []/

Expand Down
5 changes: 3 additions & 2 deletions examples/config_file.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
"backend": "stdfile",
"backends": {
"stdfile": {
"config": "file_test.cfg"
"directory": "./stdfile_chkpts/",
"filename_prefix": "simple_"
}
},
"filter": {
"type": "time",
"interval": 10
}
}
}
Loading