diff --git a/CMakeLists.txt b/CMakeLists.txt index 3204bae..fd623e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,15 @@ set_property(TARGET resilience PROPERTY CXX_STANDARD ${Kokkos_CXX_STANDARD}) target_link_libraries(resilience PUBLIC Kokkos::kokkos) option(KR_ENABLE_VELOC "use VeloC backend for automatic checkpointing" ON) -option(KR_ENABLE_STDFILE "use StdFile backend for automatic checkpointing" ON) +option(KR_ENABLE_STDFILE "use StdFile backend for automatic checkpointing" OFF) + +option(KR_ENABLE_MAGISTRATE "use Magistrate for serializing and deserializing" OFF) +option(KR_ENABLE_RESILIENT_EXEC "enable resilient execution spaces" OFF) + +option(KR_ENABLE_VT "use VT for backend coordination" OFF) + +option(KR_ENABLE_OPENMP "enable the resilient OpenMP execution space" OFF) +option(KR_ENABLE_CUDA "enable the resilient CUDA execution space" OFF) include(CMakeDependentOption) @@ -55,12 +63,23 @@ if (KR_ENABLE_VELOC) endif() endif() +if (KR_ENABLE_VT) + find_package(vt REQUIRED) + target_link_libraries(resilience PUBLIC vt::runtime::vt) + target_compile_definitions(resilience PUBLIC KR_ENABLE_VT) + + set(KR_ENABLE_MPI_BACKENDS ON) + set(KR_ENABLE_MAGISTRATE ON) +endif() + # StdFile backend if (KR_ENABLE_STDFILE) target_compile_definitions(resilience PUBLIC KR_ENABLE_STDFILE) endif() if (KR_ENABLE_MPI_BACKENDS) + find_package(MPI REQUIRED) + target_link_libraries(resilience PRIVATE MPI::MPI_CXX) target_compile_definitions(resilience PUBLIC KR_ENABLE_MPI_BACKENDS) endif() @@ -70,6 +89,12 @@ if (KR_ENABLE_TRACING) target_compile_definitions(resilience PUBLIC KR_ENABLE_TRACING) endif() +if (KR_ENABLE_MAGISTRATE) + find_package(checkpoint REQUIRED) + target_link_libraries(resilience PUBLIC vt::lib::checkpoint) + target_compile_definitions(resilience PUBLIC KR_ENABLE_MAGISTRATE) +endif() + option( KR_ENABLE_STDIO "use stdio for manual checkpoint" OFF ) option( KR_ENABLE_HDF5 "add HDF5 support" OFF ) option( KR_ENABLE_HDF5_PARALLEL "use parallel version 
of HDF5" OFF ) diff --git a/CMakePresets.json b/CMakePresets.json index 20303da..f655709 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -64,7 +64,8 @@ "KR_ENABLE_TESTS": "ON", "KR_ENABLE_EXAMPLES": "ON", "KR_ALL_WARNINGS": "ON", - "KR_WARNINGS_AS_ERRORS": "ON" + "KR_WARNINGS_AS_ERRORS": "ON", + "KR_ENABLE_STDFILE": "OFF" } } ], diff --git a/cmake/resilienceConfig.cmake.in b/cmake/resilienceConfig.cmake.in index 4c904a9..ae4b3db 100644 --- a/cmake/resilienceConfig.cmake.in +++ b/cmake/resilienceConfig.cmake.in @@ -7,9 +7,10 @@ include("${CMAKE_CURRENT_LIST_DIR}/resilienceTargets.cmake") SET(KR_ENABLE_HDF5 @KR_ENABLE_HDF5@) SET(KR_ENABLE_VELOC @KR_ENABLE_VELOC@) +SET(KR_ENABLE_MAGISTRATE @KR_ENABLE_MAGISTRATE@) # VeloC needs to add a cmake config... -LIST(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/Modules/") +LIST(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/../Modules/" "${CMAKE_CURRENT_LIST_DIR}/../cmake/Modules/") message(STATUS "Module path: ${CMAKE_MODULE_PATH}") find_dependency(Kokkos REQUIRED NO_CMAKE_PACKAGE_REGISTRY HINTS @Kokkos_DIR@) @@ -25,5 +26,11 @@ if (@KR_ENABLE_HDF5@) find_dependency(HDF5 REQUIRED) endif() +if (@KR_ENABLE_MAGISTRATE@) + set(CHECKPOINT_DIR @CHECKPOINT_DIR@) + find_dependency(checkpoint REQUIRED) + set(KR_ENABLE_MAGISTRATE @KR_ENABLE_MAGISTRATE@) +endif() + set(Boost_DIR @Boost_DIR@) find_dependency(Boost REQUIRED) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 092b6d0..229dc74 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -13,9 +13,15 @@ function(add_example _target) target_resources(${_target} PRIVATE ${ARG_RESOURCES}) target_link_libraries(${_target} PRIVATE Kokkos::resilience) target_link_libraries(${_target} PRIVATE cxxopts::cxxopts) - if (KR_ENABLE_VELOC OR KR_ENABLE_HDF5_PARALLEL) + if (KR_ENABLE_MPI_BACKENDS) target_link_libraries(${_target} PRIVATE MPI::MPI_CXX) endif() + if (KR_ENABLE_MAGISTRATE) + target_link_libraries(${_target} PRIVATE 
vt::lib::checkpoint) + endif() + if (KR_ENABLE_VT) + target_link_libraries(${_target} PRIVATE vt::runtime::vt) + endif() if (KR_WARNINGS_AS_ERRORS) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") @@ -42,3 +48,8 @@ if (KR_ENABLE_STDFILE) add_example(simple_file_checkpoint SOURCES SimpleFileCheckpoint.cpp RESOURCES config_file.json file_test.cfg) endif() + +if(KR_ENABLE_VT) + add_example(jacobi_checkpoint SOURCES jacobi/main.cpp jacobi/solver.cpp jacobi/config.cpp + RESOURCES jacobi/config_jacobi.json jacobi/config_jacobi_async.json jacobi/config_jacobi_1.json jacobi/config_jacobi_more_async.json) +endif() diff --git a/examples/SimpleCheckpoint.cpp b/examples/SimpleCheckpoint.cpp index 1a811db..f1a958c 100644 --- a/examples/SimpleCheckpoint.cpp +++ b/examples/SimpleCheckpoint.cpp @@ -46,9 +46,9 @@ #include #include -#include -#include -#include +#include + +using chkpt_view = Kokkos::Experimental::SubscribableViewHooks; int main( int argc, char **argv ) @@ -60,15 +60,24 @@ main( int argc, char **argv ) auto ctx = KokkosResilience::make_context( MPI_COMM_WORLD, "config.json" ); int dim0 = 5, dim1 = 5; - auto view = Kokkos::View< double ** >( "test_view", dim0, dim1 ); + auto view = Kokkos::View< double **, chkpt_view>( "test_view", dim0, dim1 ); KokkosResilience::checkpoint( *ctx, "test_checkpoint", 0, [view, dim0, dim1]() { Kokkos::parallel_for( dim0, KOKKOS_LAMBDA( int i ) { for ( int j = 0; j < dim1; ++j ) view( i, j ) = 3.0; } ); - } ); + }); + for(int i = 0; i < dim0; i++){ + for(int j = 0; j < dim1; j++){ + if(view(i,j) != 3.0) { + fprintf(stderr, "Error: view(%d,%d) = %f, not %f\n", i, j, view(i,j), 3.0); + exit(1); + } + } + } + printf("Success!\n"); } Kokkos::finalize(); diff --git a/examples/SimpleFileCheckpoint.cpp b/examples/SimpleFileCheckpoint.cpp index de2b6a1..7e1dc97 100644 --- a/examples/SimpleFileCheckpoint.cpp +++ b/examples/SimpleFileCheckpoint.cpp @@ -46,27 +46,42 @@ #endif #include -#include -#include 
-#include +#include +#include + +using chkpt_view = Kokkos::Experimental::SubscribableViewHooks; int main( int argc, char **argv ) { + MPI_Init( &argc, &argv ); + Kokkos::initialize( argc, argv ); { - auto ctx = KokkosResilience::make_context( "checkpoint.data", "config_file.json" ); + auto ctx = KokkosResilience::make_context( MPI_COMM_WORLD, "config_file.json" ); int dim0 = 5, dim1 = 5; - auto view = Kokkos::View< double ** >( "test_view", dim0, dim1 ); + auto view = Kokkos::View< double **, chkpt_view>( "test_view", dim0, dim1 ); KokkosResilience::checkpoint( *ctx, "test_checkpoint", 0, [view, dim0, dim1]() { Kokkos::parallel_for( dim0, KOKKOS_LAMBDA( int i ) { for ( int j = 0; j < dim1; ++j ) view( i, j ) = 3.0; } ); - } ); + }, [](int){return true;} ); + + for(int i = 0; i < dim0; i++){ + for(int j = 0; j < dim1; j++){ + if(view(i,j) != 3.0) { + fprintf(stderr, "Error: view(%d,%d) = %f, not %f\n", i, j, view(i,j), 3.0); + exit(1); + } + } + } + printf("Success!\n"); } Kokkos::finalize(); + + MPI_Finalize(); } diff --git a/examples/benchmark_multiviews.cpp b/examples/benchmark_multiviews.cpp index 8c622c5..2491193 100644 --- a/examples/benchmark_multiviews.cpp +++ b/examples/benchmark_multiviews.cpp @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) { wtime = MPI_Wtime(); std::size_t i = 1 + KokkosResilience::latest_version(*ctx, "test_kokkos"); - while(i < nsteps) { + while(i < nsteps ) { KokkosResilience::checkpoint(*ctx, "test_kokkos", i, [=]() { // Nic, tell me what should I put for []/ diff --git a/examples/config_file.json b/examples/config_file.json index f02e30d..2de7e28 100644 --- a/examples/config_file.json +++ b/examples/config_file.json @@ -2,11 +2,12 @@ "backend": "stdfile", "backends": { "stdfile": { - "config": "file_test.cfg" + "directory": "./stdfile_chkpts/", + "filename_prefix": "simple_" } }, "filter": { "type": "time", "interval": 10 } -} \ No newline at end of file +} diff --git a/examples/jacobi/config.cpp b/examples/jacobi/config.cpp 
new file mode 100644 index 0000000..f79b04a --- /dev/null +++ b/examples/jacobi/config.cpp @@ -0,0 +1,122 @@ +#include "config.hpp" +#include + +namespace Jacobi { +Config::Config(int argc, char** argv){ + for(int i = 0; i < argc; i++){ + std::string arg = argv[i]; + if( arg == "--decomp"){ + int x = std::stoi(argv[++i]); + int y = std::stoi(argv[++i]); + int z = std::stoi(argv[++i]); + colRange = vt::Index3D(x,y,z); + } else if(arg == "--input"){ + int x = std::stoi(argv[++i]); + int y = std::stoi(argv[++i]); + int z = std::stoi(argv[++i]); + dataRange = vt::Index3D(x,y,z); + } else if(arg == "--max-iters") { + maxIter = std::stoi(argv[++i]); + } else if(arg == "--tolerance") { + tolerance = std::stod(argv[++i]); + } else if(arg == "--async-serialize") { + asyncCheckpoint = true; + } + } + + /* --- Print information about the simulation */ + if(vt::theContext()->getNode() == 0){ + fmt::print( + stdout, "\n - Solve the linear system for the Laplacian with homogeneous Dirichlet" + " on [0, 1] x [0, 1] x [0, 1]\n" + ); + fmt::print(" - Second-order centered finite difference\n"); + fmt::print(" - {} elements decomposed onto {} objects.\n", dataRange.toString(), colRange.toString()); + fmt::print(" - Maximum number of iterations {}\n", maxIter); + fmt::print(" - Convergence tolerance {}\n", tolerance); + fmt::print("\n"); + } +} +} + +ResilienceConfig::ResilienceConfig(int argc, char** argv, Jacobi::Config app_cfg){ + for(int i = 0; i < argc; i++){ + std::string arg = argv[i]; + if(arg == "--config") + config_filename = argv[++i]; + else if(arg == "--mode") + context_mode = argv[++i]; + else if(arg == "--freq") + checkpoint_frequency = std::stoi(argv[++i]); + else if(arg == "--kill") + kill_iter = std::stoi(argv[++i]); + else if(arg == "--kill-rank") + kill_rank = std::stoi(argv[++i]); + else if(arg == "--iters-per-phase") + iters_per_phase = std::stoi(argv[++i]); + else if(arg == "--iters-per-epoch") + iters_per_epoch = std::stoi(argv[++i]); + } + + + if(context_mode 
== "VT") { + if(iters_per_epoch == 0) iters_per_epoch = -1; + context = kr::make_context(vt::theContext(), config_filename); + } else if(context_mode == "MPI"){ + if(iters_per_epoch == 0){ + iters_per_epoch = checkpoint_frequency; + //Can't infer both iters_per_epoch and checkpoint_frequency + assert(checkpoint_frequency != 0); + } + context = kr::make_context(MPI_COMM_WORLD, config_filename); + } else throw std::invalid_argument("Valid --mode values are VT or MPI"); + + std::string freq_str; + if(checkpoint_frequency < 0) { + freq_str = "never"; + checkpoint_filter = [](int){ return false; }; //parameter left unnamed: it is unused, avoids -Wunused-parameter + } else if(checkpoint_frequency == 0){ + freq_str = "according to json"; + checkpoint_filter = context->default_filter(); + } else { + freq_str = fmt::format("every {} iterations", checkpoint_frequency); + checkpoint_filter = kr::Filter::NthIterationFilter(checkpoint_frequency); + } + + + if(iters_per_phase < 1) iters_per_phase = app_cfg.maxIter+1; + if(iters_per_epoch < 1) iters_per_epoch = app_cfg.maxIter+1; + + + if(vt::theContext()->getNode() == 0) { + fmt::print("kr:: {} Context configured against {}\n", context_mode, config_filename); + fmt::print("kr:: Checkpointing {}\n", freq_str); + if(kill_iter > 0 && kill_rank >= 0){ //rank 0 is the default kill target, so it must be reported too + fmt::print("Generating failure at iteration {} on rank {}\n", kill_iter, kill_rank); + if(kill_rank >= vt::theContext()->getNumNodes()){ + fmt::print("WARNING: kill_rank {} does not exist!\n", kill_rank); + } + } + + if(iters_per_epoch > app_cfg.maxIter){ //"never" was normalized to maxIter+1 above, so it can no longer equal -1 + fmt::print("kr:: instructing app not to bound iterations\n"); + } else { + fmt::print("kr:: instructing app to bound every {} iterations\n", iters_per_epoch); + } + + if(iters_per_phase > app_cfg.maxIter){ //"never" was normalized to maxIter+1 above, so it can no longer equal -1 + fmt::print("kr:: instructing app not to use phases\n"); + } else { + fmt::print("kr:: instructing app to phase every {} iterations\n", iters_per_phase); + } + } +} + +void ResilienceConfig::try_kill(int current_iteration){ + if(kill_iter == current_iteration && + kill_rank == vt::theContext()->getNode()){ + 
fmt::print(stderr, "Rank {} simulating failure on iteration {}\n", + kill_rank, kill_iter); + exit(1); + } +}; diff --git a/examples/jacobi/config.hpp b/examples/jacobi/config.hpp new file mode 100644 index 0000000..ae727b2 --- /dev/null +++ b/examples/jacobi/config.hpp @@ -0,0 +1,91 @@ +#ifndef JACOBI_CONFIG_HPP +#define JACOBI_CONFIG_HPP + +#include +#include + +namespace Jacobi { +//Manage solver parameters +struct Config { + //Number of solver objects to decompose the work into. + // --decomp + vt::Index3D colRange = vt::Index3D(4,4,4); + + //Input size per solver object + // --input + vt::Index3D dataRange = vt::Index3D(50,50,50); + + //Solver stops running after either maxIter iterations, or + //once tolerance reached. + // --tolerance + // --max-iters + double tolerance = 1e-2; + int maxIter = 100; + + //Whether solver ought to manage asynchronous checkpointing manually. + // --async-serialize + bool asyncCheckpoint = false; + + Config() = default; + + template + void serialize(SerT& s){ + s | colRange | dataRange | tolerance | maxIter | asyncCheckpoint; + } + + Config(int argc, char** argv); +}; +} + +//Manage resilience parameters +struct ResilienceConfig { + //Path to JSON config file for KokkosResilience + // --config + std::string config_filename = "config_jacobi.json"; + + //Which type of context to use for consistency-enforcement + // --mode + std::string context_mode = "VT"; + + //How often to checkpoint, in iterations. + // --freq + // 0 = from config file + //-1 = never + int checkpoint_frequency = 0; + + //Where and when to insert a failure. 
+ // --kill + int kill_iter = -1; + // --kill-rank + int kill_rank = 0; + + //How often we should start the next VT phase (requiring an epoch boundary) + // --iters-per-phase + int iters_per_phase = 30; + + //How often we should arbitrarily insert an epoch boundary to test w/ + //some forced synchrony or to ensure correctness with "MPI" context_mode + // --iters-per-epoch + // 0 = matching checkpoint_frequency if "MPI" context_mode, else never + //-1 = never + int iters_per_epoch = 0; + + //Tells context when to checkpoint. + std::function checkpoint_filter; + + ResilienceConfig(int argc, char** argv, Jacobi::Config app_cfg); + + //Test recovery by exiting if on correct iteration and rank + void try_kill(int current_iteration); + + //Enable treating this object just like you would the context unique_ptr + kr::ContextBase* operator->(){ return context.get(); } + void reset() { context.reset(); } + +private: + std::unique_ptr context; +}; + + + +#endif diff --git a/examples/jacobi/config_jacobi.json b/examples/jacobi/config_jacobi.json new file mode 100644 index 0000000..b61ad8f --- /dev/null +++ b/examples/jacobi/config_jacobi.json @@ -0,0 +1,18 @@ +{ + "backend": "stdfile", + "backends": { + "stdfile": { + "directory": "/projects/wg-lflr/runs/mwhitlo/kr/", + "filename_prefix": "jacobi_" + } + }, + "filter": { + "type": "iteration", + "interval": 6 + }, + "contexts": { + "vt": { + "max_iteration_offset": 0 + } + } +} diff --git a/examples/jacobi/config_jacobi_1.json b/examples/jacobi/config_jacobi_1.json new file mode 100644 index 0000000..2828435 --- /dev/null +++ b/examples/jacobi/config_jacobi_1.json @@ -0,0 +1,18 @@ +{ + "backend": "stdfile", + "backends": { + "stdfile": { + "directory": "/projects/wg-lflr/runs/mwhitlo/kr/", + "filename_prefix": "jacobi_" + } + }, + "filter": { + "type": "iteration", + "interval": 6 + }, + "contexts": { + "vt": { + "max_iteration_offset": 2 + } + } +} diff --git a/examples/jacobi/config_jacobi_async.json 
b/examples/jacobi/config_jacobi_async.json new file mode 100644 index 0000000..2a71317 --- /dev/null +++ b/examples/jacobi/config_jacobi_async.json @@ -0,0 +1,18 @@ +{ + "backend": "stdfile", + "backends": { + "stdfile": { + "directory": "/projects/wg-lflr/runs/mwhitlo/kr/", + "filename_prefix": "jacobi_" + } + }, + "filter": { + "type": "iteration", + "interval": 6 + }, + "contexts": { + "vt": { + "max_iteration_offset": 50 + } + } +} diff --git a/examples/jacobi/config_jacobi_more_async.json b/examples/jacobi/config_jacobi_more_async.json new file mode 100644 index 0000000..2622258 --- /dev/null +++ b/examples/jacobi/config_jacobi_more_async.json @@ -0,0 +1,18 @@ +{ + "backend": "stdfile", + "backends": { + "stdfile": { + "directory": "/projects/wg-lflr/runs/mwhitlo/kr/", + "filename_prefix": "jacobi_" + } + }, + "filter": { + "type": "iteration", + "interval": 6 + }, + "contexts": { + "vt": { + "max_iteration_offset": 149 + } + } +} diff --git a/examples/jacobi/main.cpp b/examples/jacobi/main.cpp new file mode 100644 index 0000000..80dc77c --- /dev/null +++ b/examples/jacobi/main.cpp @@ -0,0 +1,129 @@ +/* +//@HEADER +// ***************************************************************************** +// +// jacobi3d_vt.cc +// DARMA/vt => Virtual Transport +// +// Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ + +#include +#include +#include +#include + +#include "config.hpp" +#include "solver.hpp" + +//Label for the resilience region. 
+const std::string MAIN_LOOP = "Jacobi main loop"; + +int main(int argc, char** argv) { + Kokkos::initialize(argc, argv); + { + vt::initialize(argc, argv); + const int this_node = vt::theContext()->getNode(); + + Jacobi::Config app_cfg(argc, argv); + ResilienceConfig res_cfg(argc, argv, app_cfg); + + int recover_iter = res_cfg->latest_version(MAIN_LOOP); + bool recovering = recover_iter >= 1; + if(recovering && this_node == 0) + fmt::print("Recovering to iteration {}\n", recover_iter); + + + //Object group of all nodes that take part in computation + // Used to determine whether the computation is finished + auto grp_proxy = vt::theObjGroup()->makeCollective("notify"); + + //Collection of Solver objects that perform the work. + auto col_proxy = vt::makeCollection("jacobi") + .bounds(app_cfg.colRange).bulkInsert().wait(); + if(!recovering) { + vt::runInEpochCollective([=]{ + col_proxy.broadcastCollective<&Jacobi::Solver::init>(app_cfg, grp_proxy); + }); + } + + //Register our objects, labels are pulled from the VT labels + res_cfg->register_to(MAIN_LOOP, grp_proxy); + res_cfg->register_to(MAIN_LOOP, col_proxy); + + + size_t iter = 1; + if(recovering) iter = recover_iter; + + const size_t max_iter = app_cfg.maxIter + 1; //Our iter is 1-based + + size_t next_phase_boundary = res_cfg.iters_per_phase + iter; + size_t next_epoch_boundary = res_cfg.iters_per_epoch + iter; + size_t next_boundary = std::min(std::min(next_phase_boundary, next_epoch_boundary), max_iter); + + while (!isWorkDone(grp_proxy) && iter < max_iter) { + vt::runInEpochCollective(fmt::format("Jacobi iters [{}-{}]", iter, next_boundary-1), [&]{ + if(this_node == 0) fmt::print(stderr, "Running iterations [{},{}]\n", iter, next_boundary-1); + + for( ; iter < next_boundary && !isWorkDone(grp_proxy); iter++){ + res_cfg->run(MAIN_LOOP, iter, [&]{ + res_cfg->register_to_active(col_proxy); + res_cfg->register_to_active(grp_proxy); + + col_proxy.broadcastCollective<&Jacobi::Solver::iterate>(); + }, 
res_cfg.checkpoint_filter); + } + }); + + vt::runInEpochCollective(fmt::format("Jacobi reduce {}", iter), [&]{ + col_proxy.broadcastCollective<&Jacobi::Solver::reduce>(); + }); + + //Update boundaries as necessary. + if(iter == next_epoch_boundary) next_epoch_boundary += res_cfg.iters_per_epoch; + if(iter == next_phase_boundary) { + next_phase_boundary += res_cfg.iters_per_phase; + vt::thePhase()->nextPhaseCollective(); + } + next_boundary = std::min(std::min(next_phase_boundary, next_epoch_boundary), max_iter); + } + + vt::finalize(); + } + Kokkos::finalize(); +} + + diff --git a/examples/jacobi/solver.cpp b/examples/jacobi/solver.cpp new file mode 100644 index 0000000..de1d7f5 --- /dev/null +++ b/examples/jacobi/solver.cpp @@ -0,0 +1,233 @@ +#include "solver.hpp" +namespace Jacobi { +bool Detector::isWorkFinished(){return finished;} +void Detector::workFinished(){finished = true;} +bool isWorkDone( DetectorProxy const& proxy) { + return proxy.get()->isWorkFinished(); +}; + +void Solver::init(Config cfg_, DetectorProxy detector_){ + cfg = cfg_; + detector = detector_; + + auto idx = getIndex(); + for(int dim = 0; dim < 3; dim++){ + nElms[dim] = cfg.dataRange[dim]/cfg.colRange[dim]; + if(idx[dim] < (cfg.dataRange[dim]%cfg.colRange[dim])) + nElms[dim]++; + if(nElms[dim] <= 1){ + fmt::print(stderr, "{} running with only {} elements in dimension {}\n", getIndex().toString(), nElms[dim], dim); + assert(nElms[dim] > 0); + } + + if(idx[dim] == 0) nNeighbors--; + if(idx[dim] == cfg.colRange[dim]-1) nNeighbors--; + } + + + //previous/result views are swapped, so keep labels generic. 
+ previous = Kokkos::View("ViewA", nElms[0], nElms[1], nElms[2]); + result = Kokkos::View("ViewB", nElms[0], nElms[1], nElms[2]); + rhs = Kokkos::View("RHS", nElms[0], nElms[1], nElms[2]); + Kokkos::deep_copy(previous, 0); + Kokkos::deep_copy(result, 0); + Kokkos::deep_copy(rhs, 0); + + + // + // Set the initial vector to the values of + // a "high-frequency" function + // + double hx = 1.0 / (cfg.dataRange.x()+1); + double hy = 1.0 / (cfg.dataRange.y()+1); + double hz = 1.0 / (cfg.dataRange.z()+1); + + int maxDim = std::max(std::max(cfg.dataRange[0], cfg.dataRange[1]), cfg.dataRange[2]); + int nf = 3 * int(maxDim+1) / 6; + + std::array offsets; + for(int dim = 0; dim < 3; dim++){ + offsets[dim] = idx[dim] * (cfg.dataRange[dim]/cfg.colRange[dim]); + offsets[dim] += std::min(idx[dim], (cfg.dataRange[dim]%cfg.colRange[dim])); + } + + Kokkos::parallel_for(Range({1,1,1}, {nElms[0]-1, nElms[1]-1, nElms[2]-1}), + KOKKOS_LAMBDA (const int x, const int y, const int z){ + double val = pow((offsets[0]+x)*hx, 2); + val += pow((offsets[1]+y)*hy, 2); + val += pow((offsets[2]+z)*hz, 2); + + result(x,y,z) = sin(nf * M_PI * val); + }); +} + + +void Solver::iterate(){ + //Grab the last iteration's epoch + vt::EpochType predEpoch = epochQueue.empty() ? vt::no_epoch : epochQueue.back(); + + //Make an epoch for this iteration, and add to the queue. + vt::EpochType iterEpoch = vt::theTerm()->makeEpochRooted( + fmt::format("{} iteration {}", getIndex().toString(), nextLaunchIter) + ); + epochQueue.push_back(iterEpoch); + + if(predEpoch == vt::no_epoch){ + //Just launch up the next iteration + _iterate(); + } else { + //Wait until prior iteration finishes to launch this one. 
+ + //Grab up current epoch so this iteration's messages are correctly assigned + vt::EpochType parentEpoch = vt::theTerm()->getEpoch(); + vt::theTerm()->addLocalDependency(parentEpoch); + + //Add an action to run once prior iteration finishes + vt::theTerm()->addAction(predEpoch, [this, parentEpoch]{ + vt::theTerm()->pushEpoch(parentEpoch); + getCollectionProxy()[getIndex()].send<&Solver::_iterate>(); + vt::theTerm()->popEpoch(parentEpoch); + + vt::theTerm()->releaseLocalDependency(parentEpoch); + }); + + //Make the epoch dependency explicit, so VT can reduce termination detection messages. + vt::theTerm()->addDependency(predEpoch, iterEpoch); + } +} + +void Solver::_iterate() { + iter++; + + //Early recvs are just recvs now + nRecv = nEarlyRecv; + nEarlyRecv = 0; + + //Swap previous and result, will overwrite result w/ new + std::swap(result, previous); + + //Send ghost values to neighbors + auto proxy = getCollectionProxy(); + auto idx = getIndex(); + + for(int dim = 0; dim < 3; dim++){ + std::array dir = {0,0,0}; + + if(idx[dim] > 0){ + dir[dim] = -1; + proxy[idx + dir].send<&Solver::exchange>(idx, getPlane(previous, dir), iter); + } + if(idx[dim] < cfg.colRange[dim]-1){ + dir[dim] = 1; + proxy[idx + dir].send<&Solver::exchange>(idx, getPlane(previous, dir), iter); + } + } + + //We may have gotten all our ghost values while still working on last iteration + if(nRecv == nNeighbors) compute(); +}; + +void Solver::reduce() { + using ValT = typename Kokkos::MinMax::value_type; + ValT minMax; + + Kokkos::parallel_reduce(Range({1,1,1}, {nElms[0]-1, nElms[1]-1, nElms[2]-1}), + KOKKOS_LAMBDA (const int x, const int y, const int z, ValT& l_minMax){ + auto& val = result(x,y,z); + l_minMax.min_val = std::min(l_minMax.min_val, val); + l_minMax.max_val = std::max(l_minMax.max_val, val); + }, Kokkos::MinMax(minMax)); + + double max = std::max(minMax.min_val*-1, minMax.max_val); + + auto proxy = getCollectionProxy(); + proxy.reduce<&Solver::checkCompleted, 
vt::collective::MaxOp>(proxy(0,0,0), max); +}; + +void Solver::checkCompleted(double maxNorm) { + bool within_tolerance = maxNorm < cfg.tolerance; + bool timed_out = iter == cfg.maxIter; + bool done = within_tolerance || timed_out; + + if(done){ + if(within_tolerance) + fmt::print("\n # Jacobi reached tolerance threshold ({}<{}) in {} iterations\n\n", maxNorm, cfg.tolerance, iter); + else if(timed_out) + fmt::print("\n # Jacobi reached maximum iterations ({}) while above tolerance ({}>{})\n\n", iter, maxNorm, cfg.tolerance); + detector.broadcast<&Detector::workFinished>(); + } else { + fmt::print(" # Iteration {} reached with maxNorm {}\n", iter, maxNorm); + } +}; + +void Solver::exchange(vt::Index3D sender, Kokkos::View ghost, int in_iter) { + bool early = in_iter != iter; + if(early) assert(in_iter == iter+1); + + vt::Index3D dir = sender - getIndex(); + auto dest = getGhostPlane(early ? result : previous, dir); + Kokkos::deep_copy(dest, ghost); + + if(early) nEarlyRecv++; + else nRecv++; + + if(!early && nRecv == nNeighbors){ + auto iter_epoch = epochQueue.front(); + + vt::theTerm()->pushEpoch(iter_epoch); + getCollectionProxy()[getIndex()].send<&Solver::compute>(); + vt::theTerm()->popEpoch(iter_epoch); + + vt::theTerm()->finishedEpoch(iter_epoch); + } +} + +void Solver::compute() { + // + //---- Jacobi iteration step for + //---- A banded matrix for the 7-point stencil (center plus its six face neighbors) + //---- [ 0.0 -1.0 0.0] + //---- [-1.0] + //---- [-1.0 6.0 -1.0] + //---- [-1.0] + //---- [ 0.0 -1.0 0.0] + //---- rhs_ right hand side vector + // + Kokkos::parallel_for(Range({1,1,1}, {nElms[0]-1, nElms[1]-1, nElms[2]-1}), + KOKKOS_LAMBDA (const int x, const int y, const int z){ + result(x,y,z) = (1.0/6.0) * ( + rhs(x,y,z) + previous(x-1,y,z) + previous(x+1,y,z) + + previous(x,y-1,z) + previous(x,y+1,z) + previous(x,y,z-1) + + previous(x,y,z+1)); + }); + + //No longer waiting on this iteration + assert(!epochQueue.empty()); + epochQueue.pop_front(); +}; + +Kokkos::View 
+Solver::getGhostPlane(Kokkos::View in, vt::Index3D dir){ + return getPlane(in, dir, true); +} + +Kokkos::View +Solver::getPlane(Kokkos::View in, vt::Index3D dir, bool ghost){ + using Dir = vt::Index3D; + if(dir == Dir(-1,0,0)) + return Kokkos::subview(in, ghost ? 0 : 1, Kokkos::ALL(), Kokkos::ALL()); + if(dir == Dir(1,0,0)) + return Kokkos::subview(in, ghost ? nElms[0]-1 : nElms[0]-2, Kokkos::ALL(), Kokkos::ALL()); + if(dir == Dir(0,-1,0)) + return Kokkos::subview(in, Kokkos::ALL(), ghost ? 0 : 1, Kokkos::ALL()); + if(dir == Dir(0,1,0)) //high y edge is bounded by the y extent, nElms[1] + return Kokkos::subview(in, Kokkos::ALL(), ghost ? nElms[1]-1 : nElms[1]-2, Kokkos::ALL()); + if(dir == Dir(0,0,-1)) + return Kokkos::subview(in, Kokkos::ALL(), Kokkos::ALL(), ghost ? 0 : 1); + if(dir == Dir(0,0,1)) //high z edge is bounded by the z extent, nElms[2] + return Kokkos::subview(in, Kokkos::ALL(), Kokkos::ALL(), ghost ? nElms[2]-1 : nElms[2]-2); + + assert(false); + return Kokkos::subview(in, 0, Kokkos::ALL(), Kokkos::ALL()); +} +} diff --git a/examples/jacobi/solver.hpp b/examples/jacobi/solver.hpp new file mode 100644 index 0000000..996dab5 --- /dev/null +++ b/examples/jacobi/solver.hpp @@ -0,0 +1,144 @@ +#ifndef JACOBI_SOLVER_HPP +#define JACOBI_SOLVER_HPP + +#include +#include +#include "config.hpp" + +// +// This code applies a few steps of the Jacobi iteration to +// the linear system A x = 0 +// where A is a banded symmetric positive definite matrix. +// The initial guess for x is a made-up non-zero vector. +// The exact solution is the vector 0. +// +// The matrix A is square and invertible. 
+// The number of rows is ((number of objects) * (number of rows per object)) +// +// Such a matrix A is obtained when using 2nd-order finite difference +// for discretizing +// +// -d^2 u / dx^2 - d^2 u / dy^2 - d^2 u / dz^2 = f on [0, 1] x [0, 1] x [0, 1] +// +// with homogeneous Dirichlet condition +// +// u = 0 on the boundary of [0, 1] x [0, 1] x [0, 1] +// +// using a uniform grid with grid size +// +// 1 / ((number of objects) * (number of rows per object) + 1) +// + +namespace Jacobi { + +struct Detector { + bool finished = false; + + template + void serialize(Serializer& s) { + s | finished; + } + + bool isWorkFinished(); + void workFinished(); +}; +using DetectorProxy = vt::objgroup::proxy::Proxy; + +bool isWorkDone( DetectorProxy const& proxy); + +struct Solver : vt::Collection { + Config cfg; + DetectorProxy detector; + + //Number of neighboring objects to exchange ghost planes with; init() lowers it for objects on an edge of the decomposition + int nNeighbors = 6; + + std::array nElms; + + //Previous iteration's data, current result, input right hand side. + Kokkos::View previous, result, rhs; + + //iter tracks the iteration locally completed or currently in progress. + //nextLaunchIter is just used for sanity checking. + int iter = 0, nextLaunchIter = 0; + std::deque epochQueue; + + //Count ghost messages received. 
We might get some "early" messages for our next iteration + int nRecv = 0, nEarlyRecv = 0; + + //3D range Kokkos execution policy + using Range = Kokkos::MDRangePolicy, Kokkos::IndexType>; + +public: + Solver() = default; + + template + void serialize(Serializer& s) { + vt::EpochType chkpt_epoch = vt::no_epoch; + if(s.hasTraits(vt::vrt::CheckpointInternalTrait()) && cfg.asyncCheckpoint && !s.isSizing()){ + //Checkpointing waits for enqueued iterations to finish + if(!epochQueue.empty()) { + vt::EpochType last_iter_epoch = epochQueue.back(); + + chkpt_epoch = vt::theTerm()->makeEpochRooted(fmt::format("{} checkpointing!\n", getIndex().toString())); + vt::theTerm()->addDependency(last_iter_epoch, chkpt_epoch); + epochQueue.push_back(chkpt_epoch); + + kr::Util::VT::delaySerializeUntil(last_iter_epoch); + } + } + + vt::trace::TraceScopedNote trace_obj( + fmt::format("{} {}@{}", s.isSizing()?"Sizing":"Serializing", getIndex().toString(), iter), + kr::Context::VT::VTContext::serialize_proxy + ); + + vt::Collection::serialize(s); + s | cfg | detector | nNeighbors | nElms | previous | result | rhs | iter; + trace_obj.end(); + + if(chkpt_epoch != vt::no_epoch) { + epochQueue.pop_front(); + vt::theTerm()->finishedEpoch(chkpt_epoch); + } + } + + + void init(Config cfg_, DetectorProxy detector_); + + //Requests another iteration be launched. + //Manages waiting on any outstanding iterations to finish. + void iterate(); + + //Internal. Perform the actual iteration steps. + void _iterate(); + + //Reduce the global error. Not currently asynchronously safe. + void reduce(); + + //Internal. Gets reduced global error and notifies of completion if finished. + void checkCompleted(double maxNorm); + + //Internal. Handles incoming ghost values. + void exchange(vt::Index3D sender, Kokkos::View ghost, int in_iter); + +private: + void compute(); + + //Get a Kokkos subview of the edge plane of input view. 
+ // Which edge to get is defined by dir, which should have a single non-zero dimension + // Choose low or high edge of that dimension with a -1 or 1 value. + //(Essentially, dir=neighborIndex-myIndex gives you the ghost plane in the direction of neighbor) + Kokkos::View + getGhostPlane(Kokkos::View in, vt::Index3D dir); + + //As getGhostPlane, but by default gets the edge plane of local data not the ghost values. + Kokkos::View + getPlane(Kokkos::View in, vt::Index3D dir, bool ghost = false); +}; + +using SolverProxy = vt::vrt::collection::CollectionProxy; + +} + +#endif diff --git a/src/resilience/AutomaticCheckpoint.hpp b/src/resilience/AutomaticCheckpoint.hpp index eb608a2..c761a1f 100644 --- a/src/resilience/AutomaticCheckpoint.hpp +++ b/src/resilience/AutomaticCheckpoint.hpp @@ -50,134 +50,157 @@ #include #include "view_hooks/ViewHolder.hpp" #include "view_hooks/DynamicViewHooks.hpp" +#include "registration/ViewHolder.hpp" + +#include "context/ContextBase.hpp" -#include "Cref.hpp" #include "CheckpointFilter.hpp" // Tracing support -#ifdef KR_ENABLE_TRACING #include "util/Trace.hpp" #include -#endif -// Workaround for C++ < 17 -#define KR_CHECKPOINT_THIS _kr_self = *this -#define KR_CHECKPOINT( x ) _kr_chk_##x = kr::check_ref< int >( x, #x ) +#define KR_CHECKPOINT(x) KokkosResilience::Detail::RegInfo(x, std::string(#x)) +//#define KR_REGISTER( context, x ) _kr_chk_##x = kr::check_ref( #x, x ) namespace KokkosResilience { - template< typename Context > int latest_version( Context &ctx, const std::string &label ) { - return ctx.latest_version( label ); } - namespace Detail + template + void ContextBase::detect_and_register(F&& fun, Detail::RegInfo... explicit_members){ + using namespace Util; +#ifdef KR_ENABLE_TRACING + auto reg_hashes = begin_trace( *this, "register" ); +#endif + + //Gather up the explicitly-listed members. 
+ (register_to_active(explicit_members), ...); + + //Enable ViewHolder copy constructor hooks to register the views + KokkosResilience::DynamicViewHooks::copy_constructor_set.set_callback( + [ctx = this](const KokkosResilience::ViewHolder &view) { + ctx->register_to_active(view); + } + ); + + //Copy the lambda/functor to trigger copy-constructor hooks + using FuncType = typename std::remove_reference::type; + [[maybe_unused]] FuncType f = fun; + + //Disable ViewHolder hook + KokkosResilience::DynamicViewHooks::copy_constructor_set.reset(); + +#ifdef KR_ENABLE_TRACING + reg_hashes.end(); +#endif + } + + template + void ContextBase::run(const std::string &label, int iteration, RegionFunc&& fun, + FilterFunc&& filter, Detail::RegInfo&... explicit_members) { - template< typename Context, typename F, typename FilterFunc > - void checkpoint_impl( Context &ctx, const std::string &label, int iteration, F &&fun, FilterFunc &&filter ) - { - // Trace if enabled - #ifdef KR_ENABLE_TRACING - std::ostringstream oss; - oss << "checkpoint_" << label; - auto chk_trace = Util::begin_trace< Util::IterTimingTrace< std::string > >( ctx, oss.str(), iteration ); - - auto overhead_trace = Util::begin_trace< Util::TimingTrace< std::string > >( ctx, "overhead" ); - #endif - - using fun_type = typename std::remove_reference< F >::type; - - if ( filter( iteration ) ) - { - // Copy the functor, since if it has any views we can turn on view tracking - std::vector< KokkosResilience::ViewHolder > views; - - // Don't do anything with const views since they can never be checkpointed in this context - KokkosResilience::DynamicViewHooks::copy_constructor_set.set_callback( [&views]( const KokkosResilience::ViewHolder &view ) { - views.emplace_back( view ); - } ); - - std::vector< Detail::CrefImpl > crefs; - Detail::Cref::check_ref_list = &crefs; - - fun_type f = fun; - - Detail::Cref::check_ref_list = nullptr; - - KokkosResilience::DynamicViewHooks::copy_constructor_set.reset(); - - #ifdef 
KR_ENABLE_TRACING - auto reg_hashes = Util::begin_trace< Util::TimingTrace< std::string > >( ctx, "register" ); - #endif - // Register any views that haven't already been registered - ctx.register_hashes( views, crefs ); - - #ifdef KR_ENABLE_TRACING - reg_hashes.end(); - auto check_restart = Util::begin_trace< Util::TimingTrace< std::string > >( ctx, "check" ); - #endif - - bool restart_available = ctx.restart_available( label, iteration ); - #ifdef KR_ENABLE_TRACING - check_restart.end(); - overhead_trace.end(); - #endif - - if ( restart_available ) - { - // Load views with data - #ifdef KR_ENABLE_TRACING - auto restart_trace = Util::begin_trace< Util::TimingTrace< std::string > >( ctx, "restart" ); - #endif - ctx.restart( label, iteration, views ); - } else - { - // Execute functor and checkpoint - #ifdef KR_ENABLE_TRACING - auto function_trace = Util::begin_trace< Util::TimingTrace< std::string > >( ctx, "function" ); - #endif - fun(); - #ifdef KR_ENABLE_TRACING - Kokkos::fence(); // Get accurate measurements for function_trace end - function_trace.end(); - #endif - - { - #ifdef KR_ENABLE_TRACING - auto write_trace = Util::begin_trace< Util::TimingTrace< std::string > >( ctx, "checkpoint" ); - #endif - auto ts = std::chrono::system_clock::to_time_t( std::chrono::system_clock::now() ); - std::cout << '[' << std::put_time( std::localtime( &ts ), "%c" ) << "] initiating checkpoint\n"; - ctx.checkpoint( label, iteration, views ); - } - } - } else { // Iteration is filtered, just execute - #ifdef KR_ENABLE_TRACING - overhead_trace.end(); - auto function_trace = Util::begin_trace< Util::TimingTrace< std::string > >( ctx, "function" ); - #endif - fun(); - #ifdef KR_ENABLE_TRACING - Kokkos::fence(); // Get accurate measurements for function_trace end - function_trace.end(); - #endif + using namespace Util; +#ifdef KR_ENABLE_TRACING + //Only build iteration label if tracing. 
+ std::ostringstream oss; + oss << "checkpoint_" << label; + auto chk_trace = begin_trace(*this, oss.str(), iteration); +#endif + auto overhead_trace = begin_trace( *this, "overhead" ); + + //Figure out how we should be handling this + bool recover_region = false, checkpoint_region = false; + + auto parent_region = active_region; + auto parent_context = active_context; //captured before active_context is overwritten, so the restore below is meaningful + auto* parent_filter = active_filter; + + active_context = this; //publish this context only once the enclosing state is saved + + if(last_region && last_region.label() == label) { + active_region = last_region; + } else { + auto info = regions.insert({std::string(label), std::unordered_set()}); + active_region = info.first; + } + std::function< bool(int) > m_filter = filter; + active_filter = &m_filter; + + + if(filter(iteration)){ + //Make sure the data members are registered to the context for this region. + detect_and_register(std::forward(fun), explicit_members...); + + auto check_restart = begin_trace( *this, "check" ); + recover_region = this->restart_available(label, iteration); + check_restart.end(); + + checkpoint_region = !recover_region; + } + overhead_trace.end(); + + + if(recover_region){ + auto restart_trace = begin_trace( *this, "restart" ); + this->restart(active_region.label(), iteration, active_region.members()); + } else { + enter_region(active_region, iteration); + auto function_trace = begin_trace( *this, "function" ); + fun(); + function_trace.end(); + exit_region(active_region, iteration); + + if(checkpoint_region){ + auto write_trace = begin_trace( *this, "checkpoint" ); + auto ts = std::chrono::system_clock::to_time_t( std::chrono::system_clock::now() ); + if(m_pid == 0) std::cout << '[' << std::put_time( std::localtime( &ts ), "%c" ) << "] initiating checkpoint of iteration " << iteration << "\n"; + this->checkpoint(active_region.label(), iteration, active_region.members()); + write_trace.end(); + } + } + + + //Region no longer active: restore the state captured on entry + last_region = active_region; + active_region = parent_region; + active_context = parent_context; + 
active_filter = parent_filter; + } + + //RegionFunc = std::function; + //FilterFunc = std::function; + + template + bool register_if_active(T& member, std::string label){ + if(ContextBase::active_context != nullptr){ + return ContextBase::active_context->register_if_active(member, label); + } + return false; + } + + template + bool deregister_if_active(T& member, std::string label){ + if(ContextBase::active_context != nullptr){ + return ContextBase::active_context->deregister_if_active(member, label); + } + return false; } - template< typename Context, typename F, typename FilterFunc > - void checkpoint( Context &ctx, const std::string &label, int iteration, F &&fun, FilterFunc &&filter ) + template + void checkpoint( ContextBase& ctx, const std::string &label, int iteration, F &&fun, FilterFunc &&filter, Detail::RegInfo... explicit_members ) { - Detail::checkpoint_impl( ctx, label, iteration, std::forward< F >( fun ), std::forward< FilterFunc >( filter ) ); + ctx.run(label, iteration, std::forward< F >( fun ), std::forward< FilterFunc >( filter ), explicit_members...); } - template< typename Context, typename F > - void checkpoint( Context &ctx, const std::string &label, int iteration, F &&fun ) + template< typename... Traits, typename F, typename... T> + void checkpoint( ContextBase& ctx, const std::string &label, int iteration, F&& fun, Detail::RegInfo... explicit_members ) { - Detail::checkpoint_impl( ctx, label, iteration, std::forward< F >( fun ), ctx.default_filter() ); + ctx.run(label, iteration, std::forward< F >( fun ), ctx.default_filter(), explicit_members... 
); } } diff --git a/src/resilience/CMakeLists.txt b/src/resilience/CMakeLists.txt index e837e93..de91269 100644 --- a/src/resilience/CMakeLists.txt +++ b/src/resilience/CMakeLists.txt @@ -1,39 +1,24 @@ target_sources(resilience PRIVATE ${CMAKE_CURRENT_LIST_DIR}/Resilience.cpp - ${CMAKE_CURRENT_LIST_DIR}/AutomaticCheckpoint.cpp - ${CMAKE_CURRENT_LIST_DIR}/Context.cpp ${CMAKE_CURRENT_LIST_DIR}/Config.cpp - ${CMAKE_CURRENT_LIST_DIR}/Cref.cpp ${CMAKE_CURRENT_LIST_DIR}/ResilientRef.cpp ) -if (KR_ENABLE_MPI_BACKENDS) - target_sources(resilience PRIVATE MPIContext.cpp) -endif() - -add_subdirectory(filesystem) -add_subdirectory(stdio) - -if (KR_ENABLE_VELOC) - add_subdirectory(veloc) -endif() +add_subdirectory(util) -if (KR_ENABLE_STDFILE) - target_sources(resilience PRIVATE StdFileContext.cpp) - add_subdirectory(stdfile) -endif() - -if (KR_ENABLE_HDF5) - add_subdirectory(hdf5) -endif() +add_subdirectory(registration) +add_subdirectory(context) +add_subdirectory(backend) add_subdirectory(view_hooks) -if (KR_CUDA_EXEC_SPACE) +add_subdirectory(stdio) + +if (KR_ENABLE_CUDA_EXEC_SPACE) add_subdirectory(cuda) endif() -if (KR_OPENMP_EXEC_SPACE) +if (KR_ENABLE_OPENMP_EXEC_SPACE) add_subdirectory(openMP) endif() diff --git a/src/resilience/Config.cpp b/src/resilience/Config.cpp index 6dc5c2c..713e64f 100644 --- a/src/resilience/Config.cpp +++ b/src/resilience/Config.cpp @@ -70,6 +70,10 @@ namespace KokkosResilience { std::ifstream instrm{ p.string() }; + if(!instrm.is_open() || !instrm.good()){ + throw ConfigFileError(p.string()); + } + using iter_type = std::istream_iterator< char >; iter_type input( instrm ); diff --git a/src/resilience/Config.hpp b/src/resilience/Config.hpp index afcfd00..292ba13 100644 --- a/src/resilience/Config.hpp +++ b/src/resilience/Config.hpp @@ -62,8 +62,15 @@ namespace KokkosResilience struct ConfigValueError : std::runtime_error { - ConfigValueError() - : std::runtime_error( "value error" ) + ConfigValueError(const std::string desc = "value error") + 
: std::runtime_error(desc) + {} + }; + + struct ConfigFileError : std::runtime_error + { + ConfigFileError(const std::string& filename) + : std::runtime_error( "error opening file: " + filename ) {} }; diff --git a/src/resilience/Context.hpp b/src/resilience/Context.hpp deleted file mode 100644 index 6c9db6e..0000000 --- a/src/resilience/Context.hpp +++ /dev/null @@ -1,123 +0,0 @@ -/* - * - * Kokkos v. 3.0 - * Copyright (2020) National Technology & Engineering - * Solutions of Sandia, LLC (NTESS). - * - * Under the terms of Contract DE-NA0003525 with NTESS, - * the U.S. Government retains certain rights in this software. - * - * Kokkos is licensed under 3-clause BSD terms of use: - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the Corporation nor the names of the - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Questions? Contact Christian R. Trott (crtrott@sandia.gov) - */ -#ifndef INC_RESILIENCE_CONTEXT_HPP -#define INC_RESILIENCE_CONTEXT_HPP - -#include -#if defined(KOKKOS_ENABLE_HPX) -#include -#endif -#include -#include -#include -#include -#include -#include "Config.hpp" -#include "Cref.hpp" -#include "CheckpointFilter.hpp" -#include -#include "view_hooks/ViewHolder.hpp" -#ifdef KR_ENABLE_MPI_BACKENDS -#include -#endif - -// Tracing support -#ifdef KR_ENABLE_TRACING -#include "util/Trace.hpp" -#endif - -namespace KokkosResilience -{ - namespace detail - { - } - - class ContextBase - { - public: - - explicit ContextBase( Config cfg ); - - virtual ~ContextBase() = default; - - virtual void register_hashes(const std::vector< KokkosResilience::ViewHolder > &views, - const std::vector< Detail::CrefImpl > &crefs) = 0; - virtual bool restart_available( const std::string &label, int version ) = 0; - virtual void restart( const std::string &label, int version, - const std::vector< KokkosResilience::ViewHolder > &views ) = 0; - virtual void checkpoint( const std::string &label, int version, - const std::vector< KokkosResilience::ViewHolder > &views ) = 0; - - virtual int latest_version( const std::string &label ) const noexcept = 0; - virtual void register_alias( const std::string &original, const std::string &alias ) = 0; - - virtual void reset() = 0; - - const std::function< bool( int ) > &default_filter() const noexcept { return 
m_default_filter; } - - Config &config() noexcept { return m_config; } - const Config &config() const noexcept { return m_config; } - -#ifdef KR_ENABLE_TRACING - Util::detail::TraceStack &trace() { return m_trace; }; -#endif - - private: - - Config m_config; - - std::function< bool( int ) > m_default_filter; - -#ifdef KR_ENABLE_TRACING - Util::detail::TraceStack m_trace; -#endif - }; - - std::unique_ptr< ContextBase > make_context( const std::string &config ); -#ifdef KR_ENABLE_MPI_BACKENDS - std::unique_ptr< ContextBase > make_context( MPI_Comm comm, const std::string &config ); -#endif -#ifdef KR_ENABLE_STDFILE - std::unique_ptr< ContextBase > make_context( const std::string &filename, const std::string &config ); -#endif -} - -#endif // INC_RESILIENCE_CONTEXT_HPP diff --git a/src/resilience/Resilience.hpp b/src/resilience/Resilience.hpp index c473c12..a31cb0b 100644 --- a/src/resilience/Resilience.hpp +++ b/src/resilience/Resilience.hpp @@ -41,19 +41,17 @@ #ifndef INC_RESILIENCE_RESILIENCE_HPP #define INC_RESILIENCE_RESILIENCE_HPP -#include +#include "Config.hpp" -#include "Context.hpp" -#include "ManualCheckpoint.hpp" - -#ifdef KR_ENABLE_VELOC -#include "veloc/VelocBackend.hpp" +#include "context/Context.hpp" #include "AutomaticCheckpoint.hpp" -#endif -#ifdef KR_ENABLE_STDFILE -#include "stdfile/StdFileBackend.hpp" -#include "AutomaticCheckpoint.hpp" +#include "registration/RegistrationHeaders.hpp" + +#include "backend/Automatic.hpp" + +#ifdef KR_ENABLE_MANUAL_CHECKPOINT +#include "ManualCheckpoint.hpp" #endif #ifdef KR_ENABLE_CUDA diff --git a/src/resilience/backend/Automatic.cpp b/src/resilience/backend/Automatic.cpp new file mode 100644 index 0000000..7466f32 --- /dev/null +++ b/src/resilience/backend/Automatic.cpp @@ -0,0 +1,30 @@ +#include "Automatic.hpp" +#include "resilience/context/ContextBase.hpp" +#include + +#ifdef KR_ENABLE_VELOC +#include "veloc/VelocBackend.hpp" +#endif + +#ifdef KR_ENABLE_STDFILE +#include "stdfile/StdFileBackend.hpp" +#endif + 
+namespace KokkosResilience::Detail { + AutomaticBackend make_backend(ContextBase& ctx){ + auto backend = ctx.config()["backend"].as(); + +#ifdef KR_ENABLE_VELOC + if(backend == "veloc"){ + return std::make_shared(ctx); + } +#endif +#ifdef KR_ENABLE_STDFILE + if(backend == "stdfile"){ + return std::make_shared(ctx); + } +#endif + + throw std::runtime_error(backend + " backend is not available"); + } +} diff --git a/src/resilience/Cref.cpp b/src/resilience/backend/Automatic.hpp similarity index 88% rename from src/resilience/Cref.cpp rename to src/resilience/backend/Automatic.hpp index 91313f6..6a58275 100644 --- a/src/resilience/Cref.cpp +++ b/src/resilience/backend/Automatic.hpp @@ -38,12 +38,15 @@ * * Questions? Contact Christian R. Trott (crtrott@sandia.gov) */ -#include "Cref.hpp" +#ifndef INC_RESILIENCE_BACKEND_AUTOMATIC_HPP +#define INC_RESILIENCE_BACKEND_AUTOMATIC_HPP -namespace KokkosResilience -{ - namespace Detail - { - std::vector< CrefImpl > *Cref::check_ref_list = nullptr; - } -} \ No newline at end of file + +#include "AutomaticBase.hpp" +#include "resilience/Config.hpp" + +namespace KokkosResilience::Detail { + AutomaticBackend make_backend(ContextBase& ctx); +} + +#endif diff --git a/src/resilience/backend/AutomaticBase.hpp b/src/resilience/backend/AutomaticBase.hpp new file mode 100644 index 0000000..970c4f8 --- /dev/null +++ b/src/resilience/backend/AutomaticBase.hpp @@ -0,0 +1,99 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. Trott (crtrott@sandia.gov) + */ +#ifndef INC_RESILIENCE_BACKEND_AUTOMATICBASE_HPP +#define INC_RESILIENCE_BACKEND_AUTOMATICBASE_HPP + +#include "resilience/registration/Registration.hpp" +#include +#include + +namespace KokkosResilience { + +//Avoiding cyclic dependency. 
+class ContextBase; + +class AutomaticBackendBase { //Abstract interface implemented by each automatic checkpoint backend +public: + explicit AutomaticBackendBase(ContextBase& ctx) : m_context(ctx) {}; + + virtual ~AutomaticBackendBase() = default; + + //All members should be registered before being checkpointed or restarted + virtual void register_member(Registration member) = 0; + virtual void deregister_member(const Registration& member) = 0; + + //as_global to checkpoint independently of PID + virtual bool checkpoint(const std::string& label, int version, + const std::unordered_set &members, + bool as_global = false) = 0; + + //Get the highest version available which is still less than max + // (or just the highest, if max=0) + virtual int latest_version(const std::string& label, int max = 0, bool as_global = false) const noexcept = 0; + + //Returns a flag for recovering the specified members (StdFileBackend returns true on success; NOTE(review): confirm for other backends). + //as_global to restart independently of PID + virtual bool restart(const std::string& label, int version, + const std::unordered_set &members, + bool as_global = false) = 0; + + //Reset any state, useful for online-recovery.
+ virtual void reset() = 0; + + virtual bool restart_available(const std::string& label, int version, bool as_global = false){ + return latest_version(label, version+1, as_global) == version; //newest version below version+1 must be exactly version + }; + + ContextBase& m_context; + + + //Delete potentially problematic functions for maintaining consistent state + AutomaticBackendBase(const AutomaticBackendBase&) = delete; + AutomaticBackendBase(AutomaticBackendBase&&) noexcept = delete; + AutomaticBackendBase &operator=( const AutomaticBackendBase & ) = delete; + AutomaticBackendBase &operator=( AutomaticBackendBase && ) = delete; //was "= default", which the reference member m_context already made implicitly deleted +}; + +using AutomaticBackend = std::shared_ptr; +} + + +#endif diff --git a/src/resilience/backend/CMakeLists.txt b/src/resilience/backend/CMakeLists.txt new file mode 100644 index 0000000..55159b0 --- /dev/null +++ b/src/resilience/backend/CMakeLists.txt @@ -0,0 +1,13 @@ +target_sources(resilience PRIVATE ${CMAKE_CURRENT_LIST_DIR}/Automatic.cpp) + +if (KR_ENABLE_VELOC) + add_subdirectory(veloc) +endif() + +add_subdirectory(stdfile) + +if (KR_ENABLE_HDF5) + add_subdirectory(hdf5) +endif() + +add_subdirectory(filesystem) diff --git a/src/resilience/filesystem/CMakeLists.txt b/src/resilience/backend/filesystem/CMakeLists.txt similarity index 100% rename from src/resilience/filesystem/CMakeLists.txt rename to src/resilience/backend/filesystem/CMakeLists.txt diff --git a/src/resilience/filesystem/DirectoryManagement.hpp b/src/resilience/backend/filesystem/DirectoryManagement.hpp similarity index 100% rename from src/resilience/filesystem/DirectoryManagement.hpp rename to src/resilience/backend/filesystem/DirectoryManagement.hpp diff --git a/src/resilience/filesystem/ExternalIOInterface.cpp b/src/resilience/backend/filesystem/ExternalIOInterface.cpp similarity index 100% rename from src/resilience/filesystem/ExternalIOInterface.cpp rename to src/resilience/backend/filesystem/ExternalIOInterface.cpp diff --git a/src/resilience/filesystem/ExternalIOInterface.hpp
b/src/resilience/backend/filesystem/ExternalIOInterface.hpp similarity index 100% rename from src/resilience/filesystem/ExternalIOInterface.hpp rename to src/resilience/backend/filesystem/ExternalIOInterface.hpp diff --git a/src/resilience/filesystem/Filesystem.cpp b/src/resilience/backend/filesystem/Filesystem.cpp similarity index 100% rename from src/resilience/filesystem/Filesystem.cpp rename to src/resilience/backend/filesystem/Filesystem.cpp diff --git a/src/resilience/filesystem/Filesystem.hpp b/src/resilience/backend/filesystem/Filesystem.hpp similarity index 100% rename from src/resilience/filesystem/Filesystem.hpp rename to src/resilience/backend/filesystem/Filesystem.hpp diff --git a/src/resilience/hdf5/CMakeLists.txt b/src/resilience/backend/hdf5/CMakeLists.txt similarity index 100% rename from src/resilience/hdf5/CMakeLists.txt rename to src/resilience/backend/hdf5/CMakeLists.txt diff --git a/src/resilience/hdf5/HDF5Space.cpp b/src/resilience/backend/hdf5/HDF5Space.cpp similarity index 100% rename from src/resilience/hdf5/HDF5Space.cpp rename to src/resilience/backend/hdf5/HDF5Space.cpp diff --git a/src/resilience/hdf5/HDF5Space.hpp b/src/resilience/backend/hdf5/HDF5Space.hpp similarity index 99% rename from src/resilience/hdf5/HDF5Space.hpp rename to src/resilience/backend/hdf5/HDF5Space.hpp index c6b0cb3..a76474f 100644 --- a/src/resilience/hdf5/HDF5Space.hpp +++ b/src/resilience/backend/hdf5/HDF5Space.hpp @@ -52,7 +52,7 @@ #include #include #include -#include "resilience/filesystem/ExternalIOInterface.hpp" +#include "resilience/backend/filesystem/ExternalIOInterface.hpp" #include diff --git a/src/resilience/stdfile/CMakeLists.txt b/src/resilience/backend/stdfile/CMakeLists.txt similarity index 100% rename from src/resilience/stdfile/CMakeLists.txt rename to src/resilience/backend/stdfile/CMakeLists.txt diff --git a/src/resilience/backend/stdfile/StdFileBackend.cpp b/src/resilience/backend/stdfile/StdFileBackend.cpp new file mode 100644 index 
0000000..fa79d9f --- /dev/null +++ b/src/resilience/backend/stdfile/StdFileBackend.cpp @@ -0,0 +1,280 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) + */ +#include +#include +#include + +#include "StdFileBackend.hpp" +#include "resilience/context/ContextBase.hpp" +#include "resilience/util/Trace.hpp" + +namespace KokkosResilience { + +StdFileBackend::StdFileBackend(ContextBase& ctx) + : AutomaticBackendBase(ctx) { + + auto config = m_context.config()["backends"]["stdfile"]; + auto config_dir = config.get("directory"); + auto config_prefix = config.get("filename_prefix"); + + if(config_dir) { + checkpoint_dir = config_dir->template as(); + + using namespace std::filesystem; + + std::error_code err; + create_directories(checkpoint_dir, err); + if(err) throw ConfigValueError("backends:stdfile:directory invalid " + err.message()); + + if(status(checkpoint_dir).type() != file_type::directory){ + throw ConfigValueError("backends:stdfile:directory not actually a directory"); + } + } + + if(config_prefix) { + checkpoint_prefix = config_prefix->template as(); + } +} + + + +bool StdFileBackend::checkpoint( + const std::string &label, int version, + const std::unordered_set& members, + bool as_global) { + + //Files have a header: ... 
+ //Header lasts until Member0 offset + const size_t header_size = sizeof(size_t) + (sizeof(int)+sizeof(size_t))*members.size(); + + std::vector member_hashes(members.size()); + std::vector member_offsets(members.size()); + + auto filename = checkpoint_file(label, version, as_global); + + auto write_trace = Util::begin_trace(m_context, "write"); + bool success = true; + try { + std::ofstream file(filename, std::ios::binary); + file.seekp(header_size); + + size_t index = 0; + for (auto& member : members) { + member_hashes[index] = member.hash(); + member_offsets[index] = file.tellp(); + + if(!member->serializer()(file)){ + fprintf(stderr, "Warning: In checkpoint of %s version %d, member %s serialization failed!\n", label.c_str(), version, member->name.c_str()); + success = false; + } + index++; + } + + size_t full_size = file.tellp(); + + file.seekp(0); + file.write((char*) &full_size, sizeof(full_size)); + + for(index = 0; index < members.size(); index++){ + file.write((char*) &member_hashes[index], sizeof(int)); + file.write((char*) &member_offsets[index], sizeof(size_t)); + } + + latest_versions[label] = version; + } catch (std::exception& e) { + fprintf(stderr, "Error checkpointing region %s version %d to file %s: %s\n", + label.c_str(), version, std::string(filename).c_str(), e.what()); + success = false; + } + write_trace.end(); + return success; +} + + +std::filesystem::path StdFileBackend::checkpoint_file( + const std::string& label, int version, bool as_global) const { + return checkpoint_dir / (checkpoint_prefix + label + + ( as_global ? "" : "." + std::to_string(m_context.m_pid) ) + + "." 
+ std::to_string(version)); +} + +bool StdFileBackend::restart_available(const std::string &label, int version, bool as_global) { + return std::filesystem::exists(checkpoint_file(label, version, as_global)); +} + +int StdFileBackend::latest_version(const std::string &label, int max, bool as_global) const noexcept { + + auto iter = latest_versions.find(label); + if(iter != latest_versions.end() && (max == 0 || iter->second < max)) return iter->second; + + int result = -1; + bool successful = false; + try { + std::string basename = checkpoint_file(label, 0, as_global).filename().stem(); + + for(auto& dir_entry : std::filesystem::directory_iterator(checkpoint_dir)){ + path filename = dir_entry.path().filename(); + + if(filename.stem() != basename) continue; + + std::string file_ext = filename.extension(); + if(file_ext.size() < 2) continue; + + //Remove the dot from, eg, ".100" + file_ext.erase(0,1); + try { + int version = stoi(file_ext); + if(max == 0 || version < max) { + result = result < version ? version : result; + } + } catch(...) {} + } + + successful = true; + } catch(...) {} + + if(max == 0 && successful) latest_versions[label] = result; + + return result; +} + + + +struct FileMember { + size_t start, stop; + const Registration* registration = nullptr; +}; + +//We want to read through the file in-order where possible, +//so we build an ordered vector representing which registration to +//restore to as we go. +//File path used for providing error context. 
+std::vector +read_header(std::istream& file, const std::unordered_set& registrations, std::filesystem::path filename){ + std::vector members; + std::unordered_map hash_to_member; + + size_t file_size = 0; //stays 0 if the read fails, so the loop below is skipped and the file is reported as empty + file.read((char*) &file_size, sizeof(size_t)); + + size_t header_size = file_size; //temporary estimate + while(size_t(file.tellg()) < header_size){ + members.push_back(FileMember()); + int idx = members.size() - 1; + + int hash = 0; size_t start = 0; //defined values even when a read fails part-way through the header + file.read((char*) &hash, sizeof(int)); + file.read((char*) &start, sizeof(size_t)); + + members[idx].start = start; + if(idx > 0) members[idx-1].stop = start; + + hash_to_member[hash] = idx; + + //Fix estimate after we pull the first member info. + if(header_size == file_size){ + header_size = start; + } + } + if(members.empty()){ + fprintf(stderr, "No members found in file %s but %zu expected.\n", std::string(filename).c_str(), registrations.size()); + return {}; + } else { + members.back().stop = file_size; + } + + for(auto& reg : registrations){ + auto iter = hash_to_member.find(reg.hash()); + if(iter == hash_to_member.end()){ + fprintf(stderr, "Warning: Checkpoint is missing member %s!\n", reg->name.c_str()); + continue; //no file entry to bind to; iter is end() and must not be dereferenced + } + int idx = iter->second; + members[idx].registration = &reg; + } + + return members; +} + +bool StdFileBackend::restart( + const std::string &label, int version, + const std::unordered_set& registrations, + bool as_global) { + auto read_trace = Util::begin_trace(m_context, "read"); + if(registrations.empty()){ + read_trace.end(); + return true; + } + + try { + auto filename = checkpoint_file(label, version, as_global); + std::ifstream file(filename, std::ios::binary); + + auto file_members = read_header(file, registrations, filename); + + for(auto& member : file_members){ + if(member.registration == nullptr) continue; + + file.seekg(member.start); + + const Registration& reg = *(member.registration); + if(!reg->deserializer()(file)){ + fprintf(stderr, "Warning: In restart of %s version %d, member %s
deserialization failed!\n", label.c_str(), version, reg->name.c_str()); + } + + size_t actual_stop = file.tellg(); + if(actual_stop != member.stop){ + bool more = actual_stop > member.stop; + size_t amt = more ? actual_stop-member.stop : member.stop-actual_stop; + fprintf(stderr, "Warning: In restart of %s version %d, member %s deserialized with %lu %s bytes than in the checkpoint!\n", + label.c_str(), version, reg->name.c_str(), amt, more ? "more" : "fewer"); + } + } + } catch (...) { + read_trace.end(); + return false; + } + + read_trace.end(); + return true; +} + +} // namespace KokkosResilience diff --git a/src/resilience/stdfile/StdFileBackend.hpp b/src/resilience/backend/stdfile/StdFileBackend.hpp similarity index 64% rename from src/resilience/stdfile/StdFileBackend.hpp rename to src/resilience/backend/stdfile/StdFileBackend.hpp index 4ce24bf..daa9dbb 100644 --- a/src/resilience/stdfile/StdFileBackend.hpp +++ b/src/resilience/backend/stdfile/StdFileBackend.hpp @@ -41,44 +41,43 @@ #ifndef INC_RESILIENCE_STDFILE_STDFILEBACKEND_HPP #define INC_RESILIENCE_STDFILE_STDFILEBACKEND_HPP -#include -#include "../view_hooks/ViewHolder.hpp" +#include "resilience/backend/AutomaticBase.hpp" -#include -#include -#include - -#include "../Cref.hpp" -#include "../StdFileContext.hpp" +#include +#include namespace KokkosResilience { -class StdFileBackend { +class StdFileBackend : public AutomaticBackendBase { public: - StdFileBackend(StdFileContext &ctx, - std::string const &filename); - ~StdFileBackend(); + StdFileBackend(ContextBase& ctx); + ~StdFileBackend() = default; - void checkpoint( - const std::string &label, int version, - const std::vector< KokkosResilience::ViewHolder > &views); + //No state to manage + void register_member(Registration) override {}; + void deregister_member(const Registration &) override {} - bool restart_available(const std::string &label, int version); - int latest_version(const std::string &label) const noexcept; + bool checkpoint(const 
std::string &label, int version, + const std::unordered_set& members, bool as_global) override; - void restart( - const std::string &label, int version, - const std::vector< KokkosResilience::ViewHolder > &views); + int latest_version(const std::string &label, int max, bool as_global) const noexcept override; + bool restart_available(const std::string& label, int version, bool as_global) override; - void reset() {} + bool restart(const std::string &label, int version, + const std::unordered_set& members, bool as_global) override; - void register_hashes( - const std::vector< KokkosResilience::ViewHolder > &views, - const std::vector &crefs) {} + //No state to reset + void reset() override {}; private: - std::string m_filename; - StdFileContext &m_context; + using path = std::filesystem::path; + path checkpoint_dir = "./"; + std::string checkpoint_prefix = "kr_chkpt_"; + + mutable std::unordered_map latest_versions; + + //The file to checkpoint/recover with + path checkpoint_file(const std::string& label, int version, bool as_global) const; }; } // namespace KokkosResilience diff --git a/src/resilience/veloc/CMakeLists.txt b/src/resilience/backend/veloc/CMakeLists.txt similarity index 100% rename from src/resilience/veloc/CMakeLists.txt rename to src/resilience/backend/veloc/CMakeLists.txt diff --git a/src/resilience/backend/veloc/VelocBackend.cpp b/src/resilience/backend/veloc/VelocBackend.cpp new file mode 100644 index 0000000..83c239b --- /dev/null +++ b/src/resilience/backend/veloc/VelocBackend.cpp @@ -0,0 +1,285 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. 
+ * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
+ */
+
+#include "VelocBackend.hpp"
+
+#include
+#include
+#include
+#include
+
+#include "resilience/AutomaticCheckpoint.hpp"
+
+#include "resilience/registration/Registration.hpp"
+#include "resilience/util/Trace.hpp"
+
+// Wraps a VeloC call so that its source text, file and line are available on failure.
+#define VELOC_SAFE_CALL( call ) KokkosResilience::veloc_internal_safe_call( call, #call, __FILE__, __LINE__ )
+
+namespace KokkosResilience
+{
+  namespace
+  {
+    // Formats a description of a failed VeloC call.
+    // NOTE: the message is currently built and then discarded; nothing is thrown
+    // or printed until the TODO below is implemented.
+    void veloc_internal_error_throw( bool success, const char *name, const char *file, int line = 0 )
+    {
+      std::ostringstream out;
+      out << name << " error: VELOC operation failed";
+      if ( file )
+      {
+        out << " " << file << ":" << line;
+      }
+
+      // TODO: implement exception class
+      //Kokkos::Impl::throw_runtime_exception( out.str() );
+    }
+
+    // Passes `success` through unchanged, invoking the error hook above when it is false.
+    inline bool veloc_internal_safe_call( bool success, const char *name, const char *file, int line = 0 )
+    {
+      if ( !success )
+        veloc_internal_error_throw( success, name, file, line );
+      return success;
+    }
+  }
+
+  // Obtains a VeloC client for this process id using the config path found at
+  // backends.veloc.config in the context configuration.
+  VeloCMemoryBackend::VeloCMemoryBackend(ContextBase& ctx)
+    : AutomaticBackendBase(ctx) {
+      const auto &vconf = m_context.config()["backends"]["veloc"]["config"].as< std::string >();
+      veloc_client = veloc::get_client(static_cast< unsigned int >( ctx.m_pid ), vconf);
+  }
+
+  // Waits for any in-flight checkpoint before the backend goes away.
+  VeloCMemoryBackend::~VeloCMemoryBackend()
+  {
+    veloc_client->checkpoint_wait();
+  }
+
+  // Checkpoints the given members through the VeloC client.
+  // Returns the status reported by checkpoint_end; on success `version` is
+  // recorded as the latest version for `label`.
+  bool VeloCMemoryBackend::checkpoint( const std::string &label, int version,
+                                       std::unordered_set< KokkosResilience::Registration > const &_members,
+                                       bool as_global)
+  {
+    if(as_global) fprintf(stderr, "Warning, VeloC backend does not support checkpointing global objects\n");
+    bool success;
+
+    //Don't handle failure here, might be worth trying to continue
+    VELOC_SAFE_CALL( veloc_client->checkpoint_wait() );
+
+    success = VELOC_SAFE_CALL( veloc_client->checkpoint_begin( label, version ) );
+
+    if(success) {
+      std::set ids;
+      for(auto member : _members) ids.insert(static_cast(member.hash()));
+      std::cout << "checkpointing ids (";
+      std::size_t count = 0;
+      for ( auto &&id : ids )
+      {
+        std::cout << id;
+        // Separator only: printing `id` again here duplicated every id except
+        // the last one in the log line (restart() below prints just ", ").
+        if ( ++count != ids.size() )
+          std::cout << ", ";
+      }
+      std::cout << ")\n";
+      success = VELOC_SAFE_CALL( veloc_client->checkpoint_mem(VELOC_CKPT_SOME, ids) );
+    }
+
+    // checkpoint_end is always called so VeloC sees the outcome of this attempt.
+    success = VELOC_SAFE_CALL( veloc_client->checkpoint_end( success ));
+
+    if(success) m_latest_version[label] = version;
+    return success;
+  }
+
+  // Returns the latest known version for `label`, asking VeloC (restart_test)
+  // when nothing is cached or the cached value does not lie below a non-zero `max`.
+  int
+  VeloCMemoryBackend::latest_version( const std::string &label, int max, bool as_global) const noexcept
+  {
+    if(as_global) fprintf(stderr, "Warning, VeloC backend does not support checkpointing global objects\n");
+    auto latest_iter = m_latest_version.find( label );
+    if ( latest_iter == m_latest_version.end() )
+    {
+      auto test = veloc_client->restart_test(label, max);
+
+      //We store the absolute latest version only
+      if(max == 0) m_latest_version[label] = test;
+
+      return test;
+    } else if(max != 0 && latest_iter->second >= max) {
+      return veloc_client->restart_test(label, max);
+    } else {
+      return latest_iter->second;
+    }
+  }
+
+  // Recovers the given members from checkpoint `version` of `label`.
+  // Returns the status reported by restart_end.
+  bool
+  VeloCMemoryBackend::restart( const std::string &label, int version,
+    const std::unordered_set< KokkosResilience::Registration > &_members, bool as_global)
+  {
+    if(as_global) fprintf(stderr, "Warning, VeloC backend does not support checkpointing global objects\n");
+    bool success;
+    success = VELOC_SAFE_CALL( veloc_client->restart_begin( label, version ));
+
+    if(success){
+      std::set ids;
+      for(auto member : _members) ids.insert(static_cast(member.hash()));
+      std::cout << "restarting ids (";
+      std::size_t count = 0;
+      for ( auto &&id : ids )
+      {
+        std::cout << id;
+        if ( ++count != ids.size() )
+          std::cout << ", ";
+      }
+      std::cout << ")\n";
+      success = VELOC_SAFE_CALL( veloc_client->recover_mem(VELOC_RECOVER_SOME, ids) );
+    }
+
+    success = VELOC_SAFE_CALL( veloc_client->restart_end( success ) );
+
+    return success;
+  }
+
+  // Re-acquires the VeloC client and forgets all cached latest versions.
+  void
+  VeloCMemoryBackend::reset()
+  {
+    const auto &vconf = m_context.config()["backends"]["veloc"]["config"].as< std::string >();
+    veloc_client
= veloc::get_client(static_cast< unsigned int >( m_context.m_pid ), vconf);
+
+    m_latest_version.clear();
+  }
+
+  // Hands the member's serializer and deserializer to VeloC under the member's hash.
+  // Throws std::runtime_error if either callable is empty.
+  void
+  VeloCMemoryBackend::register_member(KokkosResilience::Registration member)
+  {
+    auto sfun = member->serializer();
+    if ( !sfun )
+      throw std::runtime_error( "invalid member serializer" );
+    auto dfun = member->deserializer();
+    if ( !dfun )
+      throw std::runtime_error( "invalid member deserializer" );
+    veloc_client->mem_protect(
+        static_cast(member.hash()),
+        std::move(sfun),
+        std::move(dfun)
+    );
+  }
+
+  // Removes the member's hash from VeloC's protected set.
+  void
+  VeloCMemoryBackend::deregister_member(const Registration &member)
+  {
+    veloc_client->mem_unprotect(static_cast(member.hash()));
+  }
+
+  // Obtains a VeloC client for this process id using the config path found at
+  // backends.veloc.config in the context configuration.
+  VeloCFileBackend::VeloCFileBackend(ContextBase& ctx)
+    : AutomaticBackendBase(ctx) {
+      const auto &vconf = m_context.config()["backends"]["veloc"]["config"].as< std::string >();
+      veloc_client = veloc::get_client( static_cast< unsigned int >( m_context.m_pid ), vconf );
+  }
+
+  // Waits for any in-flight checkpoint before the backend goes away.
+  VeloCFileBackend::~VeloCFileBackend()
+  {
+    veloc_client->checkpoint_wait();
+  }
+
+  // Serializes every member, in iteration order, into the file VeloC routes for
+  // this checkpoint. Returns the status reported by checkpoint_end.
+  bool
+  VeloCFileBackend::checkpoint( const std::string &label, int version,
+                                const std::unordered_set< KokkosResilience::Registration > &members,
+                                bool as_global)
+  {
+    if(as_global) fprintf(stderr, "Warning, VeloC backend does not support checkpointing global objects\n");
+    bool success;
+
+    // Wait for previous checkpoint to finish
+    VELOC_SAFE_CALL( veloc_client->checkpoint_wait());
+
+    // Start new checkpoint
+    success = VELOC_SAFE_CALL( veloc_client->checkpoint_begin( label, version ));
+
+    // Only write when VeloC accepted the checkpoint, mirroring restart() below.
+    // Previously a file was still routed and written after a failed begin.
+    if(success) {
+      try
+      {
+        std::string fname = veloc_client->route_file("");
+
+        std::ofstream vfile( fname, std::ios::binary );
+
+        auto write_trace = Util::begin_trace( m_context, "write" );
+        for ( auto &&member : members ) {
+          success &= member->serializer()(vfile);
+          if(!success) break;
+        }
+        write_trace.end();
+      }
+      catch ( ... )
+      {
+        success = false;
+      }
+    }
+
+    // Always close the checkpoint so VeloC sees the outcome of this attempt.
+    success = VELOC_SAFE_CALL( veloc_client->checkpoint_end( success ));
+    return success;
+  }
+
+  // Asks VeloC directly; this backend keeps no version cache.
+  int VeloCFileBackend::latest_version( const std::string &label, int max, bool as_global) const noexcept
+  {
+    if(as_global) fprintf(stderr, "Warning, VeloC backend does not support checkpointing global objects\n");
+    return veloc_client->restart_test( label, max );
+  }
+
+  // Deserializes every member, in iteration order, from the file VeloC routes for
+  // this restart. Stops at the first failing member; returns the status of restart_end.
+  bool VeloCFileBackend::restart( const std::string &label, int version,
+                             const std::unordered_set< KokkosResilience::Registration > &members,
+                             bool as_global)
+  {
+    if(as_global) fprintf(stderr, "Warning, VeloC backend does not support checkpointing global objects\n");
+    bool success;
+    success = VELOC_SAFE_CALL( veloc_client->restart_begin( label, version ));
+
+    if(success) {
+      try {
+        std::string fname = veloc_client->route_file("");
+
+        std::ifstream vfile( fname, std::ios::binary );
+
+        auto read_trace = Util::begin_trace( m_context, "read" );
+        for ( auto &&member : members ){
+          success = member->deserializer()(vfile);
+          if(!success) break;
+        }
+        read_trace.end();
+      } catch ( ... ) {
+        success = false;
+      }
+    }
+
+    success = VELOC_SAFE_CALL( veloc_client->restart_end( success ));
+    return success;
+  }
+}
diff --git a/src/resilience/veloc/VelocBackend.hpp b/src/resilience/backend/veloc/VelocBackend.hpp
similarity index 51%
rename from src/resilience/veloc/VelocBackend.hpp
rename to src/resilience/backend/veloc/VelocBackend.hpp
index 4e361b5..a668d97 100644
--- a/src/resilience/veloc/VelocBackend.hpp
+++ b/src/resilience/backend/veloc/VelocBackend.hpp
@@ -42,86 +42,28 @@
 #define INC_RESILIENCE_VELOC_VELOCBACKEND_HPP
 
 #include
-#include
+#include
 #include
-#include
-#include "../view_hooks/ViewHolder.hpp"
 #include
 #include
 #include
-#include "../Cref.hpp"
-
-namespace KokkosResilience
-{
-  class ContextBase;
-
-  template< typename Backend >
-  class MPIContext;
-
-  namespace Detail
-  {
-    struct MemProtectKey
-    {
-      explicit MemProtectKey( void *maddr )
-          : addr( reinterpret_cast< std::uintptr_t >( maddr ) )
-      {}
-
-      std::uintptr_t addr;
-
-      friend bool operator==( const MemProtectKey &_lhs, const MemProtectKey &_rhs )
-      {
-        return _lhs.addr == _rhs.addr;
-      }
-
-      friend bool operator!=( const MemProtectKey &_lhs, const MemProtectKey &_rhs )
-      {
-        return !( _lhs == _rhs );
-      }
-
-      friend bool operator<( const MemProtectKey &_lhs, const MemProtectKey &_rhs )
-      {
-        return _lhs.addr < _rhs.addr;
-      }
-    };
-
-    struct MemProtectBlock
-    {
-      explicit MemProtectBlock( int mid )
-          : id( mid )
-      {}
-
-      int id;
-      std::vector< unsigned char > buff;
-      void *ptr = nullptr;
-      std::size_t size = 0;
-      std::size_t element_size = 0;
-      bool protect = false;
-      bool registered = false;
-    };
-  }
-}
-
+#include "veloc.hpp"
+#include
+#include "resilience/view_hooks/ViewHolder.hpp"
+#include "resilience/registration/Registration.hpp"
+#include "resilience/context/ContextBase.hpp"
+#include "resilience/context/MPIContext.hpp"
 
-namespace std
-{
-  template<>
-  struct hash< KokkosResilience::Detail::MemProtectKey >
-  {
-    std::size_t operator()( const
KokkosResilience::Detail::MemProtectKey &_mem ) const noexcept
-    {
-      return std::hash< std::uintptr_t >{}( _mem.addr );
-    }
-  };
-}
+#include "resilience/backend/AutomaticBase.hpp"
 
 namespace KokkosResilience
 {
-  class VeloCMemoryBackend
+  // Backend that registers members with a VeloC client and checkpoints/recovers them by id.
+  class VeloCMemoryBackend : public AutomaticBackendBase
   {
   public:
 
-    VeloCMemoryBackend( ContextBase &ctx, MPI_Comm mpi_comm );
+    VeloCMemoryBackend(ContextBase& ctx);
     ~VeloCMemoryBackend();
 
     VeloCMemoryBackend( const VeloCMemoryBackend & ) = delete;
@@ -130,35 +72,36 @@ namespace KokkosResilience
     VeloCMemoryBackend &operator=( const VeloCMemoryBackend & ) = delete;
     VeloCMemoryBackend &operator=( VeloCMemoryBackend && ) = default;
 
-    void checkpoint( const std::string &label, int version,
-                     const std::vector< KokkosResilience::ViewHolder > &views );
+    virtual
+    bool checkpoint( const std::string &label, int version,
+                     const std::unordered_set< Registration > &members,
+                     bool as_global) override;
+
+
 
-    bool restart_available( const std::string &label, int version );
-    int latest_version (const std::string &label) const noexcept;
+    int latest_version (const std::string &label, int max, bool as_global) const noexcept override;
 
-    void restart( const std::string &label, int version,
-                  const std::vector< KokkosResilience::ViewHolder > &views );
+    virtual
+    bool restart( const std::string &label, int version,
+                  const std::unordered_set< KokkosResilience::Registration > &members,
+                  bool as_global) override;
 
     void clear_checkpoints();
 
-    void register_hashes( const std::vector< KokkosResilience::ViewHolder > &views,
-                          const std::vector< Detail::CrefImpl > &crefs );
+    void register_member(Registration member) override;
+    void deregister_member(const Registration& member) override;
 
-    void reset();
+    void reset() override;
 
     void register_alias( const std::string &original, const std::string &alias );
 
   private:
 
     std::string get_canonical_label( const std::string &_label ) const noexcept;
 
-    std::unordered_map< std::string, Detail::MemProtectBlock > m_registry;
-
-    MPI_Comm m_mpi_comm;
-    ContextBase *m_context;
+    veloc::client_t *veloc_client;
 
     mutable std::unordered_map< std::string, int > m_latest_version;
-    std::unordered_map< std::string, std::string > m_alias_map;
-    int m_last_id;
+
+    int m_last_id = 0;
   };
 
   class VeloCRegisterOnlyBackend : public VeloCMemoryBackend
@@ -174,30 +117,37 @@ namespace KokkosResilience
     VeloCRegisterOnlyBackend &operator=( const VeloCRegisterOnlyBackend & ) = delete;
     VeloCRegisterOnlyBackend &operator=( VeloCRegisterOnlyBackend && ) = default;
 
-    void checkpoint( const std::string &label, int version,
-                     const std::vector< KokkosResilience::ViewHolder > &views );
+    // Registration still goes through VeloCMemoryBackend; checkpoint and restart
+    // are no-ops that report success.
+    bool checkpoint( const std::string &label, int version,
+                     const std::unordered_set< KokkosResilience::Registration > &members, bool as_global) override {
+      return true;
+    }
 
-    void restart( const std::string &label, int version,
-                  const std::vector< KokkosResilience::ViewHolder > &views );
+    bool restart( const std::string &label, int version,
+                  const std::unordered_set< KokkosResilience::Registration > &members, bool as_global) override {
+      return true;
+    };
   };
 
-  class VeloCFileBackend
+  // Backend that writes members into a file routed by VeloC.
+  // Inherits publicly like the other backends so it can be used through an
+  // AutomaticBackendBase pointer; the bare base specifier made the base private.
+  class VeloCFileBackend : public AutomaticBackendBase
   {
   public:
-
-    VeloCFileBackend( MPIContext< VeloCFileBackend > &ctx, MPI_Comm mpi_comm, const std::string &veloc_config);
+    VeloCFileBackend( ContextBase& ctx);
     ~VeloCFileBackend();
 
-    void checkpoint( const std::string &label, int version,
-                     const std::vector< KokkosResilience::ViewHolder > &views );
+    bool checkpoint( const std::string &label, int version,
+                     const std::unordered_set< KokkosResilience::Registration > &views,
+                     bool as_global) override;
+
+    int latest_version (const std::string &label, int max, bool as_global) const noexcept override;
 
-    bool restart_available( const std::string &label, int version );
-    int latest_version (const std::string &label) const noexcept;
+    bool restart( const std::string &label, int version,
+                  const std::unordered_set< KokkosResilience::Registration > &views,
+                  bool as_global) override;
 
-    void restart( const std::string &label, int version,
-                  const std::vector< KokkosResilience::ViewHolder > &views );
+    void register_member( KokkosResilience::Registration ) override {} // Do nothing
 
-    void register_hashes( const std::vector< KokkosResilience::ViewHolder > & ) {} // Do nothing
+  private:
+    veloc::client_t* veloc_client;
   };
 }
 
diff --git a/src/resilience/context/CMakeLists.txt b/src/resilience/context/CMakeLists.txt
new file mode 100644
index 0000000..4fb738e
--- /dev/null
+++ b/src/resilience/context/CMakeLists.txt
@@ -0,0 +1,13 @@
+target_sources(resilience PRIVATE ${CMAKE_CURRENT_LIST_DIR}/Context.cpp)
+
+if (KR_ENABLE_MPI_BACKENDS)
+  target_sources(resilience PRIVATE ${CMAKE_CURRENT_LIST_DIR}/MPIContext.cpp)
+endif()
+
+if (KR_ENABLE_VT)
+  add_subdirectory(vt)
+endif()
+
+#if (KR_ENABLE_STDFILE)
+#  target_sources(resilience PRIVATE ${CMAKE_CURRENT_LIST_DIR}/StdFileContext.cpp)
+#endif()
diff --git a/src/resilience/Context.cpp b/src/resilience/context/Context.cpp
similarity index 74%
rename from src/resilience/Context.cpp
rename to src/resilience/context/Context.cpp
index b9f8473..73f1f44 100644
--- a/src/resilience/Context.cpp
+++ b/src/resilience/context/Context.cpp
@@ -39,15 +39,17 @@
  * Questions? Contact Christian R.
Trott (crtrott@sandia.gov)
  */
 #include "Context.hpp"
+#include "resilience/backend/Automatic.hpp"
 #include
 #include
 #include
 
 namespace KokkosResilience
 {
-  ContextBase::ContextBase( Config cfg )
+  // Stores the config and process id, then creates the backend (end of this constructor).
+  ContextBase::ContextBase( Config cfg, int proc_id)
       : m_config( std::move( cfg ) ),
-        m_default_filter{ Filter::DefaultFilter{} }
+        m_default_filter{ Filter::DefaultFilter{} },
+        m_pid(proc_id)
   {
     auto filter_opt = m_config.get( "filter" );
 
@@ -65,12 +67,52 @@ namespace KokkosResilience
         throw std::runtime_error( "invalid filter specified" );
       }
     }
+
+    // Done last: the backend factory receives *this after the members above are set.
+    m_backend = Detail::make_backend(*this);
   }
 
+  ContextBase* ContextBase::active_context = nullptr;
+
   std::unique_ptr< ContextBase >
   make_context( const std::string &config )
   {
     auto cfg = Config{ config };
     return std::unique_ptr< ContextBase >{};
   }
-}
\ No newline at end of file
+
+  // Grows the per-context scratch buffer to at least minimum_size and returns its
+  // data pointer. The buffer is never shrunk here.
+  char* ContextBase::get_buffer(size_t minimum_size){
+    if(buffer.size() < minimum_size){
+      buffer.resize(minimum_size);
+    }
+    return buffer.data();
+  }
+
+  ContextBase::~ContextBase()
+  {
+    deregister_all_regions();
+  }
+
+  // Runs the derived class's reset_impl(), drops every region, then resets the backend.
+  void
+  ContextBase::reset()
+  {
+    reset_impl();
+    deregister_all_regions();
+    m_backend->reset();
+  }
+
+  // Erases every region. A member is deregistered from the backend when the
+  // region being erased is the only one still containing it.
+  void
+  ContextBase::deregister_all_regions()
+  {
+    auto region_iter = regions.begin();
+    while(region_iter != regions.end())
+    {
+      for ( const auto &member : region_iter->second)
+      {
+        if(count_registrations(member) == 1){
+          m_backend->deregister_member(member);
+        }
+      }
+      // erase() returns the iterator to the next region.
+      region_iter = regions.erase(region_iter);
+    }
+  }
+}
diff --git a/src/resilience/context/Context.hpp b/src/resilience/context/Context.hpp
new file mode 100644
index 0000000..eff992d
--- /dev/null
+++ b/src/resilience/context/Context.hpp
@@ -0,0 +1,76 @@
+/*
+ *
+ * Kokkos v. 3.0
+ * Copyright (2020) National Technology & Engineering
+ *               Solutions of Sandia, LLC (NTESS).
+ *
+ * Under the terms of Contract DE-NA0003525 with NTESS,
+ * the U.S. Government retains certain rights in this software.
+ * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
+ */
+
+#ifndef INC_RESILIENCE_CONTEXT_HPP
+#define INC_RESILIENCE_CONTEXT_HPP
+
+#include "resilience/registration/Registration.hpp"
+
+#include "ContextBase.hpp"
+
+//#ifdef KR_ENABLE_STDFILE
+//  #include "StdFileContext.hpp"
+//#endif
+
+#ifdef KR_ENABLE_MPI_BACKENDS
+  #include
+  #include "MPIContext.hpp"
+#endif
+
+#ifdef KR_ENABLE_VT
+  #include
+  #include "./vt/VTContext.impl.hpp"
+#endif
+
+//make_context overloads; each is declared only when the matching KR_ENABLE_* macro is defined.
+//All of them take the configuration as a string and return an owning pointer to a ContextBase.
+namespace KokkosResilience {
+#ifdef KR_ENABLE_MPI_BACKENDS
+  std::unique_ptr< ContextBase > make_context( MPI_Comm comm, const std::string &config );
+#endif
+#ifdef KR_ENABLE_VT
+  //theContext just for identifying this context type.
+  std::unique_ptr< ContextBase > make_context(vt::ctx::Context* theContext, const std::string &config );
+#endif
+#ifdef KR_ENABLE_STDFILE
+  std::unique_ptr< ContextBase > make_context( const std::string &config );
+#endif
+}
+
+#endif // INC_RESILIENCE_CONTEXT_HPP
diff --git a/src/resilience/context/ContextBase.hpp b/src/resilience/context/ContextBase.hpp
new file mode 100644
index 0000000..c4c74e7
--- /dev/null
+++ b/src/resilience/context/ContextBase.hpp
@@ -0,0 +1,265 @@
+/*
+ *
+ * Kokkos v. 3.0
+ * Copyright (2020) National Technology & Engineering
+ *               Solutions of Sandia, LLC (NTESS).
+ *
+ * Under the terms of Contract DE-NA0003525 with NTESS,
+ * the U.S. Government retains certain rights in this software.
+ *
+ * Kokkos is licensed under 3-clause BSD terms of use:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3.
Neither the name of the Corporation nor the names of the
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+ */
+#ifndef INC_RESILIENCE_CONTEXTBASE_HPP
+#define INC_RESILIENCE_CONTEXTBASE_HPP
+
+#include
+#if defined(KOKKOS_ENABLE_HPX)
+#include
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "resilience/Config.hpp"
+#include "resilience/CheckpointFilter.hpp"
+#include "resilience/registration/Registration.hpp"
+#include "resilience/view_hooks/ViewHolder.hpp"
+#include "resilience/util/Trace.hpp"
+#include "resilience/backend/AutomaticBase.hpp"
+
+//Label of the region used by register_globally()/deregister_globally().
+#define KR_GLOBAL_REGION_LABEL "__KR_GLOBAL_REGION__"
+
+namespace KokkosResilience
+{
+  //Base class for contexts: tracks which registrations belong to which labelled
+  //region and tells the backend when a registration first appears or finally disappears.
+  class ContextBase
+  {
+  public:
+    using RegionsMap = std::unordered_map>;
+
+    //Handle to one entry of a RegionsMap: a copy of its label plus a pointer to its member set.
+    //A default-constructed Region points at nothing and converts to false.
+    struct Region {
+    private:
+      std::string m_label = "";
+      std::unordered_set* m_members = nullptr;
+    public:
+      Region(RegionsMap::iterator iter) : m_label(iter->first), m_members(&(iter->second)) {};
+      Region() {};
+
+      const std::string& label() const {
+        return m_label;
+      }
+
+      //Dereferences the stored pointer; only valid when the Region converts to true.
+      std::unordered_set& members(){
+        return *m_members;
+      }
+
+      const std::unordered_set& members() const {
+        return *m_members;
+      }
+
+      operator bool() const {
+        return (m_members != nullptr);
+      }
+    };
+
+    explicit ContextBase( Config cfg , int proc_id = 0);
+    explicit ContextBase( const std::string& cfg_filename , int proc_id = 0)
+      : ContextBase(Config{cfg_filename}, proc_id) {};
+
+    virtual ~ContextBase();
+
+    template
+    void run(const std::string& label, int iteration, RegionFunc&& fun, FilterFunc&& filter,
+             Detail::RegInfo&... explicit_members);
+
+    //Same as above, using default_filter() as the filter.
+    template
+    void run(const std::string& label, int iteration, RegionFunc&& fun,
+             Detail::RegInfo&... explicit_members) {
+      run(label, iteration, std::forward(fun), default_filter(), explicit_members...);
+    }
+
+    //Operations every concrete context must provide.
+    virtual bool restart_available( const std::string &label, int version ) = 0;
+    virtual void restart( const std::string& label, int version,
+                          const std::unordered_set< Registration > &members ) = 0;
+    virtual void checkpoint( const std::string &label, int version,
+                             const std::unordered_set< Registration > &members ) = 0;
+
+    virtual int latest_version( const std::string &label ) const noexcept = 0;
+
+    virtual void register_alias( const std::string &original, const std::string &alias ){
+      //TODO
+    };
+
+    void reset();
+
+    //Adds member to the region; the backend is told only when this is the member's
+    //first registration across all regions.
+    virtual void register_member(KokkosResilience::Registration member, Region region){
+      const bool inserted = region.members().insert(member).second;
+      if(inserted && count_registrations(member) == 1) {
+        m_backend->register_member(member);
+      }
+    };
+
+    //Removes member from the region; the backend is told only when no region
+    //contains the member any more.
+    virtual void deregister_member(KokkosResilience::Registration member, Region region){
+      const bool erased = region.members().erase(member);
+      if(erased && count_registrations(member) == 0) {
+        m_backend->deregister_member(member);
+      }
+    }
+
+    //Hooks with empty default implementations.
+    virtual void enter_region(Region region, int version){ };
+    virtual void exit_region(Region region, int version){ };
+
+
+    //Registers member to the named region, creating the region if it does not exist yet.
+    template
+    void register_to(const std::string& region_label, T& member, const std::string& member_label = ""){
+      //operator[] inserts an empty region when absent so that find() below succeeds.
+      regions[region_label];
+      register_member(get_reg(member, member_label), regions.find(region_label));
+    }
+
+    //Registers to belong to every region.
+    template
+    void register_globally(T& member, const std::string& label = ""){
+      register_to(KR_GLOBAL_REGION_LABEL, member, label);
+    }
+
+    //Registers to the active region, requires an active region.
+    template
+    void register_to_active(T& member, const std::string& label = ""){
+      assert(active_region);
+      register_member(get_reg(member, label), active_region);
+    }
+
+    //Registers only if in an active region. Returns true if in an active region.
+    template
+    bool register_if_active(T& member, const std::string& label = ""){
+      if(active_region) register_to_active(member, label);
+      return active_region;
+    }
+
+    //Deregisters member from the named region; like register_to, this creates the
+    //region entry if it does not exist yet.
+    template
+    void deregister_from(const std::string& region_label, T& member, const std::string& member_label = ""){
+      regions[region_label];
+      deregister_member(get_reg(member, member_label), regions.find(region_label));
+    }
+
+    template
+    void deregister_globally(T& member, const std::string& member_label = ""){
+      deregister_from(KR_GLOBAL_REGION_LABEL, member, member_label);
+    }
+
+    //NOTE(review): unlike register_to_active there is no assert here; with no active
+    //region, deregister_member would dereference a null member set - confirm callers.
+    template
+    void deregister_from_active(T& member, const std::string& member_label = ""){
+      deregister_member(get_reg(member, member_label), active_region);
+    }
+
+    //Deregisters only if in an active region. Returns true if in an active region.
+    template
+    bool deregister_if_active(T& member, const std::string& label = ""){
+      if(active_region) deregister_from_active(member, label);
+      return active_region;
+    }
+
+    const std::function< bool( int ) > &default_filter() const noexcept { return m_default_filter; }
+
+    Config &config() noexcept { return m_config; }
+    const Config &config() const noexcept { return m_config; }
+
+    Util::detail::TraceStack  &trace() { return m_trace; };
+
+    //Pointer not guaranteed to remain valid, use immediately & discard.
+    char* get_buffer(size_t minimum_size);
+
+    //Counts how many regions currently contain reg.
+    int count_registrations(const Registration& reg){
+      int n_registrations = 0;
+      for(auto& region : regions){
+        auto& region_members = region.second;
+        auto region_iter = region_members.find(reg);
+
+        if(region_iter != region_members.end()){
+          n_registrations++;
+        }
+      }
+      return n_registrations;
+    }
+
+
+  private:
+    //Traits only does anything for Magistrate serialization
+    template
+    Registration get_reg(T& member, const std::string& label){
+      return create_registration>(*this, member, label);
+    }
+
+    //Overload taking a RegInfo; forwards its member and label.
+    template
+    void register_to_active(Detail::RegInfo& info){
+      register_to_active(info.member, info.label);
+    }
+
+    //Detect views being copied in, register them and any explicitly-listed members.
+    template
+    void detect_and_register(RegionFunc&& fun, Detail::RegInfo... explicit_members);
+
+    void deregister_all_regions();
+    virtual void reset_impl() = 0;
+
+    //Hold onto a buffer per context for de/serializing non-contiguous or non-host views.
+    std::vector buffer;
+
+    Config m_config;
+
+    std::function< bool( int ) > m_default_filter;
+
+    Util::detail::TraceStack m_trace;
+
+  protected:
+    RegionsMap regions;
+    Region active_region = {};
+
+    //Performance helper
+    Region last_region = {};
+
+    std::function< bool( int ) >* active_filter = nullptr;
+
+  public:
+    static ContextBase* active_context;
+
+    int m_pid;
+    AutomaticBackend m_backend;
+  };
+}
+
+#endif  // INC_RESILIENCE_CONTEXTBASE_HPP
diff --git a/src/resilience/AutomaticCheckpoint.cpp b/src/resilience/context/MPIContext.cpp
similarity index 90%
rename from src/resilience/AutomaticCheckpoint.cpp
rename to src/resilience/context/MPIContext.cpp
index 64ebbd2..a29c635 100644
--- a/src/resilience/AutomaticCheckpoint.cpp
+++ b/src/resilience/context/MPIContext.cpp
@@ -38,8 +38,13 @@
  *
  * Questions? Contact Christian R.
Trott (crtrott@sandia.gov) */ -#include "AutomaticCheckpoint.hpp" -namespace KokkosResilience +#include "MPIContext.hpp" + +namespace KokkosResilience { +std::unique_ptr< ContextBase > +make_context( MPI_Comm comm, const std::string &config ) { -} \ No newline at end of file + return std::make_unique(comm, config); +} +} diff --git a/src/resilience/MPIContext.hpp b/src/resilience/context/MPIContext.hpp similarity index 64% rename from src/resilience/MPIContext.hpp rename to src/resilience/context/MPIContext.hpp index ae50ee2..89c42c2 100644 --- a/src/resilience/MPIContext.hpp +++ b/src/resilience/context/MPIContext.hpp @@ -42,23 +42,33 @@ #define INC_KOKKOS_RESILIENCE_MPICONTEXT_HPP #include -#include "Context.hpp" +#include "ContextBase.hpp" namespace KokkosResilience { -template +//Helper for initialization list +namespace { +static int comm_rank(MPI_Comm &comm){ + int rank; + MPI_Comm_rank(comm, &rank); + return rank; +} +} + class MPIContext : public ContextBase { -public: - explicit MPIContext(MPI_Comm comm, Config &cfg) - : ContextBase(cfg), m_comm(comm), m_backend(*this, comm) {} + public: + explicit MPIContext(MPI_Comm comm, const Config &cfg) + : ContextBase(cfg, comm_rank(comm)), m_comm(comm) {} + explicit MPIContext(MPI_Comm comm, const std::string &cfg) + : ContextBase(cfg, comm_rank(comm)), m_comm(comm) {} - MPIContext(const MPIContext &) = delete; - MPIContext(MPIContext &&) noexcept = default; + MPIContext(const MPIContext &) = delete; + MPIContext(MPIContext &&) noexcept = default; - MPIContext &operator=(const MPIContext &) = delete; - MPIContext &operator=(MPIContext &&) noexcept = default; + MPIContext &operator=(const MPIContext &) = delete; + MPIContext &operator=(MPIContext &&) noexcept = default; - virtual ~MPIContext() { + virtual ~MPIContext() { #ifdef KR_ENABLE_TRACING int rank = -1; MPI_Comm_rank(m_comm, &rank); @@ -86,43 +96,45 @@ class MPIContext : public ContextBase { MPI_Comm comm() const noexcept { return m_comm; } - Backend 
&backend() { return m_backend; } - - void register_hashes( - const std::vector &views, - const std::vector &crefs) override { - m_backend.register_hashes(views, crefs); - } + //TODO: Allow configuring global vs local consistency requirements bool restart_available(const std::string &label, int version) override { - return m_backend.restart_available(label, version); + int avail, locally_avail = m_backend->restart_available(label, version); + MPI_Allreduce(&locally_avail, &avail, 1, MPI_INT, MPI_LAND, m_comm); + + return avail; } void restart(const std::string &label, int version, - const std::vector - &views) override { - m_backend.restart(label, version, views); + const std::unordered_set &members) override { + int success = m_backend->restart(label, version, members); + MPI_Allreduce(MPI_IN_PLACE, &success, 1, MPI_INT, MPI_LAND, m_comm); + + //TODO: configurability on this? + // Throwing and using an exception handler seems like it'd + // make for awkwardly-wrapped user code + if(!success) MPI_Abort(m_comm, 1); } void checkpoint(const std::string &label, int version, - const std::vector - &views) override { - m_backend.checkpoint(label, version, views); + const std::unordered_set + &members) override { + //Ignore success status, if we failed we just hope to run long enough to + //succeed next time. 
+ m_backend->checkpoint(label, version, members); } int latest_version(const std::string &label) const noexcept override { - return m_backend.latest_version(label); + int latest = m_backend->latest_version(label); + MPI_Allreduce(MPI_IN_PLACE, &latest, 1, MPI_INT, MPI_MIN, m_comm); + return latest; } - void register_alias( const std::string &original, const std::string &alias ) override { - return m_backend.register_alias( original, alias ); - } +private: - void reset() override { m_backend.reset(); } + void reset_impl() override { } -private: MPI_Comm m_comm; - Backend m_backend; }; } // namespace KokkosResilience diff --git a/src/resilience/StdFileContext.cpp b/src/resilience/context/StdFileContext.cpp similarity index 98% rename from src/resilience/StdFileContext.cpp rename to src/resilience/context/StdFileContext.cpp index 7825301..839604b 100644 --- a/src/resilience/StdFileContext.cpp +++ b/src/resilience/context/StdFileContext.cpp @@ -39,7 +39,7 @@ * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) */ #include "StdFileContext.hpp" -#include "stdfile/StdFileBackend.hpp" +#include "resilience/stdfile/StdFileBackend.hpp" #include #include diff --git a/src/resilience/StdFileContext.hpp b/src/resilience/context/StdFileContext.hpp similarity index 98% rename from src/resilience/StdFileContext.hpp rename to src/resilience/context/StdFileContext.hpp index 0f0f3c8..9534eb3 100644 --- a/src/resilience/StdFileContext.hpp +++ b/src/resilience/context/StdFileContext.hpp @@ -41,7 +41,7 @@ #ifndef INC_KOKKOS_RESILIENCE_STDFILECONTEXT_HPP #define INC_KOKKOS_RESILIENCE_STDFILECONTEXT_HPP -#include "Context.hpp" +#include "ContextBase.hpp" #include @@ -110,15 +110,16 @@ class StdFileContext : public ContextBase { return m_backend.latest_version(label); } - void reset() override { - m_backend.reset(); - } - void register_alias( const std::string &original, const std::string &alias ) override { } private: + + void reset_impl() override { + m_backend.reset(); + } + std::string m_filename; Backend m_backend; }; diff --git a/src/resilience/context/vt/CMakeLists.txt b/src/resilience/context/vt/CMakeLists.txt new file mode 100644 index 0000000..b513577 --- /dev/null +++ b/src/resilience/context/vt/CMakeLists.txt @@ -0,0 +1,2 @@ +target_sources(resilience PRIVATE ${CMAKE_CURRENT_LIST_DIR}/VTContext.cpp + ${CMAKE_CURRENT_LIST_DIR}/ProxyHolder.cpp) diff --git a/src/resilience/context/vt/ProxyHolder.cpp b/src/resilience/context/vt/ProxyHolder.cpp new file mode 100644 index 0000000..2ac2954 --- /dev/null +++ b/src/resilience/context/vt/ProxyHolder.cpp @@ -0,0 +1,256 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. 
+ * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) + */ + +#include "common.hpp" +#include "ProxyHolder.hpp" +#include "VTContext.impl.hpp" +#include "ProxyHolder.impl.hpp" + +namespace KokkosResilience::Context::VT { + +ProxyHolder::ProxyHolder() + : data_reg( + custom_registration( + [](std::ostream& stream){return false;}, + [](std::istream& stream){return false;}, + "Invalid VTProxyHolder" + ) + ) +{ + assert(false && "VTProxyHolder default constructor exists for" \ + "Magistrate serialization, but should not be used!"); +} + +ProxyHolder::~ProxyHolder(){ + ctx->m_backend->deregister_member(metadata_registration()); + ctx->m_backend->deregister_member(data_registration()); + if(listener_id != -1){ + invoke(DEREGISTER_EVENT_LISTENER, listener_id); + } +} + +const std::string& ProxyHolder::label() const { + return data_reg->name; +} + +std::any ProxyHolder::invoke(ProxyAction action){ + return invoker(action, nullptr); +} + +bool ProxyHolder::is_local(){ + return std::any_cast( invoke(CHECK_LOCAL) ); +} + +bool ProxyHolder::is_dynamic(){ + return std::any_cast( invoke(CHECK_DYNAMIC) ); +} + +std::unordered_map& ProxyHolder::deps(){ + wait_if_migrating(); + return _status.deps; +} + +ProxyHolder* ProxyHolder::get_holder(ProxyID index_id) { + assert(index_id.proxy_bits == this->proxy_bits); + + return std::any_cast( + invoke(GET_HOLDER_AT, index_id.index_bits) + ); +} + +vt::EpochType ProxyHolder::fetch_status(const std::string& epoch_label){ + if(fetch_epoch == vt::no_epoch){ + fetch_epoch = vt::theTerm()->makeEpochRooted(epoch_label); + vt::theMsg()->pushEpoch(fetch_epoch); + invoke(FETCH_STATUS); + vt::theMsg()->popEpoch(fetch_epoch); + vt::theTerm()->finishedEpoch(fetch_epoch); + vt::theTerm()->addAction(fetch_epoch, [&]{ + fetch_epoch = vt::no_epoch; + }); + } + + return fetch_epoch; +} + +void ProxyHolder::tracked(bool new_tracked) { + wait_if_migrating(); + _status.tracked = new_tracked; + invoke(SET_TRACKED, new_tracked); +} + +bool ProxyHolder::tracked() { + wait_if_migrating(); 
+ return _status.tracked && !_status.deleted; +} + +void ProxyHolder::deleted(bool new_deleted) { + wait_if_migrating(); + _status.deleted = new_deleted; +} + +bool ProxyHolder::deleted(){ + wait_if_migrating(); + return _status.deleted; +} + + +void ProxyHolder::checkpointed_version(int version){ + wait_if_migrating(); + _status.checkpointed_version = version; + invoke(SET_CHECKPOINTED_VERSION, version); +} + +int ProxyHolder::checkpointed_version(){ + wait_if_migrating(); + return _status.checkpointed_version; +} + + +void ProxyHolder::restarted_version(int version){ + wait_if_migrating(); + invoke(SET_RESTARTED_VERSION, version); +} + +int ProxyHolder::restarted_version(){ + wait_if_migrating(); + return _status.restarted_version; +} + + +void ProxyHolder::modified(){ + invoke(MODIFY); +} + +void ProxyHolder::deregistered(std::string region_label){ + wait_if_migrating(); + invoke(DEREGISTER, region_label); +} + +void ProxyHolder::registered(std::string region_label){ + wait_if_migrating(); + invoke(REGISTER, region_label); +} + + +void ProxyHolder::check_missing(std::unordered_set* missing){ + invoke(CHECK_MISSING, missing); +} + +ProxyStatus ProxyHolder::get_status(){ + wait_if_migrating(); + ProxyStatus to_ret = _status; + to_ret.deps.clear(); + return to_ret; +} + +void ProxyHolder::set_status(const ProxyStatus& new_status){ + _status.tracked = new_status.tracked; + _status.deleted = new_status.deleted; + _status.checkpointed_version = new_status.checkpointed_version; + _status.restarted_version = new_status.restarted_version; +} + +void ProxyHolder::migrated_out(){ + if(migrate_in_epoch != vt::no_epoch){ + //updated status info never got to us, + //so we don't send ours and instead let + //the prior owner's message pass on along + vt::theTerm()->releaseLocalDependency(migrate_in_epoch); + migrate_in_epoch = vt::no_epoch; + return; + } + //Send our status info to the new location. 
+ invoke(MIGRATE_STATUS, _status); +} + +void ProxyHolder::migrated_in(){ + if(got_status_before_migrate){ + got_status_before_migrate = false; + return; + } + + assert(migrate_in_epoch == vt::no_epoch); + migrate_in_epoch = vt::theTerm()->makeEpochRooted( + fmt::format("{} awaiting status after migrating in", *this) + ); + + vt::theTerm()->addLocalDependency(migrate_in_epoch); + vt::theTerm()->finishedEpoch(migrate_in_epoch); +} + +void ProxyHolder::migrated_status(const ProxyStatus& new_status){ + _status = new_status; + + if(migrate_in_epoch == vt::no_epoch){ + got_status_before_migrate = true; + } else { + vt::theTerm()->releaseLocalDependency(migrate_in_epoch); + migrate_in_epoch = vt::no_epoch; + } +} + +void ProxyHolder::wait_if_migrating(){ + if(migrate_in_epoch != vt::no_epoch){ + vt::runSchedulerThrough(migrate_in_epoch); + } +} + +Registration ProxyHolder::data_registration(){ + return data_reg; +} + +Registration ProxyHolder::metadata_registration(){ + using namespace checkpoint; + return custom_registration( + [this](std::ostream& stream) mutable { + serializeToStream(*this, stream); + return bool(stream); + }, + [this](std::istream& stream) mutable { + deserializeInPlaceFromStream(stream, this); + return bool(stream); + }, + label() + "meta" + ); +} + +} diff --git a/src/resilience/context/vt/ProxyHolder.hpp b/src/resilience/context/vt/ProxyHolder.hpp new file mode 100644 index 0000000..161cc84 --- /dev/null +++ b/src/resilience/context/vt/ProxyHolder.hpp @@ -0,0 +1,243 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) + */ + +#ifndef INC_KOKKOS_RESILIENCE_CONTEXT_VT_PROXYHOLDER_HPP +#define INC_KOKKOS_RESILIENCE_CONTEXT_VT_PROXYHOLDER_HPP + +#include +#include +#include +#include //std::function + +#include +#include "./common.hpp" +#include "resilience/registration/Registration.hpp" + +namespace KokkosResilience::Context::VT { + +//Magistrate trait for serializing ProxyStatus dependency info or not +struct BasicCheckpointTrait {}; + +struct ProxyStatus { + bool tracked = true; + bool deleted = false; + + int checkpointed_version = -1; + int restarted_version = -1; + + //dependency ID -> minimum version + std::unordered_map deps; + + template + void serialize(SerT& s) { + s | tracked | deleted | checkpointed_version; + + if(!s.hasTraits(BasicCheckpointTrait()) && tracked && !deleted) { + if(s.isUnpacking()) deps.clear(); + s | deps; + } + } +}; + + +/** + * Type-erasure tool that also holds checkpoint/restart status info. + * + * Note that status info is only maintained at whichever node the proxy + * currently resides at, but the holder is valid anywhere. This means prior + * to using any status info, you must either confirm that the proxy is local + * or fetch the status and wait on the returned epoch. + * + * Functions denoted local are always local, else any given function may + * involve message passing. + * + */ + +class ProxyHolder : public ProxyID { +public: + //Not to be used, for Magistrate cooperation only. + ProxyHolder(); + + template + ProxyHolder(ProxyT proxy, VTContext& context); + + //Should not be copied + ProxyHolder(ProxyHolder&) = delete; + + ~ProxyHolder(); + + //Local. + const std::string& label() const; + + //Local. Collections/ObjGroups are never local. + bool is_local(); + //Local. Check if element belongs to a dynamic Collection. + bool is_dynamic(); + + //Local. 
Construct holder to another member of my group + ProxyHolder* get_holder(ProxyID index_id); + + + + //Fetches updated status, aggregates multiple calls during the window + //of the first. Does not update dependency information. + vt::EpochType fetch_status(const std::string& epoch_label); + + //Should/shouldn't checkpoint data + void tracked(bool); + bool tracked(); //Local. + + //Should delete if present during recovery + //Deleted proxies are untracked, but restore to + //previous state on undeleted + void deleted(bool); + bool deleted(); //Local. + + void checkpointed_version(int version); + int checkpointed_version(); //Local. + + void restarted_version(int version); + int restarted_version(); //Local. + + std::unordered_map& deps(); //Local. + + + + //This proxy will be checkpointed alongside (but separately from) + //the next checkpoint of a region. Implies deleted(false) + void modified(); + + //Notify that this proxy was (de)registered by user. + //Implies modified, but may update tracked/untracked state first. + void registered(std::string region_label); + void deregistered(std::string region_label); + + + //Handle proxy having been migrated. + void migrated_out(); + void migrated_in(); + + //Local. Set new status after migrating in. Some function calls on this + //will be blocked between migrated_in and migrated_status calls + void migrated_status(const ProxyStatus& new_status); + + + //Asynchronously check if this collection element exists, + //and if not add to the set + void check_missing(std::unordered_set* missing); + + + //Registration for the actual proxy data, obeying tracked/untracked/deleted state + Registration data_registration(); + //Registration for just the metadata + Registration metadata_registration(); + + template + void serialize(SerializerT& s); + +protected: + friend VTContext; + + //Invoke a ProxyAction on typed proxy this holds. 
+ //Generally, the member functions above should be + //preferred to directly invoking - std::any_cast + //is inflexible regarding type conversion. + template + std::any invoke(ProxyAction action, ArgT&& arg){ + return invoker(action, arg); + }; + std::any invoke(ProxyAction action); + + //Managed by VTContext, not here. + vt::EpochType restart_epoch = vt::no_epoch; + + //Both local and do not include/change dependency info + // (since deps are only used by the local node) + ProxyStatus get_status(); + void set_status(const ProxyStatus& new_status); + + ProxyStatus _status; +private: + //If element was migrated in and is still being initialized, wait + void wait_if_migrating(); + + //Registration which handles serializing according to state. + Registration data_reg; + + template + Registration build_registration(ProxyT proxy); + + + //Type-erasure lambda. + std::function invoker; + + VTContext* ctx; + + vt::EpochType fetch_epoch = vt::no_epoch; + vt::EpochType migrate_in_epoch = vt::no_epoch; + bool got_status_before_migrate = false; + + //ID for collection event listener registration. + int listener_id = -1; +}; + +template +Registration ProxyHolder::build_registration(ProxyT proxy) { + return custom_registration( + [this, proxy](std::ostream& stream) { + if(is_element() && tracked()){ + checkpoint::serializeToStream(proxy, stream); + } + return bool(stream); + }, + [this, proxy](std::istream& stream) { + if(is_element() && tracked()){ + checkpoint::deserializeInPlaceFromStream(stream, &proxy); + } + return bool(stream); + }, + proxy_label(proxy) + "_impl" + ); +} + +} + +#endif diff --git a/src/resilience/context/vt/ProxyHolder.impl.hpp b/src/resilience/context/vt/ProxyHolder.impl.hpp new file mode 100644 index 0000000..b4aebb0 --- /dev/null +++ b/src/resilience/context/vt/ProxyHolder.impl.hpp @@ -0,0 +1,89 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS).
+ * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) + */ + +#ifndef INC_KOKKOS_RESILIENCE_CONTEXT_VT_PROXYHOLDER_IMPL_HPP +#define INC_KOKKOS_RESILIENCE_CONTEXT_VT_PROXYHOLDER_IMPL_HPP + +#include "ProxyHolder.hpp" + +namespace KokkosResilience::Context::VT { +template +ProxyHolder::ProxyHolder(ProxyT proxy, VTContext& context) + : ProxyID(proxy), + data_reg(build_registration(proxy)), + ctx(&context) +{ + invoker = [this, proxy](ProxyAction action, std::any arg) { + return ctx->action_handler(proxy, *this, action, arg); + }; + ctx->m_backend->register_member(metadata_registration()); + ctx->m_backend->register_member(data_registration()); + + if constexpr(is_col::value and not is_elm::value){ + using ObjT = typename elm_type::type; + + using EventT = vt::vrt::collection::listener::ElementEventEnum; + using IndexT = typename ObjT::IndexType; + listener_id = vt::theCollection()->registerElementListener( + proxy_bits, + [this, proxy](EventT event, IndexT index, vt::NodeType elm_home){ + ctx->handle_element_event(proxy[index], event); + return; + } + ); + } +}; + +template +void ProxyHolder::serialize(SerT& s){ + [[maybe_unused]] const auto old_proxy_bits = proxy_bits; + s | proxy_bits | _status; + assert(old_proxy_bits == proxy_bits); + +/*if(!s.hasTraits(BasicCheckpointTrait()) && (s.isPacking() || s.isUnpacking())) + fmt::print("{}: {} status {} to version {}. Tracked: {}\n", ctx->m_proxy, *this, s.isPacking() ? "Packed" : "Unpacked", _status.checkpointed_version, tracked()); +if(s.hasTraits(BasicCheckpointTrait()) && (s.isPacking() || s.isUnpacking())) + fmt::print("{}: {} basic status {} to version {}\n", ctx->m_proxy, *this, s.isPacking() ? "Packed" : "Unpacked", _status.checkpointed_version);*/ +} + +} + +#endif diff --git a/src/resilience/context/vt/ProxyMap.hpp b/src/resilience/context/vt/ProxyMap.hpp new file mode 100644 index 0000000..9615dfc --- /dev/null +++ b/src/resilience/context/vt/ProxyMap.hpp @@ -0,0 +1,103 @@ +/* + * + * Kokkos v. 
3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) + */ + +#ifndef INC_KOKKOS_RESILIENCE_CONTEXT_VT_PROXYMAP_HPP +#define INC_KOKKOS_RESILIENCE_CONTEXT_VT_PROXYMAP_HPP + +#include +#include "common.hpp" +#include "ProxyHolder.hpp" + +namespace KokkosResilience::Context::VT { + +class ProxyMap { +public: + ProxyMap(VTContext& context) : ctx(context) {}; + + //Get existing, or initialize and get proxy holder for this proxy. + template < + typename ProxyT, + typename enable = typename is_proxy::type + > + ProxyHolder& operator[](ProxyT proxy); + + //Get existing, or attempt to initialize. May return nullptr; + ProxyHolder* operator[](ProxyID proxy_id){ + auto iter = id_to_holder.find(proxy_id); + if(iter != id_to_holder.end()) return &(iter->second); + + auto group_iter = group_to_member_id.find(proxy_id.proxy_bits); + if(group_iter == group_to_member_id.end()) return nullptr; + + auto& group_holder = id_to_holder[group_iter->second]; + return group_holder.get_holder(proxy_id); + } + + //If a registration's hash matches a held proxy's, return pointer to it. + ProxyHolder* operator[](Registration& reg){ + auto iter = hash_to_id.find(reg.hash()); + if(iter == hash_to_id.end()) return nullptr; + + return (*this)[iter->second]; + } + + void add_reg_mapping(size_t reg_hash, ProxyID proxy_id){ + hash_to_id[reg_hash] = proxy_id; + } + + std::unordered_map& map(){ + return id_to_holder; + } + +private: + VTContext& ctx; + + std::unordered_map id_to_holder; + + //Find a proxy ID from the hash of its (core) registration. 
+ std::unordered_map hash_to_id; + + //Find a representative member of a group by its proxy bits + std::unordered_map group_to_member_id; +}; + +} +#endif diff --git a/src/resilience/Cref.hpp b/src/resilience/context/vt/ProxyMap.impl.hpp similarity index 66% rename from src/resilience/Cref.hpp rename to src/resilience/context/vt/ProxyMap.impl.hpp index afba39d..79ab03d 100644 --- a/src/resilience/Cref.hpp +++ b/src/resilience/context/vt/ProxyMap.impl.hpp @@ -38,47 +38,27 @@ * * Questions? Contact Christian R. Trott (crtrott@sandia.gov) */ -#ifndef INC_RESILIENCE_CREF_HPP -#define INC_RESILIENCE_CREF_HPP -#include +#ifndef INC_KOKKOS_RESILIENCE_CONTEXT_VT_PROXYMAP_IMPL_HPP +#define INC_KOKKOS_RESILIENCE_CONTEXT_VT_PROXYMAP_IMPL_HPP -namespace KokkosResilience -{ - namespace Detail - { - struct CrefImpl - { - CrefImpl( void *p, std::size_t s, std::size_t n, const char *_name ) - : ptr( p ), sz( s ), num( n ), name( _name ) - {} +#include "ProxyMap.hpp" +#include "VTContext.hpp" - void *ptr; - std::size_t sz; - std::size_t num; - const char *name; - }; +namespace KokkosResilience::Context::VT { - struct Cref : public CrefImpl - { - using CrefImpl::CrefImpl; +template +ProxyHolder& ProxyMap::operator[](ProxyT proxy){ + auto iter = id_to_holder.find(proxy); + if(iter == id_to_holder.end()){ + iter = id_to_holder.try_emplace(proxy, proxy, ctx).first; + ctx.init_holder(proxy, iter->second); - Cref( const Cref &_other ) - : CrefImpl( _other.ptr, _other.sz, _other.num, _other.name ) - { - if ( check_ref_list ) - check_ref_list->emplace_back( ptr, sz, num, name ); - } - - static std::vector< CrefImpl > *check_ref_list; - }; + group_to_member_id.emplace(ProxyID(proxy).proxy_bits, proxy); } + return iter->second; +} - template< typename T > - auto check_ref( T &_t, const char *_str ) - { - return Detail::Cref{ reinterpret_cast< void * >( &_t ), sizeof( T ), 1, _str }; - } } -#endif // INC_RESILIENCE_CREF_HPP +#endif diff --git a/src/resilience/context/vt/VTContext.cpp 
b/src/resilience/context/vt/VTContext.cpp new file mode 100644 index 0000000..2b5cbbf --- /dev/null +++ b/src/resilience/context/vt/VTContext.cpp @@ -0,0 +1,395 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) + */ + +#include "VTContext.impl.hpp" + +namespace KokkosResilience { + +std::unique_ptr< ContextBase > +make_context( vt::ctx::Context* vt_theCtx, const std::string &config ) +{ + return std::make_unique(config); +} + +} + +namespace KokkosResilience::Context::VT { + +vt::trace::UserEventIDType VTContext::checkpoint_region = vt::trace::no_user_event_id; +vt::trace::UserEventIDType VTContext::checkpoint_wait = vt::trace::no_user_event_id; +vt::trace::UserEventIDType VTContext::serialize_proxy = vt::trace::no_user_event_id; +vt::trace::UserEventIDType VTContext::offset_region = vt::trace::no_user_event_id; +vt::trace::UserEventIDType VTContext::offset_wait = vt::trace::no_user_event_id; + +VTContext::VTContext(const std::string& config_file) + : ContextBase(config_file, vt::theContext()->getNode()), + holders(*this), + phase_begin_hookid(vt::thePhase()->registerHookRooted( + vt::phase::PhaseHook::Start, + [this](){ + in_phase = true; + } + )), + phase_end_hookid(vt::thePhase()->registerHookCollective( + vt::phase::PhaseHook::EndPostMigration, + [this](){ + in_phase = false; + handle_pending_events(); + } + )), + max_iteration_offset(get_config_max_iter_offset()), + contexts_proxy(vt::theObjGroup()->makeCollective(this, "kr::VTContext")) +{ + checkpoint_region = vt::trace::registerEventCollective("CheckpointRegion"); + checkpoint_wait = vt::trace::registerEventCollective("CheckpointWait"); + serialize_proxy = vt::trace::registerEventCollective("SerializeProxy"); + offset_region = vt::trace::registerEventCollective("OffsetIterRegion"); + offset_wait = vt::trace::registerEventCollective("OffsetIterWait"); +} + +VTContext::~VTContext() { + vt::thePhase()->unregisterHook(phase_begin_hookid); + vt::thePhase()->unregisterHook(phase_end_hookid); + vt::theObjGroup()->destroyCollective(contexts_proxy); +} + +void VTContext::restart(const std::string& label, int version, + const std::unordered_set& members) { + m_backend->restart(label, version, 
members); + restart_proxies(label, version); +} + +void VTContext::checkpoint(const std::string& label, int version, + const std::unordered_set& members) { + if(checkpoint_epoch != vt::no_epoch){ + vt::trace::TraceScopedNote trace_obj(fmt::format("previous checkpoint wait {}@{}", label, version), checkpoint_wait); + vt::runSchedulerThrough(checkpoint_epoch); + trace_obj.end(); + } + checkpoint_epoch = vt::no_epoch; + + //Construct checkpoint_epoch after offset iter finised + vt::theTerm()->addAction(offset_iter_epoch, [&, parent_epoch = vt::theTerm()->getEpoch()](){ + vt::theTerm()->pushEpoch(parent_epoch); + checkpoint_proxies(label, version); + vt::theTerm()->popEpoch(parent_epoch); + }); + { + vt::trace::TraceScopedNote trace_obj(fmt::format("offset iter wait {}@{}", label, version), offset_wait); + vt::runSchedulerThrough(offset_iter_epoch); + trace_obj.end(); + } + offset_iter_epoch = vt::no_epoch; + + //Make sure action had a chance to run, then wait for checkpoint_epoch + if(checkpoint_epoch == vt::no_epoch){ + vt::theSched()->runSchedulerWhile([this](){ return checkpoint_epoch == vt::no_epoch; }); + } + if(max_iteration_offset == 0 || true){ + vt::trace::TraceScopedNote trace_obj(fmt::format("checkpoint wait {}@{}", label, version), checkpoint_wait); + vt::runSchedulerThrough(checkpoint_epoch); + checkpoint_epoch = vt::no_epoch; + trace_obj.end(); + } + + //TODO: Re-enable this when needed for non-proxy members + //double region_start_s = vt::timing::getCurrentTime(); + //vt::runSchedulerThrough(region_epochs.front()); + //fmt::print("Waiting for region took {}s\n", vt::timing::getCurrentTime() - region_start_s); + + m_backend->checkpoint(label, version, members); + + //TODO: This should be handled better when using offset iterations. 
+ modified_proxies.clear(); +} + +void VTContext::register_member(Registration member, Region region){ + ProxyHolder* holder = holders[member]; + if(holder) { + holder->registered(region.label()); + + //Don't complete registration if non-local, + //holder will have sent off for locally registering it. + if(holder->is_element() && !holder->is_local()) return; + } + + //Now just do default registration. + ContextBase::register_member(member, region); +} + +void VTContext::deregister_member(Registration member, Region region){ + const bool erased = region.members().erase(member); + const int n_reg = count_registrations(member); + + if(erased && n_reg == 0){ + //If last local registration gone, deregister from backend. + m_backend->deregister_member(member); + } + + if(n_reg == 0){ + //If last local gone, or no locals here in the first place, + //mark proxy as deregistered + ProxyHolder* holder = holders[member]; + if(holder){ + holder->deregistered(region.label()); + } + } +} + +void VTContext::enter_region(Region region, int version){ + if((*active_filter)(version + max_iteration_offset)){ + assert(offset_iter_epoch == vt::no_epoch); + + offset_iter_epoch = vt::theTerm()->makeEpochCollective( + fmt::format("kr:: {}@{} application work", region.label(), version) + ); + + auto trace_obj = std::make_shared( + fmt::format("offset iter region {}@{}", region.label(), version), + offset_region + ); + vt::theTerm()->addAction(offset_iter_epoch, [trace_obj](){ + trace_obj->end(); + }); + + vt::theMsg()->pushEpoch(offset_iter_epoch); + } +} + +void VTContext::exit_region(Region region, int version){ + if((*active_filter)(version + max_iteration_offset)){ + vt::theMsg()->popEpoch(offset_iter_epoch); + vt::theTerm()->finishedEpoch(offset_iter_epoch); + } +} + +size_t VTContext::get_config_max_iter_offset(){ + //Default: each region ran in full immediately after exiting + int max_offset = 0; + + const auto& cfg = this->config()["contexts"]["vt"]; + auto usr_input = 
cfg.get("max_iteration_offset"); + + if(usr_input){ + max_offset = static_cast(usr_input->template as()); + if(vt::theContext()->getNode() == 0) fmt::print("kr::VTContext running with max {} iterations offset\n", max_offset); + } + + return max_offset; +} + +void VTContext::handle_pending_events(){ + while(!pending_element_events.empty()){ + auto info = pending_element_events.front(); + pending_element_events.pop_front(); + + auto holder_ptr = holders[info.first]; + assert(holder_ptr != nullptr); + auto& holder = *holder_ptr; + + using Event = vt::vrt::collection::listener::ElementEventEnum; + Event event = info.second; + + handle_element_event(holder, event); + } +} + +void VTContext::handle_element_event( + ProxyID proxy, + vt::vrt::collection::listener::ElementEventEnum event +) { + auto holder_ptr = holders[proxy]; + assert(holder_ptr != nullptr); + auto& holder = *holder_ptr; + + //fmt::print(stderr, "{} handling {} for {}.\n", m_proxy, event, holder); + + using Event = vt::vrt::collection::listener::ElementEventEnum; + switch(event){ + case Event::ElementCreated: + holders[holder.get_group_id()]->deps()[holder] = -1; + break; + + case Event::ElementDestroyed: + holder.deleted(); + holder.modified(); + break; + + case Event::ElementMigratedOut: + holders[holder.get_group_id()]->deps().erase(holder); + if(in_phase) pending_element_events.push_back(std::make_pair(proxy, event)); + else holder.migrated_out(); + break; + + case Event::ElementMigratedIn: + holders[holder.get_group_id()]->deps()[holder] = -1; + if(in_phase) pending_element_events.push_back(std::make_pair(proxy, event)); + else holder.migrated_in(); + break; + } +} + +void VTContext::checkpoint_proxies(const std::string& region, int version){ + assert(checkpoint_epoch == vt::no_epoch); + + checkpoint_epoch = vt::theTerm()->makeEpochCollective( + fmt::format("{} checkpoint wrapper {}@{}", m_proxy, region, version) + ); + + auto trace_obj = std::make_shared( + fmt::format("Checkpoint region {}@{}", 
region, version), + checkpoint_region + ); + vt::theTerm()->addAction(checkpoint_epoch, [trace_obj](){ + trace_obj->end(); + }); + + vt::theTerm()->pushEpoch(checkpoint_epoch); + + auto epoch_fmt = + fmt::format("{} checkpoint {}@{} gather dependencies of {}", m_proxy, region, version, "{}"); + + auto fetch_epoch_fmt = + fmt::format("{} checkpoint {}@{} fetch status of {}", m_proxy, region, version, "{}"); + + //Fetch status of dependencies for any elements being + //checkpointed, then checkpoint + for(auto& modified_proxy : modified_proxies){ + auto& holder = *holders[modified_proxy]; + + vt::EpochType deps_epoch = vt::theTerm()->makeEpochRooted( + fmt::format(epoch_fmt, holder) + ); + + for(auto& dep : holder.deps()){ + auto* dep_holder_ptr = holders[dep.first]; + assert(dep_holder_ptr != nullptr); + auto& dep_holder = *dep_holder_ptr; + + //Only need to fetch non-local dependencies. + if(dep_holder.is_local()) continue; + auto fetch_epoch = dep_holder.fetch_status( + fmt::format(fetch_epoch_fmt, dep_holder) + ); + vt::theTerm()->addDependency(fetch_epoch, deps_epoch); + } + vt::theTerm()->finishedEpoch(deps_epoch); + + + vt::theTerm()->addLocalDependency(checkpoint_epoch); + + //Once fetches are done, update metadata and checkpoint + vt::theTerm()->addAction(deps_epoch, [this, &holder](){ + if(max_iteration_offset == 0){ + vt::theSched()->enqueue([this, &holder](){ + checkpoint_proxy(holder, checkpoint_epoch); + }); + } else { + vt::theSched()->enqueue([this, &holder](){ + vt::theSched()->getThreadManager()->allocateThreadRun( + [this, &holder](){ + checkpoint_proxy(holder, checkpoint_epoch); + + auto thread_id = vt::sched::ThreadAction::getActiveThreadID(); + vt::theSched()->enqueue([thread_id](){ + vt::theSched()->getThreadManager()-> + deallocateThread(thread_id); + }); + }); + }); + } + }); + } + + vt::theTerm()->popEpoch(checkpoint_epoch); + vt::theTerm()->finishedEpoch(checkpoint_epoch); +} + +void VTContext::checkpoint_proxy(ProxyHolder& holder, 
vt::EpochType epoch){ + vt::theTerm()->pushEpoch(epoch); + + for(auto& dep : holder.deps()){ + auto* dep_holder_ptr = holders[dep.first]; + assert(dep_holder_ptr != nullptr); + auto& dep_holder = *dep_holder_ptr; + + dep.second = dep_holder.checkpointed_version(); + + if(holder.checkpointed_version() != dep_holder.checkpointed_version()) + fmt::print("Warning: kr:: {}@{} depends on {}@{}\n", holder, holder.checkpointed_version(), + dep_holder, dep_holder.checkpointed_version()); + } + + const bool is_global = holder.is_element(); + //Individual elements are checkpointed once, globally. + //Collections have their dependency info checkpointed per-rank + m_backend->checkpoint( + holder.label(), + holder.checkpointed_version(), + {holder.metadata_registration(), holder.data_registration()}, + is_global + ); + + vt::theTerm()->popEpoch(epoch); + vt::theTerm()->releaseLocalDependency(epoch); +} + + +void VTContext::restart_proxies(const std::string& region_label, const int version) { + //Recover anything marked with a checkpointed_version set during + // the default member recovery step, and traverse dependencies. + //We send all messages required for configuring version info, + // then actual restarts are handled as actions attached to the + // operation epoch's finish. + + const std::string label = fmt::format("Restart {}@{}", region_label, version); + vt::runInEpochCollective( label, [&]{ + for(auto& holder_iter : holders.map()){ + auto& holder = holder_iter.second; + + if(holder.checkpointed_version() > holder.restarted_version()) { + holder.restarted_version(holder.checkpointed_version()); + } + } + }); +} + +} diff --git a/src/resilience/context/vt/VTContext.hpp b/src/resilience/context/vt/VTContext.hpp new file mode 100644 index 0000000..a5328f1 --- /dev/null +++ b/src/resilience/context/vt/VTContext.hpp @@ -0,0 +1,224 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). 
+ * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) + */ + +#ifndef INC_KOKKOS_RESILIENCE_CONTEXT_VT_VTCONTEXT_HPP +#define INC_KOKKOS_RESILIENCE_CONTEXT_VT_VTCONTEXT_HPP + +#include +#include +#include +#include +#include +#include + +#include + + +#include "resilience/context/ContextBase.hpp" +#include "resilience/Config.hpp" +#include "common.hpp" +#include "ProxyHolder.hpp" +#include "ProxyMap.hpp" + +namespace KokkosResilience::Context::VT { + class VTContext : public ContextBase { + public: + explicit VTContext(const std::string& config_file); + + VTContext(const VTContext &) = delete; + VTContext(VTContext &&) noexcept = default; + + VTContext &operator=(const VTContext &) = delete; + VTContext &operator=(VTContext &&) noexcept = default; + + virtual ~VTContext(); + + bool restart_available(const std::string &label, int version) override { + //TODO: Also recover this version's required proxy versions and + // verify they're available? + return m_backend->restart_available(label, version); + } + + + void restart(const std::string &label, int version, + const std::unordered_set &members) override; + + void checkpoint(const std::string &label, int version, + const std::unordered_set &members) override; + + int latest_version(const std::string &label) const noexcept override { + return m_backend->latest_version(label); + } + + void reset_impl() override { /* TODO */ } + + void register_member(Registration member, Region region) override; + + void deregister_member(Registration member, Region region) override; + + void enter_region(Region region, int version) override; + + void exit_region(Region region, int version) override; + + //Keep a record of recent VTProxy Registrations to proxy ID, + //for managing (de)registering members that are actually proxies. 
+ template + void add_reg_mapping(size_t hash, T proxy); + + template + ProxyHolder& get_holder(T proxy); + + private: + //Register as a ContextBase member + template + void register_proxy(ProxyT proxy, std::string& region_label); + template + void deregister_proxy(ProxyT proxy, std::string& region_label); + + //Checkpoint/recover actual data and dependencies of proxies + //populates checkpoint_epoch + void checkpoint_proxies(const std::string& label, const int version); + void checkpoint_proxy(ProxyHolder& holder, vt::EpochType epoch); + + //Recursively traverse proxy and its dependencies + template + void restart_proxy(ProxyT proxy, ProxyHolder& holder); + void restart_proxies(const std::string& label, const int version); + + size_t get_config_max_iter_offset(); + + protected: + friend ProxyHolder; + friend ProxyMap; + + template + void init_holder(ProxyT proxy, ProxyHolder& holder); + + template + std::any action_handler( + ProxyT proxy, + ProxyHolder& holder, + ProxyAction action, + std::any arg, + bool remote_request = false + ); + + + private: + //If ProxyT is an element, send to element to find correct local context + //Else, broadcast to contexts. 
+ template + void send_action( + ProxyT proxy, + ProxyAction action, + const ArgT& arg = {} + ); + + //Or send to a specific rank's context + template + void send_action( + int dest, + ProxyT proxy, + ProxyAction action, + const ArgT& arg = {} + ); + + public: + + //Handle action addressed to element, passing back to action_handler + template + static void remote_action_handler( + ObjT* unused, + VTContextProxy ctx_proxy, + ProxyT proxy, + ProxyAction action, + ArgT arg + ); + //As above, but addressed to context + template + void remote_action_handler( + ProxyT proxy, + ProxyAction action, + ArgT arg + ); + + static vt::trace::UserEventIDType checkpoint_region; + static vt::trace::UserEventIDType checkpoint_wait; + static vt::trace::UserEventIDType serialize_proxy; + static vt::trace::UserEventIDType offset_region; + static vt::trace::UserEventIDType offset_wait; + + protected: + //Handle element migrations/insertions/deletions + using ElementEvent = vt::vrt::collection::listener::ElementEventEnum; + void handle_element_event(ProxyID proxy, ElementEvent event); + + private: + //Handle elements from during a phase that had to have handling delayed. + void handle_pending_events(); + + ProxyMap holders; + + //Local proxies known to have been changed since last checkpoint + std::unordered_set modified_proxies; + + //Delay handling migrations due to phase balancing until phase ends. 
+ std::deque> pending_element_events = {}; + + vt::phase::PhaseHookID phase_begin_hookid, phase_end_hookid; + bool in_phase = false; + + + //Epoch for all proxy element checkpoints finished + vt::EpochType checkpoint_epoch = vt::no_epoch; + + //Epoch for offset iter, checkpoint_epoch not guaranteed created until + //this is finished + const size_t max_iteration_offset = 0; + vt::EpochType offset_iter_epoch = vt::no_epoch; + + VTContextProxy contexts_proxy; + VTContextElmProxy m_proxy = contexts_proxy[m_pid]; + }; +} + +#include "VTContext.impl.hpp" +#endif // INC_KOKKOS_RESILIENCE_VTCONTEXT_HPP diff --git a/src/resilience/context/vt/VTContext.impl.hpp b/src/resilience/context/vt/VTContext.impl.hpp new file mode 100644 index 0000000..515c585 --- /dev/null +++ b/src/resilience/context/vt/VTContext.impl.hpp @@ -0,0 +1,543 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. Trott (crtrott@sandia.gov) + */ + +#ifndef INC_KOKKOS_RESILIENCE_CONTEXT_VT_VTCONTEXT_IMPL_HPP +#define INC_KOKKOS_RESILIENCE_CONTEXT_VT_VTCONTEXT_IMPL_HPP + +#include +#include + +#include "common.hpp" + +#include "VTContext.hpp" + +#include "ProxyHolder.impl.hpp" +#include "ProxyMap.impl.hpp" + +namespace KokkosResilience::Context::VT { + template + void VTContext::init_holder(ProxyT proxy, ProxyHolder& holder){ + if constexpr(not is_elm::value) { + //For groups, each node marks its local elements as dependencies + if constexpr(is_obj::value){ + auto local_elm = proxy[vt::theContext()->getNode()]; + holder.deps()[holders[local_elm]] = -1; + } else { + for(auto index : vt::theCollection()->getLocalIndices(proxy)){ + auto local_elm = proxy[index]; + holder.deps()[holders[local_elm]] = -1; + } + } + } + } + + template + void VTContext::add_reg_mapping(size_t hash, ProxyT proxy){ + holders.add_reg_mapping(hash, proxy); + } + + template + ProxyHolder& VTContext::get_holder(ProxyT proxy){ + return holders[proxy]; + } + + template + void VTContext::restart_proxy(ProxyT proxy, ProxyHolder& holder){ + //Restart the holder's metadata for the current version + constexpr bool IS_GLOBAL = 
is_elm::value; + m_backend->restart( + holder.label(), + holder.restarted_version(), + {holder.metadata_registration(), holder.data_registration()}, + IS_GLOBAL + ); + + if(!holder.tracked()) return; + + bool _check_missing = false; + if constexpr((not is_elm::value) and is_col::value) { + //Dynamic collections may have elements that we need to reinsert into the + //collection before we can recover them. + _check_missing = vt::theCollection()->getDynamicMembership< + typename elm_type::type + >(holder.proxy_bits); + } + const bool check_missing = _check_missing; + std::shared_ptr> missing_elms = std::make_shared>(); + + vt::EpochType check_missing_epoch; + if(check_missing) { + //TODO: How do I know asynchronous collective epoch is matched? Is the label enough? + check_missing_epoch = vt::theTerm()->makeEpochCollective( + fmt::format("Detect missing elements of {} @{}", holder, holder.restarted_version()) + ); + } + + + //Go through dependencies and tell them this minimum required version. 
+ for(auto dep_iter : holder.deps()){ + ProxyHolder* dep_holder = holders[dep_iter.first]; + + if(dep_holder == nullptr) { + fmt::print(stderr, "WARNING: could not find {} which {} depends upon!\n", + dep_iter.first, holder); + continue; + } + + const bool dep_local = dep_holder->is_local(); + + if(!dep_local || (dep_holder->restarted_version() < dep_iter.second)){ + dep_holder->restarted_version(dep_iter.second); + } + + if(check_missing && + dep_holder->proxy_bits == holder.proxy_bits && + !dep_local){ + vt::theMsg()->pushEpoch(check_missing_epoch); + dep_holder->check_missing(missing_elms.get()); + vt::theMsg()->popEpoch(check_missing_epoch); + } + } + + if(check_missing){ + vt::theTerm()->addAction(check_missing_epoch, [proxy, missing_elms]{ + if constexpr(is_col::value and not is_elm::value){ + auto token = proxy.beginModification(); + for(auto& elm : *(missing_elms.get())){ + reindex(proxy, elm.index_bits).insert(token); + } + proxy.finishModification(std::move(token)); + } + }); + vt::theTerm()->finishedEpoch(check_missing_epoch); + } + + + //Groups just need to update metadata and request versions + //Elements continue on to restart data + if(!holder.is_element()) return; + + //A newer version may have been requested by the time + //all of the updates finish, so register a restart callback + //but only perform if versions match. 
+ const int m_version = holder.restarted_version(); + + vt::theTerm()->addAction(vt::theTerm()->getEpoch(), [&, this, proxy, m_version] { + if(m_version == holder.restarted_version() && holder.tracked()){ + //Finally, restart the data of the proxy element + this->m_backend->restart( + holder.label(), + holder.restarted_version(), + {holder.data_registration()}, + true + ); + } + }); + } + + template + void VTContext::send_action( + ProxyT proxy, + ProxyAction action, + const ArgT& arg + ) { +#ifdef VTCONTEXT_LOG_SENDS + fmt::print(stderr, "{} sends {} to {}\n", m_proxy, action, proxy); +#endif + + if constexpr(Util::VT::is_elm::value){ + using ObjT = typename Util::VT::elm_type::type; + proxy.template send< + &VTContext::remote_action_handler + >(contexts_proxy, proxy, action, arg); + } else { + contexts_proxy.template broadcast< + &VTContext::remote_action_handler + >(proxy, action, arg); + } + } + + template + void VTContext::send_action( + int dest, + ProxyT proxy, + ProxyAction action, + const ArgT& arg + ) { +#ifdef VTCONTEXT_LOG_SENDS + fmt::print(stderr, "{} sends {} to {}\n", m_proxy, action, contexts_proxy[dest]); +#endif + + contexts_proxy[dest].template send< + &VTContext::remote_action_handler + >(proxy, action, arg); + } + + + template + void VTContext::remote_action_handler( + ObjT* unused, + VTContextProxy ctx_proxy, + ProxyT proxy, + ProxyAction action, + ArgT arg + ) { + VTContext* ctx = vt::theObjGroup()->get(ctx_proxy); +#ifdef VTCONTEXT_LOG_RECEIVES + fmt::print(stderr, "{} recvs {} for {}\n", ctx->m_proxy, action, proxy); +#endif + + constexpr bool REMOTE_REQUEST = true; + ctx->action_handler(proxy, ctx->get_holder(proxy), action, arg, REMOTE_REQUEST); + } + + template + void VTContext::remote_action_handler( + ProxyT proxy, + ProxyAction action, + ArgT arg + ) { +#ifdef VTCONTEXT_LOG_RECEIVES + fmt::print(stderr, "{} recvs {} for {}\n", m_proxy, action, proxy); +#endif + + constexpr bool REMOTE_REQUEST = true; + action_handler(proxy, 
get_holder(proxy), action, arg, REMOTE_REQUEST); + } + + struct ProxyMigrateInfo { + ProxyStatus status; + bool modified; + std::vector registered_regions; + int new_owner; + + template + void serialize(SerT& s){ + s | status | modified | registered_regions; + } + }; + + template + std::any VTContext::action_handler( + ProxyT proxy, + ProxyHolder& holder, + ProxyAction action, + std::any arg, + bool remote_request + ) { + constexpr bool elm = is_elm::value; + constexpr bool group = !elm; + constexpr bool col = is_col::value; + constexpr bool obj = is_obj::value; + + const bool local = is_local(proxy); + + //Group actions should be broadcasted to all contexts + const bool need_broadcast = group && !remote_request; + //Element actions should be sent to the local context + const bool need_unicast = elm && !local; + + //Other than exclusively-local operations + const bool need_send = need_broadcast || need_unicast; + +#ifdef VTCONTEXT_LOG_EVENTS + fmt::print("{} processing {} on {}. Remote: {}, Broadcast: {}, Unicast: {}\n", + m_proxy, action, proxy, remote_request, need_broadcast, need_unicast); +#endif + + switch(action){ + case GET_HOLDER_AT: + return &holders[reindex(proxy, std::any_cast(arg))]; + + case FETCH_STATUS: + if(!local) { + assert(!remote_request); + //Fetch from the proxy's local context + send_action(proxy, action, int(vt::theContext()->getNode())); + } else if(remote_request) { + //Reply to remote requester after checking that my ready epoch is updated + send_action(std::any_cast(arg), proxy, SET_STATUS, holder.get_status()); + } else { + assert(false); + } + return nullptr; + + case SET_STATUS: + assert(remote_request); + //Might be local if fetch_status interrupted by the element migrating here + if(!local) holder.set_status(std::any_cast(arg)); + return nullptr; + + case CHECK_LOCAL: + assert(!remote_request); + return local; + + case SET_TRACKED: + if(need_unicast) + send_action(proxy, action, std::any_cast(arg)); + else if(need_broadcast && 
holder.tracked() != std::any_cast(arg)) + send_action(proxy, action, std::any_cast(arg)); + else if(remote_request) + holder.tracked(std::any_cast(arg)); + return nullptr; + + case SET_CHECKPOINTED_VERSION: + if(need_unicast) send_action(proxy, action, std::any_cast(arg)); + else if(remote_request) holder.checkpointed_version(std::any_cast(arg)); + return nullptr; + + case SET_RESTARTED_VERSION: + if(need_send) { + send_action(proxy, action, std::any_cast(arg)); + } else { + int requested_version = std::any_cast(arg); + bool should_update = requested_version > holder.restarted_version(); + if(should_update) { + holder._status.restarted_version = requested_version; + if(need_broadcast){ + send_action(proxy,action, requested_version); + } + + assert(!elm || local); + restart_proxy(proxy, holder); + } + } + return nullptr; + + case MODIFY: + if(need_unicast){ + send_action(proxy, action, nullptr); + } else if(need_broadcast){ + if(modified_proxies.find(proxy) == modified_proxies.end()){ + send_action(proxy, action, nullptr); + } + } else if(modified_proxies.insert(proxy).second) { + holder.checkpointed_version(holder.checkpointed_version()+1); + if constexpr(not elm) { + if constexpr(obj) { + auto& child_holder = holders[proxy[vt::theContext()->getNode()]]; + if(modified_proxies.insert(child_holder).second) + child_holder.checkpointed_version(child_holder.checkpointed_version()+1); + } else { + for(auto index : vt::theCollection()->getLocalIndices(proxy)){ + auto& child_holder = holders[proxy[index]]; + if(modified_proxies.insert(child_holder).second) + child_holder.checkpointed_version(child_holder.checkpointed_version()+1); + } + } + } + } + return nullptr; + + case REGISTER: + if(need_unicast){ + send_action(proxy, action, std::any_cast(arg)); + } else { + if(remote_request) { + assert(elm); + //Register as a member to ContextBase + this->register_to(std::any_cast(arg), proxy, ""); + } + holder.tracked(true); + holder.modified(); + } + return nullptr; + + case 
DEREGISTER: + if(need_send){ + send_action(proxy, action, std::any_cast(arg)); + } else { + if(remote_request) { + //Deregister as a member to ContextBase + this->deregister_from(std::any_cast(arg), proxy, ""); + } + holder.tracked(false); + holder.modified(); + } + return nullptr; + + case CHECK_DYNAMIC: + if constexpr(col){ + return vt::theCollection()->getDynamicMembership< + typename elm_type::type + >(holder.proxy_bits); + } + return bool(false); + + case CHECK_MISSING: + if constexpr(elm && col) { + if(!local){ + vt::theCollection()->getElementLocation( + proxy, + [proxy, arg](vt::NodeType location){ + if(location == vt::uninitialized_destination){ + std::any_cast*>(arg)->insert(proxy); + } + }, + false + ); + } + } + return nullptr; + + case DEREGISTER_EVENT_LISTENER: + if constexpr(col && !elm) { + using ObjT = typename elm_type::type; + vt::theCollection()->unregisterElementListener( + holder.proxy_bits, std::any_cast(arg) + ); + } else { + assert(false); + } + return nullptr; + + case MIGRATE_STATUS: + assert(col and elm); + if(!local){ + assert(!remote_request); + + ProxyMigrateInfo info; + info.status = std::any_cast(arg); + info.modified = modified_proxies.erase(proxy); + + //Deregister from any ContextBase regions and inform new host of them. 
+ Registration core_reg = create_registration(*this, proxy); + for(auto& region_pair : regions){ + if(region_pair.second.erase(core_reg)){ + info.registered_regions.push_back(region_pair.first); + } + } + if(!info.registered_regions.empty()) m_backend->deregister_member(core_reg); + + send_action(proxy, MIGRATE_STATUS, info); + } else { + assert(remote_request); + ProxyMigrateInfo info = std::any_cast(arg); + holder.migrated_status(info.status); + if(info.modified) modified_proxies.insert(proxy); + + Registration core_reg = create_registration(*this, proxy); + for(std::string& region_label : info.registered_regions){ + regions[region_label].insert(core_reg); + } + if(!info.registered_regions.empty()) m_backend->register_member(core_reg); + } + return nullptr; + } + assert(false); + return nullptr; + } +} + + +//Define how fmt should format some of the things used in logging + +namespace fmt { +template<> +struct formatter + : formatter +{ + using Action = KokkosResilience::Context::VT::ProxyAction; + auto format(const Action& action, format_context& ctx){ + constexpr std::array names = { + KR_VT_PROXY_ACTIONS(KR_VT_ENUM_LIST_STR) + }; + return fmt::format_to(ctx.out(), "{}", names[action]); + } +}; + +template<> +struct formatter<::vt::vrt::collection::listener::ElementEventEnum> + : formatter +{ + using Event = ::vt::vrt::collection::listener::ElementEventEnum; + + static constexpr + std::string_view to_string(const Event& event){ + switch(event) { + case Event::ElementCreated: return "ElementCreated"; + case Event::ElementDestroyed: return "ElementDestroyed"; + case Event::ElementMigratedOut: return "ElementMigratedOut"; + case Event::ElementMigratedIn: return "ElementMigratedIn"; + } + return "Unknown vt::vrt::collection::listener::ElementEventEnum"; + } + + auto format(const Event& event, format_context& ctx){ + return fmt::format_to(ctx.out(), "{}", to_string(event)); + } +}; + +template +struct formatter::type> + : formatter +{ + auto format(ProxyT proxy, 
format_context& ctx){ + return fmt::format_to(ctx.out(), "{}", KokkosResilience::Util::VT::proxy_label(proxy)); + } +}; + +template<> +struct formatter + : formatter +{ + using Holder = KokkosResilience::Context::VT::ProxyHolder; + auto format(const Holder& holder, format_context& ctx){ + return fmt::format_to(ctx.out(), "{}", holder.label()); + } +}; + +template<> +struct formatter + : formatter +{ + using ProxyID = KokkosResilience::Util::VT::ProxyID; + auto format(const ProxyID& proxy_id, format_context& ctx){ + if(proxy_id.is_element()){ + return fmt::format_to(ctx.out(), "ProxyElement({}[{}])", proxy_id.proxy_bits, proxy_id.index_bits); + } else { + return fmt::format_to(ctx.out(), "Proxy({})", proxy_id.proxy_bits); + } + } +}; +} + +#endif diff --git a/src/resilience/context/vt/common.hpp b/src/resilience/context/vt/common.hpp new file mode 100644 index 0000000..231ad4e --- /dev/null +++ b/src/resilience/context/vt/common.hpp @@ -0,0 +1,95 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. Trott (crtrott@sandia.gov) + */ + +#ifndef INC_KOKKOS_RESILIENCE_CONTEXT_VT_COMMON_HPP +#define INC_KOKKOS_RESILIENCE_CONTEXT_VT_COMMON_HPP + +#include + +#include +#include "resilience/util/VTUtil.hpp" + +//#define VTCONTEXT_LOG_EVENTS + +namespace KokkosResilience::Context::VT { + using namespace KokkosResilience::Util::VT; + + //Actions available through the VTContext action handler + //Not all actions are valid for all proxy types. + //Use macros to automatically generate to_string +#define KR_VT_PROXY_ACTIONS(f) \ + f(GET_HOLDER_AT),\ + f(FETCH_STATUS),\ + f(SET_STATUS),\ + f(SET_TRACKED),\ + f(SET_CHECKPOINTED_VERSION),\ + f(SET_RESTARTED_VERSION),\ + f(MODIFY),\ + f(REGISTER),\ + f(DEREGISTER),\ + f(CHECK_LOCAL),\ + f(CHECK_DYNAMIC),\ + f(CHECK_MISSING),\ + f(DEREGISTER_EVENT_LISTENER),\ + f(MIGRATE_STATUS) + +#define KR_VT_ENUM_LIST(x) x +#define KR_VT_ENUM_LIST_STR(x) #x + + enum ProxyAction { + KR_VT_PROXY_ACTIONS(KR_VT_ENUM_LIST) + }; + + //Information about checkpoint/recovery state + struct ProxyStatus; + + //Untyped holder with re-typing capabilities. + //Holds ProxyStatus, manages access. 
+ class ProxyHolder; + + class ProxyMap; + + class VTContext; + using VTContextProxy = VTObj; + using VTContextElmProxy = VTObjElm; +} + +#endif diff --git a/src/resilience/registration/CMakeLists.txt b/src/resilience/registration/CMakeLists.txt new file mode 100644 index 0000000..dca03bc --- /dev/null +++ b/src/resilience/registration/CMakeLists.txt @@ -0,0 +1,4 @@ +target_sources(resilience PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/Registration.cpp + ${CMAKE_CURRENT_LIST_DIR}/Custom.cpp + ) diff --git a/src/resilience/MPIContext.cpp b/src/resilience/registration/Custom.cpp similarity index 69% rename from src/resilience/MPIContext.cpp rename to src/resilience/registration/Custom.cpp index e593bfd..fef91a4 100644 --- a/src/resilience/MPIContext.cpp +++ b/src/resilience/registration/Custom.cpp @@ -38,31 +38,14 @@ * * Questions? Contact Christian R. Trott (crtrott@sandia.gov) */ -#include "MPIContext.hpp" -#ifdef KR_ENABLE_VELOC -#include "veloc/VelocBackend.hpp" -#endif -#include -#include -namespace KokkosResilience { -std::unique_ptr< ContextBase > -make_context( MPI_Comm comm, const std::string &config ) -{ - auto cfg = Config{ config }; - - using fun_type = std::function< std::unique_ptr< ContextBase >() >; - static std::unordered_map< std::string, fun_type > backends = { -#ifdef KR_ENABLE_VELOC - { "veloc", [&](){ return std::make_unique< MPIContext< VeloCMemoryBackend > >( comm, cfg ); } }, - { "veloc-noop", [&](){ return std::make_unique< MPIContext< VeloCRegisterOnlyBackend > >( comm, cfg ); } } -#endif - }; - - auto pos = backends.find( cfg["backend"].as< std::string >() ); - if ( pos == backends.end() ) - return std::unique_ptr< ContextBase >{}; +#include "resilience/registration/Custom.hpp" - return pos->second(); -} +namespace KokkosResilience { + Registration custom_registration( + serializer_t&& s_fun, + deserializer_t&& d_fun, + const std::string label){ + return std::make_shared(std::move(s_fun), std::move(d_fun), label); + } } diff --git 
a/src/resilience/registration/Custom.hpp b/src/resilience/registration/Custom.hpp new file mode 100644 index 0000000..85c13d7 --- /dev/null +++ b/src/resilience/registration/Custom.hpp @@ -0,0 +1,82 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Questions? Contact Christian R. Trott (crtrott@sandia.gov) + */ + +#ifndef _INC_RESILIENCE_REGISTRATION_CUSTOM_HPP +#define _INC_RESILIENCE_REGISTRATION_CUSTOM_HPP + +#include "Registration.hpp" +
+namespace KokkosResilience::Detail {
+  //Registration backed by caller-supplied serialize/deserialize functors.
+  struct CustomRegistration : public RegistrationBase {
+    CustomRegistration() = delete;
+    //The functors arrive as rvalue references; a named rvalue reference is an lvalue, so move explicitly.
+    CustomRegistration(serializer_t&& serializer, deserializer_t&& deserializer, const std::string name) :
+      RegistrationBase(name),
+      m_serializer(std::move(serializer)),
+      m_deserializer(std::move(deserializer)) {};
+
+    const serializer_t serializer() const override{
+      return m_serializer;
+    }
+
+    const deserializer_t deserializer() const override{
+      return m_deserializer;
+    }
+
+    //Compares the addresses of the stored functors, so this only holds for the very same registration object.
+    const bool is_same_reference(const Registration& other_reg) const override{
+      auto other = dynamic_cast<const CustomRegistration*>(other_reg.get());
+      if(!other){
+        //We wouldn't expect this to happen, and it may indicate a hash collision
+        fprintf(stderr, "KokkosResilience: Warning, member name %s is shared by more than 1 registration type\n", name.c_str());
+        return false;
+      }
+      return (&m_serializer == &(other->m_serializer)) && (&m_deserializer == &(other->m_deserializer));
+    }
+
+  private:
+    const serializer_t m_serializer;
+    const deserializer_t m_deserializer;
+  };
+}
+ +#endif diff --git a/src/resilience/registration/Magistrate.hpp b/src/resilience/registration/Magistrate.hpp new file mode 100644 index 0000000..94234f4 --- /dev/null +++ b/src/resilience/registration/Magistrate.hpp @@ -0,0 +1,131 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. Trott (crtrott@sandia.gov) + */ + +#ifndef INC_KOKKOS_RESILIENCE_REGISTRATION_MAGISTRATE_HPP +#define INC_KOKKOS_RESILIENCE_REGISTRATION_MAGISTRATE_HPP + +#include "resilience/registration/Registration.hpp" +#include + +#ifdef KR_ENABLE_VT +#include "resilience/util/VTUtil.hpp" +#endif + +namespace KokkosResilience::Detail { + //Registration for some type which Magistrate knows how to checkpoint. + template + < + typename MemberType, + typename... 
Traits + > + struct MagistrateRegistration : public RegistrationBase { + MagistrateRegistration() = delete; + + MagistrateRegistration(MemberType& member, std::string name) + : RegistrationBase(name), m_member(member) {} + + const serializer_t serializer() const override{ + return [&, this](std::ostream& stream){ + checkpoint::serializeToStream< + Traits... + >(m_member, stream); + return bool(stream); + }; + } + + const deserializer_t deserializer() const override{ + return [&, this](std::istream& stream){ + checkpoint::deserializeInPlaceFromStream< + Traits... + >(stream, &m_member); + return bool(stream); + }; + } + + const bool is_same_reference(const Registration& other_reg) const override{ + auto other = dynamic_cast*>(other_reg.get()); + + if(!other){ + //We wouldn't expect this to happen, and it may indicate a hash collision + fprintf(stderr, "KokkosResilience: Warning, member name %s is shared by more than 1 registration type\n", name.c_str()); + return false; + } + + return &m_member == &other->m_member; + } + + private: + MemberType& m_member; + }; +} + + +namespace KokkosResilience { + template< + typename T, + typename... 
Traits + > + struct create_registration< + T, + std::tuple, + std::enable_if_t< + checkpoint::SerializableTraits< + T, + checkpoint::StreamPacker<> + >::is_traversable +#ifdef KR_ENABLE_VT + and not Util::VT::is_proxy::value +#endif + >* + > { + using BaseT = Detail::MagistrateRegistration; + std::shared_ptr reg; + + create_registration(ContextBase& ctx, T& member, std::string label) + : reg(std::make_shared(member, label)) {}; + + auto get() && { + return std::move(reg); + } + }; +} + +#endif // INC_RESILIENCE_MAGISTRATE_HPP diff --git a/src/resilience/registration/Registration.cpp b/src/resilience/registration/Registration.cpp new file mode 100644 index 0000000..f0f949d --- /dev/null +++ b/src/resilience/registration/Registration.cpp @@ -0,0 +1,13 @@ +#include "Registration.hpp" +#include +#include //isalnum
+namespace KokkosResilience::Detail {
+  //Returns a copy of label in which every character rejected by std::isalnum is replaced by an underscore.
+  std::string sanitized_label(std::string label){
+    //Cast first: std::isalnum is undefined for values not representable as unsigned char (plain char may be negative).
+    for(char& c : label){
+      if(!std::isalnum(static_cast<unsigned char>(c))) c = '_';
+    }
+    return label;
+  }
+}
diff --git a/src/resilience/registration/Registration.hpp b/src/resilience/registration/Registration.hpp new file mode 100644 index 0000000..5fc5e9b --- /dev/null +++ b/src/resilience/registration/Registration.hpp @@ -0,0 +1,143 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) + */ + +#ifndef _INC_RESILIENCE_REGISTRATION_HPP +#define _INC_RESILIENCE_REGISTRATION_HPP + +#include +#include +#include +#include +#include + +namespace KokkosResilience +{ + //Takes a stream as input, returns success flag + using serializer_t = std::function; + using deserializer_t = std::function; + + struct Registration; + + + namespace Detail { + std::string sanitized_label(std::string label); +
+    //Common interface of all registrations; operator== and hash() depend on the sanitized name only.
+    struct RegistrationBase {
+      const std::string name;
+      RegistrationBase() = delete;
+      virtual ~RegistrationBase() = default;
+
+      virtual const serializer_t serializer() const = 0;
+      virtual const deserializer_t deserializer() const = 0;
+      virtual const bool is_same_reference(const Registration&) const = 0;
+
+      bool operator==(const RegistrationBase& other) const {
+        return this->name == other.name;
+      }
+
+      //Polynomial hash of name: the sum of name[i] * 7^i, reduced modulo INT_MAX.
+      //The power is carried in modular integer arithmetic: pow() returns a double, which loses precision
+      //past 2^53 and cannot be converted to an integer type once 7^i exceeds that type's range.
+      virtual const size_t hash(){
+        unsigned long long sum = 0, power = 1;
+        for(const char c : name){
+          sum = (sum + static_cast<unsigned char>(c) * power) % INT_MAX;
+          power = (power * 7) % INT_MAX;
+        }
+        return static_cast<size_t>(sum);
+      }
+
+    protected:
+      RegistrationBase(const std::string &member_name) : name(sanitized_label(member_name)) { }
+    };
+ + + //Helper for explicitly-listing data that a + //checkpoint region should also use. + template + struct RegInfo { + RegInfo(T& member, const std::string label) : member(member), label(label) {}; + T& member; + const std::string label; + }; + } + + + + //A struct convertible to Registration, use as if function returning Registration. 
+ //Generally, register as: create_registration(ContextBase* ctx, T& member, const std::string& label); + //But see various registration headers for any specializations based on member type + template, typename enable = void*> + struct create_registration; + + //Make registration using custom (de)serialize functions + Registration custom_registration(serializer_t&& s_fun, deserializer_t&& d_fun, const std::string label); + + struct Registration : public std::shared_ptr { + template + Registration(std::shared_ptr base) + : std::shared_ptr(std::move(base)) {} + + template + Registration(create_registration reg) + : Registration(std::move(reg).get()) {}; + + const size_t hash() const { + return (*this)->hash(); + } + + bool operator==(const Registration& other) const { + return *(this->get()) == *(other.get()); + } + }; +} //namespace KokkosResilience + +namespace std { + template<> + struct hash{ + size_t operator()(const KokkosResilience::Registration& registration) const { + return registration.hash(); + } + }; +} + +#endif //_INC_RESILIENCE_REGISTRATION_HPP diff --git a/src/resilience/registration/RegistrationHeaders.hpp b/src/resilience/registration/RegistrationHeaders.hpp new file mode 100644 index 0000000..15496e0 --- /dev/null +++ b/src/resilience/registration/RegistrationHeaders.hpp @@ -0,0 +1,19 @@ +#ifndef _INC_RESILIENCE_REGISTRATION_HEADERS_HPP +#define _INC_RESILIENCE_REGISTRATION_HEADERS_HPP + +#include "./Registration.hpp" + +#include "./ViewHolder.hpp" +#include "./Custom.hpp" + +#ifdef KR_ENABLE_MAGISTRATE +#include "./Magistrate.hpp" +#else +#include "./Simple.hpp" +#endif + +#ifdef KR_ENABLE_VT +#include "./VTProxy.hpp" +#endif + +#endif diff --git a/src/resilience/registration/Simple.hpp b/src/resilience/registration/Simple.hpp new file mode 100644 index 0000000..59b8b5e --- /dev/null +++ b/src/resilience/registration/Simple.hpp @@ -0,0 +1,54 @@ +#include "resilience/registration/Registration.hpp" +#include "resilience/view_hooks/ViewHolder.hpp" + 
+namespace KokkosResilience::Detail {
+  //Registers a member by copying its raw bytes (sizeof(MemberType)) to and from the stream.
+  template<typename MemberType>
+  struct SimpleRegistration : public RegistrationBase {
+    SimpleRegistration() = delete;
+    //Forward the caller's label: the base's own `name` member is not constructed yet at this point.
+    SimpleRegistration(MemberType& member, const std::string label)
+      : RegistrationBase(label), m_member(member) {}
+
+    //Returns a functor that writes the member's bytes to the stream and reports stream.good().
+    const serializer_t serializer() const override{
+      return [&, this](std::ostream& stream){
+        stream.write((const char*)&m_member, sizeof(this->m_member));
+        return stream.good();
+      };
+    }
+
+    //Returns a functor that overwrites the member's bytes from the stream and reports stream.good().
+    const deserializer_t deserializer() const override{
+      return [&, this](std::istream& stream){
+        stream.read((char*)&m_member, sizeof(m_member));
+        return stream.good();
+      };
+    }
+
+    //True when other_reg is a SimpleRegistration of this type that refers to the very same member object.
+    const bool is_same_reference(const Registration& other_reg) const override{
+      auto other = dynamic_cast<const SimpleRegistration*>(other_reg.get());
+      if(!other){
+        //We wouldn't expect this to happen, and it may indicate a hash collision
+        fprintf(stderr, "KokkosResilience: Warning, member name %s is shared by more than 1 registration type\n", name.c_str());
+        return false;
+      }
+      return &m_member == &(other->m_member);
+    }
+  private:
+    MemberType& m_member;
+  };
+}
+
+namespace KokkosResilience {
+  //Builds a SimpleRegistration for `member`; get() hands the shared pointer over.
+  template<typename T>
+  struct create_registration<T> {
+    std::shared_ptr<Detail::SimpleRegistration<T>> reg;
+    //ctx is not used here. The pointer has to be created with make_shared: shared_ptr has no (T&, string) constructor.
+    create_registration(ContextBase& ctx, T& member, std::string label)
+      : reg(std::make_shared<Detail::SimpleRegistration<T>>(member, label)) {};
+    auto get() { return std::move(reg); }
+  };
+}
diff --git a/src/resilience/registration/VTProxy.hpp b/src/resilience/registration/VTProxy.hpp new file mode 100644 index 0000000..523cf4a --- /dev/null +++ b/src/resilience/registration/VTProxy.hpp @@ -0,0 +1,82 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. 
+ * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) + */ + +#ifndef INC_KOKKOS_RESILIENCE_REGISTRATION_VTPROXY_HPP +#define INC_KOKKOS_RESILIENCE_REGISTRATION_VTPROXY_HPP + +#include + +#include + +#include "resilience/registration/Registration.hpp" +#include "resilience/context/vt/VTContext.hpp" +#include "resilience/util/VTUtil.hpp" + +namespace KokkosResilience { + template + struct create_registration, typename Util::VT::is_proxy::type>{ + std::shared_ptr reg; + + create_registration(ContextBase& context, T& proxy, std::string label = ""){ + using namespace Context::VT; + + label = proxy_label(proxy); + + auto vtCtx = dynamic_cast(&context); + if(vtCtx){ + //VTContext handles checkpointing the actual proxy, just register a small metadata member. + auto& proxy_holder = vtCtx->get_holder(proxy); + reg = std::make_shared> + (proxy_holder, label); + + //If deregistering, vtCtx needs help going from registration to ProxyID + vtCtx->add_reg_mapping(reg->hash(), proxy); + } else { + //Register the full proxy, making sure to include CheckpointTrait + reg = std::make_shared>(proxy, label); + } + } + + auto get(){return reg;} + }; +} + +#endif diff --git a/src/resilience/registration/ViewHolder.hpp b/src/resilience/registration/ViewHolder.hpp new file mode 100644 index 0000000..c162963 --- /dev/null +++ b/src/resilience/registration/ViewHolder.hpp @@ -0,0 +1,83 @@ +#ifndef INC_RESILIENCE_REGISTRATION_VIEWHOLDER_HPP +#define INC_RESILIENCE_REGISTRATION_VIEWHOLDER_HPP + +#include "Registration.hpp" +#include "resilience/view_hooks/ViewHolder.hpp" +#include "resilience/context/ContextBase.hpp" + +namespace KokkosResilience::Detail { +struct ViewHolderRegistration : public RegistrationBase { + ViewHolderRegistration() = delete; + + ViewHolderRegistration(ContextBase& ctx, const KokkosResilience::ViewHolder& view) : + RegistrationBase(view->label()), m_view(view), m_ctx(ctx) {}; + + const serializer_t serializer() const override{ + return [&, this](std::ostream& stream){ + size_t buffer_size = 
need_buffer ? m_view->data_type_size()*m_view->span() : 0; + char* buf = m_ctx.get_buffer(buffer_size); + + m_view->serialize(stream, buf); + return stream.good(); + }; + } + + const deserializer_t deserializer() const override{ + return [&, this](std::istream& stream){ + size_t buffer_size = need_buffer ? m_view->data_type_size()*m_view->span() : 0; + char* buf = m_ctx.get_buffer(buffer_size); + + m_view->deserialize(stream, buf); + return stream.good(); + }; + } + + const bool is_same_reference(const Registration& other_reg) const override{ + auto other = dynamic_cast(other_reg.get()); + + if(!other){ + //We wouldn't expect this to happen, and it may indicate a hash collision + fprintf(stderr, "KokkosResilience: Warning, member name %s is shared by more than 1 registration type\n", name.c_str()); + return false; + } + + //Handle subviews! We want to checkpoint the largest view/subview, so report that the other is + //the same reference if they're a subset of me. + // + //TODO: This currently assumes the two views are equal or subviews (ie no name collisions), + // and that a larger data() pointer implies a subview (ie we can deal well with subviews of + // subviews, but not two different subviews of the same view). Does Kokkos expose anything + // that can help with this? 
+ return m_view->data() <= other->m_view->data(); + } + +private: + const KokkosResilience::ViewHolder m_view; + + const bool need_buffer = + #ifdef KR_ENABLE_MAGISTRATE + false; + #else + !(m_view->span_is_contiguous() && m_view->is_host_space()); + #endif + + ContextBase& m_ctx; +}; +} + +namespace KokkosResilience { + template //Unused + struct create_registration{ + using RegT = Detail::ViewHolderRegistration; + std::shared_ptr reg; + + create_registration(ContextBase& ctx, const KokkosResilience::ViewHolder& view, std::string unused = "") + : reg(std::make_shared(ctx, view)) {}; + + auto get() { + return std::move(reg); + } + }; +} + +#endif diff --git a/src/resilience/stdfile/StdFileBackend.cpp b/src/resilience/stdfile/StdFileBackend.cpp deleted file mode 100644 index 6a90474..0000000 --- a/src/resilience/stdfile/StdFileBackend.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* - * - * Kokkos v. 3.0 - * Copyright (2020) National Technology & Engineering - * Solutions of Sandia, LLC (NTESS). - * - * Under the terms of Contract DE-NA0003525 with NTESS, - * the U.S. Government retains certain rights in this software. - * - * Kokkos is licensed under 3-clause BSD terms of use: - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the Corporation nor the names of the - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Questions? Contact Christian R. Trott (crtrott@sandia.gov) - */ -#include "StdFileBackend.hpp" - -#include -#include -#include - -#include - -#include "../AutomaticCheckpoint.hpp" - -#ifdef KR_ENABLE_TRACING -#include "../util/Trace.hpp" -#endif - -namespace KokkosResilience { - -namespace detail { - -std::string versionless_filename(std::string const &filename, std::string const &label) { - return filename + "." + label; -} - -std::string full_filename(std::string const &filename, std::string const &label, - int version) { - return versionless_filename(filename, label) + "." 
+ std::to_string(version); -} -} // namespace detail - -StdFileBackend::StdFileBackend(StdFileContext &ctx, - std::string const &filename) - : m_filename(filename), m_context(ctx) {} - -StdFileBackend::~StdFileBackend() = default; - -void StdFileBackend::checkpoint( - const std::string &label, int version, - const std::vector< KokkosResilience::ViewHolder > &views) { - try { - std::string filename = detail::full_filename(m_filename, label, version); - std::ofstream file(filename, std::ios::binary); - -#ifdef KR_ENABLE_TRACING - auto write_trace = - Util::begin_trace>(m_context, "write"); -#endif - for (auto &&v : views) { - char *bytes = static_cast(v->data()); - std::size_t len = v->span() * v->data_type_size(); - - file.write(bytes, len); - } -#ifdef KR_ENABLE_TRACING - write_trace.end(); -#endif - } catch (...) { - } -} - -bool StdFileBackend::restart_available(const std::string &label, int version) { - std::string filename = detail::full_filename(m_filename, label, version); - return std::filesystem::exists(filename); -} - -int StdFileBackend::latest_version(const std::string &label) const noexcept { - int result = -1; - std::string filename = detail::versionless_filename(m_filename, label); - std::filesystem::path dir(filename); - - filename = dir.filename().string(); - - dir = std::filesystem::absolute(dir).parent_path(); - - for(auto &entry : std::filesystem::directory_iterator(dir)){ - if (!std::filesystem::is_regular_file(entry)) { - continue; - } - if(filename == entry.path().filename().stem().string()){ - //This is a checkpoint, probably. - try{ - int vers = std::stoi(entry.path().filename().extension().string().substr(1)); - result = std::max(result,vers); - } catch(...) { - //Just not the filename format we expected, could be unrelated. 
- } - } - } - return result; -} - -void StdFileBackend::restart( - const std::string &label, int version, - const std::vector< KokkosResilience::ViewHolder > &views) { - try { - std::string filename = detail::full_filename(m_filename, label, version); - std::ifstream file(filename, std::ios::binary); - -#ifdef KR_ENABLE_TRACING - auto read_trace = - Util::begin_trace>(m_context, "read"); -#endif - for (auto &&v : views) { - char *bytes = static_cast(v->data()); - std::size_t len = v->span() * v->data_type_size(); - - file.read(bytes, len); - } -#ifdef KR_ENABLE_TRACING - read_trace.end(); -#endif - } catch (...) { - } -} -} // namespace KokkosResilience diff --git a/src/resilience/stdio/StdFileSpace.hpp b/src/resilience/stdio/StdFileSpace.hpp index d60f5df..efb67ae 100644 --- a/src/resilience/stdio/StdFileSpace.hpp +++ b/src/resilience/stdio/StdFileSpace.hpp @@ -48,7 +48,7 @@ #include #include -#include "resilience/filesystem/ExternalIOInterface.hpp" +#include "resilience/backend/filesystem/ExternalIOInterface.hpp" #include diff --git a/src/resilience/util/CMakeLists.txt b/src/resilience/util/CMakeLists.txt index dba0ad8..0a5c400 100644 --- a/src/resilience/util/CMakeLists.txt +++ b/src/resilience/util/CMakeLists.txt @@ -1,3 +1,9 @@ target_sources(resilience PRIVATE ${CMAKE_CURRENT_LIST_DIR}/Trace.cpp - ) \ No newline at end of file + ) + +if(KR_ENABLE_VT) + target_sources(resilience PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/VTUtil.cpp + ) +endif() diff --git a/src/resilience/util/Trace.hpp b/src/resilience/util/Trace.hpp index ce5eb53..aedf12b 100644 --- a/src/resilience/util/Trace.hpp +++ b/src/resilience/util/Trace.hpp @@ -52,6 +52,7 @@ #include #include +#include #include "Timer.hpp" @@ -162,7 +163,9 @@ namespace KokkosResilience public: TraceStack() + #ifdef KR_ENABLE_TRACING : m_current( nullptr ) + #endif {} ~TraceStack() = default; @@ -175,6 +178,7 @@ namespace KokkosResilience void push( std::unique_ptr< TraceBase > &&tr ) { + #ifdef KR_ENABLE_TRACING if ( 
m_current ) { m_current = m_current->add_child( std::move( tr ) ); @@ -182,10 +186,12 @@ namespace KokkosResilience m_traces.emplace_back( std::move( tr ) ); m_current = m_traces.back().get(); } + #endif } void try_pop( TraceBase *tr ) { + #ifdef KR_ENABLE_TRACING if ( !tr ) return; @@ -200,10 +206,12 @@ namespace KokkosResilience m_current->end(); m_current = tr->parent(); } + #endif } std::ostream &write( std::ostream &strm ) { + #ifdef KR_ENABLE_TRACING picojson::object root; picojson::array traces; @@ -220,14 +228,16 @@ namespace KokkosResilience val.serialize( std::ostream_iterator< char >( strm ), true ); + #endif return strm; } private: - + #ifdef KR_ENABLE_TRACING std::vector< std::unique_ptr< TraceBase > > m_traces; TraceBase *m_current; + #endif }; @@ -338,6 +348,10 @@ namespace KokkosResilience return TraceShell(); } #endif + template< template typename TraceTempl, typename Context, typename... Args> + auto begin_trace(Context& ctx, Args&&... args){ + return begin_trace, Context, Args...>(ctx, std::move(args...)); + } template< typename Id > class TimingTrace : public Trace< Id > @@ -396,6 +410,12 @@ namespace KokkosResilience return ret; } + void end() override + { + Kokkos::fence(); + TimingTrace::end(); + } + private: int m_iteration; diff --git a/src/resilience/util/VTUtil.cpp b/src/resilience/util/VTUtil.cpp new file mode 100644 index 0000000..c3dc4ab --- /dev/null +++ b/src/resilience/util/VTUtil.cpp @@ -0,0 +1,62 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) + */ + +#include "VTUtil.hpp" + +namespace KokkosResilience::Util::VT { + void delaySerializeUntil(vt::EpochType epoch){ + if(!vt::sched::ThreadAction::isThreadActive()){ + vt::runSchedulerThrough(epoch); + } else { + auto thread_id = vt::sched::ThreadAction::getActiveThreadID(); + vt::theTerm()->addAction(epoch, [thread_id](){ + vt::theSched()->enqueue([thread_id](){ + vt::theSched()->getThreadManager()->getThread(thread_id)->resume(); + }); + }); + + vt::EpochType parent_epoch = vt::theTerm()->getEpoch(); + vt::theTerm()->popEpoch(parent_epoch); + vt::sched::ThreadAction::suspend(); + vt::theTerm()->pushEpoch(parent_epoch); + } + } +} diff --git a/src/resilience/util/VTUtil.hpp b/src/resilience/util/VTUtil.hpp new file mode 100644 index 0000000..c4e5edb --- /dev/null +++ b/src/resilience/util/VTUtil.hpp @@ -0,0 +1,303 @@ +/* + * + * Kokkos v. 3.0 + * Copyright (2020) National Technology & Engineering + * Solutions of Sandia, LLC (NTESS). + * + * Under the terms of Contract DE-NA0003525 with NTESS, + * the U.S. Government retains certain rights in this software. + * + * Kokkos is licensed under 3-clause BSD terms of use: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Corporation nor the names of the + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Questions? Contact Christian R. Trott (crtrott@sandia.gov) + */ + +#ifndef INC_KOKKOS_RESILIENCE_UTIL_VT_HPP +#define INC_KOKKOS_RESILIENCE_UTIL_VT_HPP + +#include +#include +#include + +namespace KokkosResilience::Util::VT { + template + using VTCol = vt::vrt::collection::CollectionProxy; + template + using VTColElm = vt::vrt::collection::VrtElmProxy; + template + using VTObj = vt::objgroup::proxy::Proxy; + template + using VTObjElm = vt::objgroup::proxy::ProxyElm; + + //Get the type of the actual elements referenced by the proxy. 
+ template + struct _elm_type; + + template + struct _elm_type> { + using type = T; + }; + template + struct _elm_type>{ + using type = T; + }; + template + struct _elm_type>{ + using type = T; + }; + template + struct _elm_type>{ + using type = T; + }; + + template + struct elm_type : public _elm_type::type> {}; + + + //Any collection or its elements + template + struct _is_col { static constexpr bool value = false; }; + + template + struct _is_col, as> { + using type = as; + static constexpr bool value = true; + }; + + template + struct _is_col, as> : public _is_col, as> {}; + + template + struct is_col : public _is_col::type, as> {}; + + + //Any objgroup or its elements + template + struct _is_obj { static constexpr bool value = false; }; + + template + struct _is_obj, as> { + using type = as; + static constexpr bool value = true; + }; + + template + struct _is_obj, as> : public _is_obj, as> {}; + + template + struct is_obj : public _is_obj::type, as> {}; + + + //Element of any objgroup/collection + template + struct _is_elm { static constexpr bool value = false; }; + + template + struct _is_elm, as> { + using type = as; + static constexpr bool value = true; + }; + + template + struct _is_elm, as> { + using type = as; + static constexpr bool value = true; + }; + + template + struct is_elm : public _is_elm::type, as> {}; + + + //Any objgroup/collection and their elements + template + struct is_proxy { static constexpr bool value = false; }; + + template + struct is_proxy::value or is_obj::value, void*> + > { + using type = as; + static constexpr bool value = true; + }; + + + struct ProxyID { + template< + typename ProxyT, + typename enable = typename is_proxy::type + > + ProxyID(ProxyT&& proxy) : + proxy_bits(get_proxy_bits(std::forward(proxy))), + index_bits(get_index_bits(std::forward(proxy))) { }; + + ProxyID() = default; + + bool operator==(const ProxyID& other) const { + return proxy_bits == other.proxy_bits && index_bits == other.index_bits; + } + + bool 
is_element() const { + return index_bits != uint64_t(-1); + } + + template + void serialize(SerializerT& s){ + s | proxy_bits | index_bits; + } + + uint64_t proxy_bits; + uint64_t index_bits; + + //The indexless collection/objgroup from + //an element within it. + ProxyID get_group_id(){ + ProxyID group = *this; + group.index_bits = -1; + return group; + } + + private: + template + uint64_t get_proxy_bits(ProxyT&& proxy){ + if constexpr(is_col::value and is_elm::value){ + return proxy.getCollectionProxy(); + } else { + return proxy.getProxy(); + } + } + + template + uint64_t get_index_bits(ProxyT&& proxy){ + if constexpr(not is_elm::value){ + return -1; + } else if constexpr(is_col::value){ + return proxy.getElementProxy().getIndex().uniqueBits(); + } else { + return proxy.getNode(); + } + } + }; +} + +namespace std { + //Hash as if it were just a tuple. + template<> + struct hash { + size_t operator()(const KokkosResilience::Util::VT::ProxyID& id) const { + return hash< std::tuple >()( + make_tuple(id.proxy_bits, id.index_bits) + ); + } + }; +} + +namespace KokkosResilience::Util::VT { + + template::type> + std::string proxy_label(ProxyT proxy, const ProxyID& id){ + std::string label; + + if constexpr (is_col::value) { + label = vt::theCollection()->getLabel(id.proxy_bits); + if constexpr (is_elm::value) { + label += proxy.getIndex().toString(); + } + } else { + if constexpr (is_elm::value) { + label = vt::theObjGroup()->getLabel(vt::theObjGroup()->proxyGroup(proxy)); + label += "[" + std::to_string(proxy.getNode()) + "]"; + } else { + label = vt::theObjGroup()->getLabel(proxy); + } + } + return label; + } + + template + inline std::string proxy_label(ProxyT proxy){ + return proxy_label(proxy, proxy); + } + + template + inline bool is_local(ProxyT&& proxy){ + if constexpr(not is_elm::value){ + return false; + } else if constexpr(is_col::value){ + return proxy.tryGetLocalPtr() != nullptr; + } else { + return proxy.getNode() == vt::theContext()->getNode(); + } + } + 
+ //Send if proxy is an element, else broadcast + template + inline void msg(ProxyT proxy, MsgT& msg){ + if constexpr(is_elm::value) { + proxy.template send(msg); + } else { + proxy.template broadcast(msg); + } + } + + //Returns a proxy guaranteed to be a Collection/ObjGroup + template + auto deindex(ProxyT proxy){ + if constexpr(not is_elm::value){ + return proxy; + } else if constexpr(is_col::value){ + using ObjT = typename elm_type::type; + return VTCol(proxy.getCollectionProxy()); + } else { + return vt::theObjGroup()->proxyGroup(proxy); + } + } + + //Returns proxy belonging to same group but with the given index + template + auto reindex(ProxyT proxy, uint64_t index) { + if constexpr(is_col::value){ + //Collections indexed by custom type, whose storage is type-defined + using IndexT = typename elm_type::type::IndexType; + IndexT typed_index = IndexT::uniqueBitsToIndex(index); + + return deindex(proxy)[typed_index]; + } else { + //ObjGroups indexed by rank, which is directly stored as uint64_t + return deindex(proxy)[index]; + } + } + + + //Optimizes for case of being called within a serialize user thread. + void delaySerializeUntil(vt::EpochType epoch); +} + + +#endif diff --git a/src/resilience/veloc/VelocBackend.cpp b/src/resilience/veloc/VelocBackend.cpp deleted file mode 100644 index 30d87ea..0000000 --- a/src/resilience/veloc/VelocBackend.cpp +++ /dev/null @@ -1,412 +0,0 @@ -/* - * - * Kokkos v. 3.0 - * Copyright (2020) National Technology & Engineering - * Solutions of Sandia, LLC (NTESS). - * - * Under the terms of Contract DE-NA0003525 with NTESS, - * the U.S. Government retains certain rights in this software. - * - * Kokkos is licensed under 3-clause BSD terms of use: - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the Corporation nor the names of the - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) - */ -#include "VelocBackend.hpp" - -#include -#include -#include -#include - -#include "../MPIContext.hpp" -#include "../AutomaticCheckpoint.hpp" - -#ifdef KR_ENABLE_TRACING - #include "../util/Trace.hpp" -#endif - -#define VELOC_SAFE_CALL( call ) KokkosResilience::veloc_internal_safe_call( call, #call, __FILE__, __LINE__ ) - -namespace KokkosResilience -{ - namespace - { - void veloc_internal_error_throw( int e, const char *name, const char *file, int line = 0 ) - { - std::ostringstream out; - out << name << " error: VELOC operation failed"; - if ( file ) - { - out << " " << file << ":" << line; - } - - // TODO: implement exception class - //Kokkos::Impl::throw_runtime_exception( out.str() ); - } - - inline void veloc_internal_safe_call( int e, const char *name, const char *file, int line = 0 ) - { - if ( VELOC_SUCCESS != e ) - veloc_internal_error_throw( e, name, file, line ); - } - } - - VeloCMemoryBackend::VeloCMemoryBackend(ContextBase &ctx, MPI_Comm mpi_comm) - : m_context(&ctx), m_last_id(0) { - const auto &vconf = m_context->config()["backends"]["veloc"]["config"].as< std::string >(); - VELOC_SAFE_CALL( VELOC_Init( mpi_comm, vconf.c_str() ) ); - } - - VeloCMemoryBackend::~VeloCMemoryBackend() - { - VELOC_Checkpoint_wait(); - VELOC_Finalize( false ); - } - - void VeloCMemoryBackend::checkpoint( const std::string &label, int version, - const std::vector< KokkosResilience::ViewHolder > &_views ) - { - bool status = true; - - // Check if we need to copy any views to backing store - for ( auto &&view : _views ) - { - std::string label = get_canonical_label( view->label() ); - - if ( !view->span_is_contiguous() || !view->is_host_space() ) - { - auto pos = m_registry.find( label ); - if ( pos != m_registry.end()) - { - view->deep_copy_to_buffer( pos->second.buff.data() ); - assert( pos->second.buff.size() == view->data_type_size() * view->span() ); - } - } - } - - VELOC_SAFE_CALL( VELOC_Checkpoint_wait() ); - - VELOC_SAFE_CALL( 
VELOC_Checkpoint_begin( label.c_str(), version ) ); - - VELOC_SAFE_CALL( VELOC_Checkpoint_mem() ); - - VELOC_SAFE_CALL( VELOC_Checkpoint_end( status )); - - m_latest_version[label] = version; - } - - bool - VeloCMemoryBackend::restart_available( const std::string &label, int version ) - { - // res is < 0 if no versions available, else it is the latest version - return version == latest_version( label ); - } - - int - VeloCMemoryBackend::latest_version( const std::string &label ) const noexcept - { - auto lab = get_canonical_label( label ); - auto latest_iter = m_latest_version.find( lab ); - if ( latest_iter == m_latest_version.end() ) - { - auto test = VELOC_Restart_test(lab.c_str(), 0); - m_latest_version[lab] = test; - return test; - } else { - return latest_iter->second; - } - } - - void - VeloCMemoryBackend::restart( const std::string &label, int version, - const std::vector< KokkosResilience::ViewHolder > &_views ) - { - auto lab = get_canonical_label( label ); - VELOC_SAFE_CALL( VELOC_Restart_begin( lab.c_str(), version )); - - bool status = true; - - VELOC_SAFE_CALL( VELOC_Recover_mem() ); - - VELOC_SAFE_CALL( VELOC_Restart_end( status ) ); - - // Check if we need to copy any views from the backing store back to the view - for ( auto &&view : _views ) - { - auto vl = get_canonical_label( view->label() ); - if ( !view->span_is_contiguous() || !view->is_host_space() ) - { - auto pos = m_registry.find( vl ); - if ( pos != m_registry.end() ) - { - assert( pos->second.buff.size() == view->data_type_size() * view->span() ); - view->deep_copy_from_buffer( pos->second.buff.data() ); - } - } - } - } - - void - VeloCMemoryBackend::reset() - { - for ( auto &&vr : m_registry ) - { - VELOC_Mem_unprotect( vr.second.id ); - } - - m_registry.clear(); - - m_latest_version.clear(); - m_alias_map.clear(); - } - - void - VeloCMemoryBackend::register_hashes( const std::vector< KokkosResilience::ViewHolder > &views, - const std::vector< Detail::CrefImpl > &crefs ) - { - // Clear 
protected bits - for ( auto &&p : m_registry ) - { - p.second.protect = false; - } - - for ( auto &&view : views ) - { - if ( !view->data() ) // uninitialized view - continue; - - std::string label = get_canonical_label( view->label() ); - auto iter = m_registry.find( label ); - - // Attempt to find the view in our registry - if ( iter == m_registry.end() ) - { - // Calculate id using hash of view label - int id = ++m_last_id; // Prefix since we will consider id 0 to be no-id - iter = m_registry.emplace( std::piecewise_construct, - std::forward_as_tuple( label ), - std::forward_as_tuple( id ) ).first; - iter->second.element_size = view->data_type_size(); - iter->second.size = view->span(); - - if ( !view->is_host_space() || !view->span_is_contiguous() ) - { - // Can't reference memory directly, allocate memory for a watch buffer - iter->second.buff.assign( iter->second.size * iter->second.element_size, 0x00 ); - iter->second.ptr = iter->second.buff.data(); - } else { - iter->second.ptr = view->data(); - } - } - - // iter now pointing to our entry - iter->second.protect = true; - } - - // Register crefs - for ( auto &&cref : crefs ) - { - if ( !cref.ptr ) // uninitialized view - continue; - // If we haven't already register, register with VeloC - auto iter = m_registry.find( cref.name ); - if ( iter == m_registry.end()) - { - int id = ++m_last_id; // Prefix since we will consider id 0 to be no-id - iter = m_registry.emplace( std::piecewise_construct, - std::forward_as_tuple( cref.name ), - std::forward_as_tuple( id ) ).first; - - iter->second.ptr = cref.ptr; - iter->second.size = cref.num; - iter->second.element_size = cref.sz; - } - - iter->second.protect = true; - } - - // Register everything protected, unregister anything unprotected - for ( auto &&p : m_registry ) - { - if ( p.second.protect ) - { - if ( !p.second.registered ) - { - std::cout << "Protecting memory id " << p.second.id << " with label " << p.first << '\n'; - VELOC_SAFE_CALL( VELOC_Mem_protect( 
p.second.id, p.second.ptr, p.second.size, p.second.element_size ) ); - p.second.registered = true; - } - } else { //deregister - if ( p.second.registered ) - { - std::cout << "Unprotecting memory id " << p.second.id << " with label " << p.first << '\n'; - VELOC_Mem_unprotect( p.second.id ); - p.second.registered = false; - } - } - } - } - - void - VeloCMemoryBackend::register_alias( const std::string &original, const std::string &alias ) - { - m_alias_map[alias] = original; - } - - std::string - VeloCMemoryBackend::get_canonical_label( const std::string &_label ) const noexcept - { - // Possible the view has an alias. If so, make sure that is registered instead - auto pos = m_alias_map.find( _label ); - if ( m_alias_map.end() != pos ) - { - return pos->second; - } else { - return _label; - } - } - - void - VeloCRegisterOnlyBackend::checkpoint( const std::string &label, int version, const std::vector &views ) - { - // No-op, don't do anything - } - - void - VeloCRegisterOnlyBackend::restart(const std::string &label, int version, const std::vector &views) - { - // No-op, don't do anything - } - - VeloCFileBackend::VeloCFileBackend(MPIContext &, - MPI_Comm mpi_comm, - const std::string &veloc_config) { - VELOC_SAFE_CALL( VELOC_Init( mpi_comm, veloc_config.c_str())); - } - - VeloCFileBackend::~VeloCFileBackend() - { - VELOC_Finalize( false ); - } - - void - VeloCFileBackend::checkpoint( const std::string &label, int version, - const std::vector< KokkosResilience::ViewHolder > &views ) - { - // Wait for previous checkpoint to finish - VELOC_SAFE_CALL( VELOC_Checkpoint_wait()); - - // Start new checkpoint - VELOC_SAFE_CALL( VELOC_Checkpoint_begin( label.c_str(), version )); - - char veloc_file_name[VELOC_MAX_NAME]; - - bool status = true; - try - { - VELOC_SAFE_CALL( VELOC_Route_file( veloc_file_name, veloc_file_name ) ); - - std::string fname( veloc_file_name ); - std::ofstream vfile( fname, std::ios::binary ); - -#ifdef KR_ENABLE_TRACING - auto write_trace = 
Util::begin_trace< Util::TimingTrace< std::string > >( *m_context, "write" ); -#endif - for ( auto &&v : views ) - { - char *bytes = static_cast< char * >( v->data()); - std::size_t len = v->span() * v->data_type_size(); - - vfile.write( bytes, len ); - } -#ifdef KR_ENABLE_TRACING - write_trace.end(); -#endif - } - catch ( ... ) - { - status = false; - } - - VELOC_SAFE_CALL( VELOC_Checkpoint_end( status )); - } - - bool - VeloCFileBackend::restart_available( const std::string &label, int version ) - { - int latest = VELOC_Restart_test( label.c_str(), 0 ); - - // res is < 0 if no versions available, else it is the latest version - return version <= latest; - } - - int VeloCFileBackend::latest_version( const std::string &label ) const noexcept - { - return VELOC_Restart_test( label.c_str(), 0 ); - } - - void VeloCFileBackend::restart( const std::string &label, int version, - const std::vector< KokkosResilience::ViewHolder > &views ) - { - VELOC_SAFE_CALL( VELOC_Restart_begin( label.c_str(), version )); - - char veloc_file_name[VELOC_MAX_NAME]; - - bool status = true; - try - { - VELOC_SAFE_CALL( VELOC_Route_file( veloc_file_name, veloc_file_name ) ); - printf( "restore file name: %s\n", veloc_file_name ); - - std::string fname( veloc_file_name ); - std::ifstream vfile( fname, std::ios::binary ); - -#ifdef KR_ENABLE_TRACING - auto read_trace = Util::begin_trace< Util::TimingTrace< std::string > >( *m_context, "read" ); -#endif - for ( auto &&v : views ) - { - char *bytes = static_cast< char * >( v->data()); - std::size_t len = v->span() * v->data_type_size(); - - vfile.read( bytes, len ); - } -#ifdef KR_ENABLE_TRACING - read_trace.end(); -#endif - } - catch ( ... 
) - { - status = false; - } - - VELOC_SAFE_CALL( VELOC_Restart_end( status )); - } -} diff --git a/src/resilience/view_hooks/ViewHolder.hpp b/src/resilience/view_hooks/ViewHolder.hpp index 76e6fda..d1c820e 100644 --- a/src/resilience/view_hooks/ViewHolder.hpp +++ b/src/resilience/view_hooks/ViewHolder.hpp @@ -47,6 +47,10 @@ #include #include +#ifdef KR_ENABLE_MAGISTRATE +#include "checkpoint/checkpoint.h" +#endif + namespace KokkosResilience { namespace Impl { @@ -105,6 +109,7 @@ class ConstViewHolderImplBase { virtual void deep_copy_to_buffer(unsigned char *buff) = 0; virtual ConstViewHolderImplBase *clone() const = 0; + virtual void serialize(std::ostream& stream, char *buf = nullptr) = 0; protected: ConstViewHolderImplBase(std::size_t span, bool span_is_contiguous, @@ -141,6 +146,8 @@ class ViewHolderImplBase { virtual void deep_copy_to_buffer(unsigned char *buff) = 0; virtual void deep_copy_from_buffer(const unsigned char *buff) = 0; virtual ViewHolderImplBase *clone() const = 0; + virtual void serialize(std::ostream& stream, char *buf = nullptr) = 0; + virtual void deserialize(std::istream& stream, char *buf = nullptr) = 0; protected: ViewHolderImplBase(std::size_t span, bool span_is_contiguous, void *data, @@ -153,10 +160,10 @@ class ViewHolderImplBase { m_data_type_size(data_type_size), m_is_host_space(is_host_space) {} - private: size_t m_span = 0; bool m_span_is_contiguous = false; void *m_data = nullptr; + private: std::string m_label; size_t m_data_type_size = 0; bool m_is_host_space = false; @@ -220,6 +227,36 @@ class ViewHolderImpl : public ViewHolderImplBase { buff); } +#ifdef KR_ENABLE_MAGISTRATE + template>* = nullptr> + void serialize(SerT& s){ + checkpoint::serializeContentsOnly(s, m_view); + } + void serialize(std::ostream& stream, char *buf = nullptr) override { + checkpoint::serializeToStream(*this, stream); + } + void deserialize(std::istream& stream, char *buf = nullptr) override { + checkpoint::deserializeInPlaceFromStream(stream, this); + } 
+#else + void serialize(std::ostream& stream, char *buf) override { + if(!span_is_contiguous() || !is_host_space()){ + deep_copy_to_buffer((unsigned char*)buf); + stream.write((const char*)buf, data_type_size() * span()); + } else { + stream.write((const char*)data(), data_type_size() * span()); + } + } + void deserialize(std::istream& stream, char *buf) override{ + if(!span_is_contiguous() || !is_host_space()){ + stream.read(buf, data_type_size() * span()); + deep_copy_from_buffer((const unsigned char*)buf); + } else { + stream.read(static_cast< char * >( data() ), data_type_size() * span()); + } + } +#endif + ViewHolderImpl *clone() const override { return new ViewHolderImpl(m_view); } private: @@ -249,6 +286,15 @@ class ViewHolderImpl::copy_to_unmanaged(m_view, buff); } + void serialize(std::ostream& stream, char *buf = nullptr) override { + if(!span_is_contiguous() || !is_host_space()){ + deep_copy_to_buffer((unsigned char *)buf); + stream.write((const char*)buf, data_type_size() * span()); + } else { + stream.write((const char*)data(), data_type_size() * span()); + } + } + ViewHolderImpl *clone() const override { return new ViewHolderImpl(m_view); } private: diff --git a/tests/TestDynamicViewHooks.cpp b/tests/TestDynamicViewHooks.cpp index 18c786b..edf393d 100644 --- a/tests/TestDynamicViewHooks.cpp +++ b/tests/TestDynamicViewHooks.cpp @@ -193,6 +193,9 @@ TYPED_TEST( TestDynamicViewHooks, TestDynamicViewHooksMoveAssign ) KokkosResilience::DynamicViewHooks::move_assignment_set.set_callback( [&holder](const KokkosResilience::ViewHolder &vh) mutable { + // In both cases here, holder is uninitialized + EXPECT_EQ(holder.data(), nullptr); + EXPECT_NE(vh.data(), nullptr); holder = vh; }); @@ -200,6 +203,9 @@ TYPED_TEST( TestDynamicViewHooks, TestDynamicViewHooksMoveAssign ) .set_const_callback( [&const_holder]( const KokkosResilience::ConstViewHolder &vh) mutable { + // In both cases here, const_holder is uninitialized + EXPECT_EQ(const_holder.data(), nullptr); + 
EXPECT_NE(vh.data(), nullptr); const_holder = vh; }); @@ -207,16 +213,20 @@ TYPED_TEST( TestDynamicViewHooks, TestDynamicViewHooksMoveAssign ) void *cmp = testa.data(); test_view_type testb; + const_test_view_type testa_const( + testa); // Won't trigger the callback since this is not a copy + // constructor call + // Trigger the non-const move assign callback testb = std::move(testa); EXPECT_EQ(cmp, holder.data()); EXPECT_EQ(const_holder.data(), nullptr); - const_test_view_type testa_const( - testa); // Won't trigger the callback since this is not a copy - // constructor call + const_test_view_type testb_const; // Trigger the const move assign callback testb_const = std::move(testa_const); EXPECT_EQ(cmp, const_holder.data()); + + KokkosResilience::DynamicViewHooks::reset(); } diff --git a/tests/TestLambdaCapture.cpp b/tests/TestLambdaCapture.cpp index a098b90..a1fdbd6 100644 --- a/tests/TestLambdaCapture.cpp +++ b/tests/TestLambdaCapture.cpp @@ -53,7 +53,6 @@ auto get_view_list( F &&_fun ) auto f = _fun; - KokkosResilience::Detail::Cref::check_ref_list = nullptr; KokkosResilience::DynamicViewHooks::copy_constructor_set.reset(); f(); diff --git a/tests/TestStdFileBackend.cpp b/tests/TestStdFileBackend.cpp index e292b29..f3dfbdb 100644 --- a/tests/TestStdFileBackend.cpp +++ b/tests/TestStdFileBackend.cpp @@ -40,10 +40,10 @@ */ #include "TestCommon.hpp" -#include +#include #include -#include -#include +#include +#include #include diff --git a/tests/TestVelocMemoryBackend.cpp b/tests/TestVelocMemoryBackend.cpp index 8f68755..f99503f 100644 --- a/tests/TestVelocMemoryBackend.cpp +++ b/tests/TestVelocMemoryBackend.cpp @@ -40,10 +40,10 @@ */ #include "TestCommon.hpp" -#include +#include #include -#include -#include +#include +#include #include @@ -59,7 +59,7 @@ class TestVelocMemoryBackend : public ::testing::Test template< typename Layout, typename Context > static void test_layout( Context &ctx, std::size_t dimx, std::size_t dimy ) { - ctx.backend().reset(); + 
ctx.reset(); using memory_space = typename exec_space::memory_space; using view_type = KokkosResilience::View< double **, Layout, memory_space >; @@ -139,7 +139,7 @@ TYPED_TEST( TestVelocMemoryBackend, veloc_mem ) KokkosResilience::Config cfg; cfg["backend"].set( "veloc"s ); cfg["backends"]["veloc"]["config"].set( KR_TEST_DATADIR "/veloc_test.cfg" ); - KokkosResilience::MPIContext< KokkosResilience::VeloCMemoryBackend > ctx( MPI_COMM_WORLD, cfg ); + KokkosResilience::MPIContext ctx( MPI_COMM_WORLD, cfg ); for ( std::size_t dimx = 1; dimx < 5; ++dimx ) {