Skip to content

Commit

Permalink
Update stdfile backend, finished V1 VTContext
Browse files Browse the repository at this point in the history
  • Loading branch information
Matthew-Whitlock authored and nmm0 committed Jun 8, 2023
1 parent fc4e065 commit 7731779
Show file tree
Hide file tree
Showing 20 changed files with 1,274 additions and 672 deletions.
27 changes: 21 additions & 6 deletions examples/SimpleFileCheckpoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,27 +46,42 @@
#endif

#include <Kokkos_Core.hpp>
#include <resilience/Context.hpp>
#include <resilience/stdfile/StdFileBackend.hpp>
#include <resilience/AutomaticCheckpoint.hpp>
#include <resilience/Resilience.hpp>
#include <mpi.h>

using chkpt_view = Kokkos::Experimental::SubscribableViewHooks<KokkosResilience::DynamicViewHooksSubscriber>;

// Example program: initializes MPI and Kokkos, builds a KokkosResilience context,
// runs one kernel inside KokkosResilience::checkpoint that sets every element of a
// dim0 x dim1 view to 3.0, then checks each element and prints "Success!".
// NOTE(review): this listing looks like a rendered diff with the +/- markers stripped,
// so several statements appear twice (before/after) — confirm against the real file.
int
main( int argc, char **argv )
{
// MPI is initialized before Kokkos and finalized after Kokkos::finalize().
MPI_Init( &argc, &argv );

Kokkos::initialize( argc, argv );
{
// Inner scope: ctx and view go out of scope before Kokkos::finalize() is called.
// NOTE(review): both lines below declare `ctx`; presumably the first (file name +
// config) is the removed form and the second (MPI communicator + config) replaces it.
auto ctx = KokkosResilience::make_context( "checkpoint.data", "config_file.json" );
auto ctx = KokkosResilience::make_context( MPI_COMM_WORLD, "config_file.json" );

int dim0 = 5, dim1 = 5;
// NOTE(review): both lines below declare `view`; the second adds the chkpt_view
// hooks type as a View template argument — presumably the replacement line.
auto view = Kokkos::View< double ** >( "test_view", dim0, dim1 );
auto view = Kokkos::View< double **, chkpt_view>( "test_view", dim0, dim1 );

// Checkpointed region labelled "test_checkpoint". The third argument (0) is
// presumably the iteration/version number — TODO confirm against checkpoint()'s signature.
KokkosResilience::checkpoint( *ctx, "test_checkpoint", 0, [view, dim0, dim1]() {
// One parallel work item per row i; each fills columns 0..dim1-1 with 3.0.
Kokkos::parallel_for( dim0, KOKKOS_LAMBDA( int i ) {
for ( int j = 0; j < dim1; ++j )
view( i, j ) = 3.0;
} );
// NOTE(review): two alternative closers for the checkpoint() call follow. The second
// passes an extra callable that returns true for any int — presumably a filter that
// always allows the checkpoint; confirm.
} );
}, [](int){return true;} );

// Verify the kernel result element by element; exit with status 1 on the first mismatch.
for(int i = 0; i < dim0; i++){
for(int j = 0; j < dim1; j++){
if(view(i,j) != 3.0) {
fprintf(stderr, "Error: view(%d,%d) = %f, not %f\n", i, j, view(i,j), 3.0);
exit(1);
}
}
}
printf("Success!\n");

}
Kokkos::finalize();

MPI_Finalize();
}
5 changes: 3 additions & 2 deletions examples/config_file.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
"backend": "stdfile",
"backends": {
"stdfile": {
"config": "file_test.cfg"
"directory": "./stdfile_chkpts/",
"filename_prefix": "simple_"
}
},
"filter": {
"type": "time",
"interval": 10
}
}
}
4 changes: 2 additions & 2 deletions src/resilience/Config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ namespace KokkosResilience

// Exception type derived from std::runtime_error for configuration value errors.
// The message passed to std::runtime_error is "value error" unless a description is given.
// NOTE(review): two constructor headers appear back to back below; this looks like the
// before (no-argument) and after (optional description) lines of a diff — confirm which
// one the real header contains.
struct ConfigValueError : std::runtime_error
{
ConfigValueError()
: std::runtime_error( "value error" )
// Takes an optional description that is forwarded as the exception message.
ConfigValueError(const std::string desc = "value error")
: std::runtime_error(desc)
{}
};

Expand Down
17 changes: 15 additions & 2 deletions src/resilience/backend/Automatic.cpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,28 @@
#include "Automatic.hpp"
#include <stdexcept>

#ifdef KR_ENABLE_VELOC
#include "veloc/VelocBackend.hpp"
#endif

#ifdef KR_ENABLE_STDFILE
#include "stdfile/StdFileBackend.hpp"
#endif

namespace KokkosResilience::Detail {
// Creates the backend named by the string stored under "backend" in the context's config.
//   "veloc"   -> std::make_shared<VeloCMemoryBackend>(ctx), only if KR_ENABLE_VELOC is defined
//   "stdfile" -> std::make_shared<StdFileBackend>(ctx),     only if KR_ENABLE_STDFILE is defined
// Throws std::runtime_error("<name> backend is not available") when the name matches no
// branch that was compiled in.
// NOTE(review): the signature and first statement appear twice (pointer form, then
// reference form); presumably the before/after lines of a diff — confirm.
AutomaticBackend make_backend(ContextBase* ctx){
auto backend = ctx->config()["backend"].as<std::string>();
AutomaticBackend make_backend(ContextBase& ctx){
auto backend = ctx.config()["backend"].as<std::string>();

#ifdef KR_ENABLE_VELOC
if(backend == "veloc"){
return std::make_shared<VeloCMemoryBackend>(ctx);
}
#endif
#ifdef KR_ENABLE_STDFILE
if(backend == "stdfile"){
return std::make_shared<StdFileBackend>(ctx);
}
#endif

// Reached when no enabled backend matched the configured name.
throw std::runtime_error(backend + " backend is not available");
}
Expand Down
11 changes: 1 addition & 10 deletions src/resilience/backend/Automatic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,19 +43,10 @@


#include "AutomaticBase.hpp"

#ifdef KR_ENABLE_VELOC
#include "veloc/VelocBackend.hpp"
#endif

#ifdef KR_ENABLE_STDFILE
#include "stdfile/StdFileBackend.hpp"
#endif

#include "resilience/Config.hpp"

// Declaration of the backend factory that returns an AutomaticBackend for a context.
// NOTE(review): both a pointer-taking and a reference-taking declaration are listed;
// presumably the before/after lines of a diff rather than two intended overloads — confirm.
namespace KokkosResilience::Detail {
AutomaticBackend make_backend(ContextBase* ctx);
AutomaticBackend make_backend(ContextBase& ctx);
}

#endif
47 changes: 27 additions & 20 deletions src/resilience/backend/AutomaticBase.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,42 +45,49 @@
#include <unordered_set>
#include <memory>

namespace KokkosResilience{
namespace KokkosResilience {

//Avoiding cyclic dependency.
class ContextBase;

class AutomaticBackendBase {
public:
explicit AutomaticBackendBase(ContextBase* ctx) : m_context(ctx) {};
explicit AutomaticBackendBase(ContextBase& ctx) : m_context(ctx) {};

virtual ~AutomaticBackendBase() = default;


//All members should be registered before being checkpointed or restarted
virtual void register_member(Registration& member) = 0;

virtual void register_members(std::unordered_set<Registration>& members){
for(auto member : members) register_member(member);
}

virtual bool checkpoint(const std::string& label, int version,
const std::unordered_set<Registration> &members) = 0;
//as_global to checkpoint independently of PID
virtual bool checkpoint(const std::string& label, int version,
const std::unordered_set<Registration> &members,
bool as_global = false) = 0;


//Get the highest version available which is still less than max
// (or just the highest, if max=0)
virtual int latest_version(const std::string& label, int max = 0, bool as_global = false) const noexcept = 0;

virtual int latest_version(const std::string& label) const noexcept = 0;

virtual bool restart_available(const std::string& label, int version){
return latest_version(label) == version;
};
//Returns failure flag for recovering the specified members.
//as_global to restart independently of PID
virtual bool restart(const std::string& label, int version,
const std::unordered_set<Registration> &members,
bool as_global = false) = 0;

virtual bool restart(const std::string& label, int verison,
const std::unordered_set<Registration> &members) = 0;


//Reset any state, useful for online-recovery.
virtual void reset() = 0;


ContextBase* const m_context;

virtual void register_members(std::unordered_set<Registration>& members){
for(auto member : members) register_member(member);
}

virtual bool restart_available(const std::string& label, int version, bool as_global = false){
return latest_version(label, version+1, as_global) == version;
};

ContextBase& m_context;


//Delete potentially problematic functions for maintaining consistent state
Expand Down
Loading

0 comments on commit 7731779

Please sign in to comment.