Skip to content

Commit

Permalink
#48: fixup after rebase on latest main
Browse files Browse the repository at this point in the history
  • Loading branch information
nmm0 committed Jun 7, 2023
1 parent 4073526 commit eaabc86
Show file tree
Hide file tree
Showing 11 changed files with 88 additions and 55 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ option(KR_ENABLE_VELOC "use VeloC backend for automatic checkpointing" ON)
option(KR_ENABLE_STDFILE "use StdFile backend for automatic checkpointing" OFF)

option(KR_ENABLE_MAGISTRATE "use Magistrate for serializing and deserializing" OFF)
option(KR_ENABLE_RESILIENT_EXEC "enable resilient execution spaces" OFF)

include(CMakeDependentOption)

Expand Down
2 changes: 1 addition & 1 deletion examples/benchmark_multiviews.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) {
wtime = MPI_Wtime();
std::size_t i = 1 + KokkosResilience::latest_version(*ctx, "test_kokkos");

while(i < nsteps) {
while(i < nsteps ) {

KokkosResilience::checkpoint(*ctx, "test_kokkos", i, [=]() { // Nic, tell me what should I put for []/

Expand Down
3 changes: 2 additions & 1 deletion src/resilience/AutomaticCheckpoint.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
#include <Kokkos_Core.hpp>
#include "view_hooks/ViewHolder.hpp"
#include "view_hooks/DynamicViewHooks.hpp"
#include "registration/ViewHolder.hpp"

#include "context/ContextBase.hpp"

Expand Down Expand Up @@ -117,7 +118,7 @@ namespace KokkosResilience
//Figure out how we should be handling this
bool recover_region = false, checkpoint_region = false;

if(last_region != regions.end() && last_region.label() == label) {
if(last_region.iter() != regions.end() && last_region.label() == label) {
active_region = last_region;
} else {
active_region = regions.insert({label, {}}).first;
Expand Down
4 changes: 2 additions & 2 deletions src/resilience/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ endif()

add_subdirectory(view_hooks)

if (KR_CUDA_EXEC_SPACE)
if (KR_ENABLE_CUDA_EXEC_SPACE)
add_subdirectory(cuda)
endif()

if (KR_OPENMP_EXEC_SPACE)
if (KR_ENABLE_OPENMP_EXEC_SPACE)
add_subdirectory(openMP)
endif()

53 changes: 35 additions & 18 deletions src/resilience/context/ContextBase.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,20 +45,26 @@
#if defined(KOKKOS_ENABLE_HPX)
#include <hpx/config.hpp>
#endif

#include <string>
#include <utility>
#include <memory>
#include <functional>
#include <chrono>
#include <Kokkos_Core.hpp>
#include <unordered_map>
#include <set>

#include "resilience/Config.hpp"
#include "resilience/CheckpointFilter.hpp"
#include "resilience/registration/Registration.hpp"
#include "resilience/view_hooks/ViewHolder.hpp"
#include "resilience/util/Trace.hpp"

#ifdef KR_ENABLE_MAGISTRATE
#include "../registration/Magistrate.hpp"
#endif

namespace KokkosResilience
{
class ContextBase
Expand All @@ -69,11 +75,11 @@ namespace KokkosResilience
virtual ~ContextBase() {};

template<typename... Traits, typename RegionFunc, typename FilterFunc, typename... T>
void run(const std::string& label, int iteration, RegionFunc&& fun, FilterFunc&& filter,
void run(const std::string& label, int iteration, RegionFunc&& fun, FilterFunc&& filter,
Detail::RegInfo<T>&... explicit_members);

template<typename... Traits, typename RegionFunc, typename... T>
void run(const std::string& label, int iteration, RegionFunc&& fun,
void run(const std::string& label, int iteration, RegionFunc&& fun,
Detail::RegInfo<T>&... explicit_members) {
run<Traits...>(label, iteration, std::forward<RegionFunc>(fun), default_filter(), explicit_members...);
}
Expand All @@ -93,7 +99,7 @@ namespace KokkosResilience
// Registers a whole set of members by handing each entry, in set order,
// to register_member().
virtual void register_members(const std::set< KokkosResilience::Registration > &members) {
  for(auto it = members.begin(); it != members.end(); ++it) {
    register_member(*it);
  }
};

//Registers to the active region, requires an active region.
template<typename... Traits, typename T>
void register_to_active(T& member, const std::string& label = ""){
Expand All @@ -103,11 +109,11 @@ namespace KokkosResilience
//Registers only if in an active region.
template<typename... Traits, typename T>
bool register_if_active(T& member, const std::string& label){
if(active_region == regions.end()) return false;
if(active_region.iter() == regions.end()) return false;
register_to_active<Traits...>(member, label);
return true;
}

template<typename... Traits, typename T>
void register_globally(T& member, const std::string& label){
global_members.insert(impl_register<Traits...>(member, label));
Expand All @@ -127,30 +133,41 @@ namespace KokkosResilience

//Pointer not guaranteed to remain valid, use immediately & discard.
char* get_buffer(size_t minimum_size);

template<typename... Traits>
void register_to_active(const ViewHolder& view){
Registration registration = create_registration<ViewHolder, std::tuple<Traits...>>(*this, view);
register_member(registration); //Virtual function to whatever inheriting class
active_region.insert(registration);
active_region.insert(registration);
}

protected:
using RegionsMap = std::unordered_map<std::string, std::set<Registration>>;
struct Region : RegionsMap::iterator {
Region(RegionsMap::iterator iter) : RegionsMap::iterator(iter) {};
class Region
{
public:

using map_iterator = RegionsMap::iterator;

Region(map_iterator iter) : m_map_iterator(iter) {};
const std::string& label() const {
return (*this)->first;
return m_map_iterator->first;
}
const std::set<Registration> members() const {
return (*this)->second;
const std::set<Registration> &members() const {
return m_map_iterator->second;
}
std::set<Registration>& members(){
return (*this)->second;
return m_map_iterator->second;
}
void insert(Registration& member){
void insert(const Registration& member){
members().insert(member);
}

auto iter() { return m_map_iterator; }

private:

map_iterator m_map_iterator;
};

//Create Registration and register to implementation
Expand All @@ -161,12 +178,12 @@ namespace KokkosResilience
register_member(registration); //Virtual function to whatever inheriting class
return registration;
}

// Convenience overload for explicitly-listed data: unpacks the RegInfo and
// forwards its member and label to register_to_active<Traits...>(member, label).
template<typename... Traits, typename T>
void register_to_active(Detail::RegInfo<T>& info){
  auto& member = info.member;
  const auto& label = info.label;
  register_to_active<Traits...>(member, label);
}

private:
//Detect views being copied in, register them and any explicitly-listed members.
template<typename... Traits, typename RegionFunc, typename... T>
Expand All @@ -183,10 +200,10 @@ namespace KokkosResilience

RegionsMap regions;
Region active_region = regions.end();

//Performance helper
Region last_region = regions.end();

std::set<Registration> global_members;

public:
Expand Down
24 changes: 17 additions & 7 deletions src/resilience/registration/Magistrate.hpp
Original file line number Diff line number Diff line change
@@ -1,22 +1,30 @@
#ifndef INC_RESILIENCE_MAGISTRATE_HPP
#define INC_RESILIENCE_MAGISTRATE_HPP

#ifdef KR_ENABLE_MAGISTRATE

#include "resilience/registration/Registration.hpp"
#include "resilience/view_hooks/ViewHolder.hpp"
#include <checkpoint/checkpoint.h>
#include <checkpoint/serializers/stream_serializer.h>

namespace KokkosResilience {
class ContextBase;
}

namespace KokkosResilience::Detail {
struct Checkpoint_Trait {};

//Registration for some type which Magistrate knows how to checkpoint.
template
template
<
typename MemberType,
typename... Traits
>
struct MagistrateRegistration : public RegistrationBase {
MagistrateRegistration() = delete;
MagistrateRegistration(MemberType& member, std::string name)

MagistrateRegistration(MemberType& member, std::string name)
: RegistrationBase(name), m_member(member) {}

const serializer_t serializer() const override{
Expand All @@ -39,13 +47,13 @@ namespace KokkosResilience::Detail {

const bool is_same_reference(const Registration& other_reg) const override{
auto other = dynamic_cast<MagistrateRegistration*>(other_reg.get());

if(!other){
//We wouldn't expect this to happen, and it may indicate a hash collision
fprintf(stderr, "KokkosResilience: Warning, member name %s is shared by more than 1 registration type\n", name.c_str());
return false;
}

return &m_member == &other->m_member;
}

Expand All @@ -63,7 +71,7 @@ namespace KokkosResilience {
T,
std::tuple<Traits...>,
std::enable_if_t<
checkpoint::SerializableTraits<T>::is_traversable
checkpoint::SerializableTraits<T, checkpoint::StreamPacker<>>::is_traversable
>*
> {
using BaseT = Detail::MagistrateRegistration<T, Traits...>;
Expand All @@ -72,11 +80,13 @@ namespace KokkosResilience {
create_registration(ContextBase& ctx, T& member, std::string label)
: reg(std::make_shared<BaseT>(member, label)) {};

auto get() {
auto get() && {
return std::move(reg);
}
};
}


#endif

#endif // INC_RESILIENCE_MAGISTRATE_HPP
28 changes: 14 additions & 14 deletions src/resilience/registration/Registration.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
#include <memory>

namespace KokkosResilience
{
{
struct Registration;

namespace Detail {
struct RegistrationBase {
typedef std::function<bool (std::ostream &)> serializer_t;
Expand All @@ -24,7 +24,7 @@ namespace KokkosResilience
virtual const serializer_t serializer() const = 0;
virtual const deserializer_t deserializer() const = 0;
virtual const bool is_same_reference(const Registration&) const = 0;

// Two registrations compare equal when they carry the same name.
bool operator==(const RegistrationBase& other) const {
  const bool same_name = (name == other.name);
  return same_name;
}
Expand All @@ -39,14 +39,14 @@ namespace KokkosResilience
}
return static_cast<size_t>(hash%INT_MAX);
}

protected:
RegistrationBase(const std::string member_name) :
RegistrationBase(const std::string member_name) :
name(member_name) { }
};
//Helper for explicitly-listing data that a


//Helper for explicitly-listing data that a
//checkpoint region should also use.
template<typename T>
struct RegInfo {
Expand All @@ -56,28 +56,28 @@ namespace KokkosResilience
};
}


template<typename T, typename Traits = std::tuple<>, typename enable = void*>
struct create_registration;

struct Registration : public std::shared_ptr<Detail::RegistrationBase> {
using serializer_t = typename Detail::RegistrationBase::serializer_t;
using deserializer_t = typename Detail::RegistrationBase::deserializer_t;

template<typename RegType>
Registration(std::shared_ptr<RegType> base)
Registration(std::shared_ptr<RegType> base)
: std::shared_ptr<Detail::RegistrationBase>(std::move(base)) {}

template<typename... T>
Registration(create_registration<T...> reg)
: Registration(reg.get()) {};
Registration(create_registration<T...> reg)
: Registration(std::move(reg).get()) {};

const size_t hash() const {
return (*this)->hash();
}

bool operator==(const Registration& other){
return this->get() == other.get();
return this->get() == other.get();
}
};
} //namespace KokkosResilience
Expand Down
Loading

0 comments on commit eaabc86

Please sign in to comment.