Skip to content

Commit

Permalink
Merge pull request #64 from kokkos/60-test-resilient-execution-spaces…
Browse files Browse the repository at this point in the history
…-in-CI

test resilient execution spaces in CI
  • Loading branch information
nmm0 authored Jun 20, 2024
2 parents 6581bce + df74739 commit ef48ee5
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 12 deletions.
4 changes: 3 additions & 1 deletion CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,9 @@
"KR_ENABLE_TESTS": "ON",
"KR_ENABLE_EXAMPLES": "ON",
"KR_ALL_WARNINGS": "ON",
"KR_WARNINGS_AS_ERRORS": "ON"
"KR_WARNINGS_AS_ERRORS": "ON",
"KR_ENABLE_EXEC_SPACES": "ON",
"KR_ENABLE_OPENMP_EXEC_SPACE": "ON"
}
}
],
Expand Down
4 changes: 2 additions & 2 deletions src/resilience/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ endif()

add_subdirectory(view_hooks)

if (KR_CUDA_EXEC_SPACE)
if (KR_ENABLE_CUDA_EXEC_SPACE)
add_subdirectory(cuda)
endif()

if (KR_OPENMP_EXEC_SPACE)
if (KR_ENABLE_OPENMP_EXEC_SPACE)
add_subdirectory(openMP)
endif()

26 changes: 26 additions & 0 deletions src/resilience/Resilience.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,29 @@
*
* Questions? Contact Christian R. Trott ([email protected])
*/

#include "resilience/Resilience.hpp"

#include <utility>

namespace KokkosResilience {

/**
 * Default reaction to an unrecoverable data corruption: terminates the
 * program through Kokkos::abort with a fixed diagnostic message.
 *
 * The retry count passed by the caller is ignored by this handler.
 */
void default_unrecoverable_data_corruption_handler(std::size_t) {
  Kokkos::abort(
      "Resilience majority voting failed because each execution obtained a "
      "differing value.");
}

namespace {
// The currently installed handler. It lives in an anonymous namespace so it
// is only reachable through the set/get functions below, and it starts out
// as the aborting default.
unrecoverable_data_corruption_handler g_unrecoverable_data_corruption_handler =
    default_unrecoverable_data_corruption_handler;
}  // namespace

/**
 * Replaces the currently installed unrecoverable-data-corruption handler.
 *
 * @param handler Callable that receives the number of retries; it replaces
 *                whatever handler was installed before. No synchronization
 *                is performed here.
 */
void set_unrecoverable_data_corruption_handler(
    unrecoverable_data_corruption_handler handler) {
  // `handler` is a by-value sink parameter: move it into place instead of
  // copying the std::function (and whatever state it captured) a second time.
  g_unrecoverable_data_corruption_handler = std::move(handler);
}

/**
 * Gives access to the currently installed handler.
 *
 * @return A mutable reference to the stored handler object, so callers can
 *         invoke it (or assign through it).
 */
unrecoverable_data_corruption_handler&
get_unrecoverable_data_corruption_handler() {
  return g_unrecoverable_data_corruption_handler;
}
}  // namespace KokkosResilience
12 changes: 12 additions & 0 deletions src/resilience/Resilience.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
#ifndef INC_RESILIENCE_RESILIENCE_HPP
#define INC_RESILIENCE_RESILIENCE_HPP

#include <functional>

#include <resilience/config/Config.hpp>

#include "Context.hpp"
Expand All @@ -62,5 +64,15 @@
#include "cuda/CudaResParallel.hpp"
#endif

namespace KokkosResilience {
/**
 * A function that will be invoked with the total number of retries if the
 * runtime encounters an unrecoverable data corruption.
 */
using unrecoverable_data_corruption_handler = std::function<void(std::size_t)>;
/**
 * The handler installed initially. It calls Kokkos::abort with a message
 * stating that majority voting failed; the retries argument is not used.
 */
void default_unrecoverable_data_corruption_handler(std::size_t retries);
/**
 * Replaces the currently installed handler with the given one.
 */
void set_unrecoverable_data_corruption_handler(unrecoverable_data_corruption_handler handler);
/**
 * Returns a mutable reference to the currently installed handler. The OpenMP
 * resilient ParallelFor fetches the handler through this function and calls
 * it with its maximum repeat count when no consensus could be reached.
 */
unrecoverable_data_corruption_handler &get_unrecoverable_data_corruption_handler();
} // namespace KokkosResilience

#endif // INC_RESILIENCE_RESILIENCE_HPP
7 changes: 5 additions & 2 deletions src/resilience/openMP/OpenMPResParallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#define INC_RESILIENCE_OPENMP_OPENMPRESPARALLEL_HPP

#include <Kokkos_Macros.hpp>
#include "resilience/Resilience.hpp"
#if defined(KOKKOS_ENABLE_OPENMP)

#include <omp.h>
Expand Down Expand Up @@ -108,7 +109,8 @@ class ParallelFor< FunctorType
//! There are some subtleties regarding which views are copied per kernel in the default subscriber
//! See KokkosResilience::ResilienctDuplicatesSubscriber::duplicates_cache for details

int repeats = 5; //! This integer represents the maximum number of attempts to reach consensus allowed.
const int max_repeats = 5;
int repeats = max_repeats; //! This integer represents the maximum number of attempts to reach consensus allowed.
bool success = 0; //! This bool indicates that all views successfully reached a consensus.

while(success==0 && repeats > 0){
Expand Down Expand Up @@ -158,7 +160,8 @@ class ParallelFor< FunctorType

if(success==0 && repeats == 0){
// Abort if 5 repeated tries at executing failed to find an acceptable match
Kokkos::abort("Aborted in parallel_for, resilience majority voting failed because each execution obtained a differing value.");
auto &handler = KokkosResilience::get_unrecoverable_data_corruption_handler();
handler(max_repeats);
}

} // execute
Expand Down
17 changes: 10 additions & 7 deletions tests/TestOpenMPResilientExecution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,15 +168,18 @@ TEST(TestResOpenMP, TestResilientForInsertError)

counter(0) = 0;

::testing::FLAGS_gtest_death_test_style = "threadsafe";
bool failed_recovery = false;
KokkosResilience::set_unrecoverable_data_corruption_handler(
[&failed_recovery](std::size_t) { failed_recovery = true; });

// Assigning each y(i) threadId, should cause a failure in the resilient execution except in single-thread case.
EXPECT_DEATH(
Kokkos::parallel_for( range_policy (0, N), KOKKOS_LAMBDA ( const int i) {
y(i) = omp_get_thread_num();
Kokkos::atomic_increment(&counter(0));
});
,"Aborted in parallel_for, resilience majority voting failed because each execution obtained a differing value.");
Kokkos::parallel_for( range_policy (0, N), KOKKOS_LAMBDA ( int i) {
y(i) = counter(0);
Kokkos::atomic_increment(&counter(0));
});
KokkosResilience::clear_duplicates_cache();
KokkosResilience::set_unrecoverable_data_corruption_handler(&KokkosResilience::default_unrecoverable_data_corruption_handler);
ASSERT_TRUE(failed_recovery);
}

// gTest runs parallel_for with resilient Kokkos doubles assignment and atomic counter.
Expand Down

0 comments on commit ef48ee5

Please sign in to comment.