GridTools · havogt · Sep 25, 2020 · Sep 24, 2020 · Sep 24, 2020 · Sep 24, 2020
diff --git a/include/gridtools/common/hugepage_alloc.hpp b/include/gridtools/common/hugepage_alloc.hpp
@@ -21,6 +21,10 @@ namespace gridtools {
      * reduce cache set conflicts.
      */
     inline void *hugepage_alloc(std::size_t size) {
+#ifdef __cray__
+// Emit a compile-time warning for Cray builds; see https://github.com/GridTools/gridtools/issues/1557
+#warning "hugepage_alloc on Cray might be suboptimal"
+#endif
         static std::atomic<std::size_t> s_offset(64);
         auto offset = s_offset.load(std::memory_order_relaxed);
         auto next_offset = offset;

diff --git a/include/gridtools/stencil/cpu_ifirst/loops.hpp b/include/gridtools/stencil/cpu_ifirst/loops.hpp
@@ -28,13 +28,7 @@ namespace gridtools {
             namespace loops_impl_ {
                 template <class Stage, class Ptr, class Strides>
                 GT_FORCE_INLINE void i_loop(int_t size, Stage stage, Ptr &ptr, Strides const &strides) {
-#ifdef NDEBUG
-// TODO(anstaf & fthaler):
-//   Maybe we have to re-run tests with different combinations of pragmas on different compilers,
-//   the current set of pragmas is at the border of legality for the present code, so maybe we can find a better option.
-#pragma ivdep
 #pragma omp simd
-#endif
                     for (int_t i = 0; i < size; ++i) {
                         using namespace literals;
                         stage(ptr, strides);
@@ -109,7 +103,8 @@ namespace gridtools {
                     int_t i_blocks = info.i_blocks();
                     int_t j_blocks = info.j_blocks();
                     int_t k_size = grid.k_size();
-                    thread_pool::parallel_for_loop(ThreadPool(),
+                    thread_pool::parallel_for_loop(
+                        ThreadPool(),
                         [&](auto i, auto k, auto j) {
                             tuple_util::for_each([block = info.block(i, j, k)](auto &&loop) { loop(block); }, loops);
                         },
@@ -157,7 +152,8 @@ namespace gridtools {
                 template <class ThreadPool, class Grid, class Loops>
                 void run_loops(std::false_type, Grid const &grid, Loops loops) {
                     execinfo info(ThreadPool(), grid);
-                    thread_pool::parallel_for_loop(ThreadPool(),
+                    thread_pool::parallel_for_loop(
+                        ThreadPool(),
                         [&](auto i, auto j) {
                             tuple_util::for_each([block = info.block(i, j)](auto &&loop) { loop(block); }, loops);
                         },

diff --git a/jenkins/envs/daint.sh b/jenkins/envs/daint.sh
@@ -16,7 +16,7 @@ function module() {
 }
 
 module load daint-gpu
-module load cudatoolkit/10.1.105_3.27-7.0.1.1_4.1__ga311ce7
+module load cudatoolkit
 module rm CMake
 module load /users/jenkins/easybuild/daint/haswell/modules/all/CMake/3.14.5
 

diff --git a/jenkins/envs/daint_cray.sh b/jenkins/envs/daint_cray.sh
@@ -11,5 +11,3 @@ export GTCMAKE_CMAKE_CUDA_HOST_COMPILER="$CXX"
 
 export CUDAHOSTCXX="$CXX"
 export CTEST_PARALLEL_LEVEL=1
-export CXXFLAGS='-fno-cray-gpu -fno-cray'
-export CFLAGS='-fno-cray-gpu -fno-cray-mallopt -fno-cray'