Avoid CUDA code in C2H (#2734)
bernhardmgruber authored Nov 11, 2024
1 parent f305eea commit 79f0ccc
Showing 115 changed files with 714 additions and 659 deletions.
2 changes: 1 addition & 1 deletion c2h/catch2_runner.cpp
@@ -31,4 +31,4 @@

#define CUB_CONFIG_MAIN
#define CUB_EXCLUDE_CATCH2_HELPER_IMPL
#include <c2h/catch2_main.cuh>
#include <c2h/catch2_main.h>
2 changes: 1 addition & 1 deletion c2h/catch2_runner_helper.inl
@@ -28,7 +28,7 @@
#pragma once

//! @file
//! This file includes implementation of CUDA-specific utilities for custom Catch2 main When CMake is configured to
//! This file includes implementation of CUDA-specific utilities for custom Catch2 main. When CMake is configured to
//! include all the tests into a single executable, this file is only included into catch2_runner_helper.cu. When CMake
//! is configured to compile each test as a separate binary, this file is included into each test.

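For context, the two inclusion modes described in the comment above look roughly as follows. This is a sketch only: the test file name and the exact CMake wiring are assumptions; only catch2_runner_helper.inl, catch2_runner_helper.h, catch2_main.h and the two CUB_* macros appear in this commit.

// Single test executable: the CUDA helpers are compiled once, in a dedicated TU.
// catch2_runner_helper.cu:
#include <c2h/catch2_runner_helper.h>
#include "catch2_runner_helper.inl"

// One binary per test: each test TU builds its own main and pulls the helper
// implementation in through catch2_main.h, because it does not define
// CUB_EXCLUDE_CATCH2_HELPER_IMPL (catch2_runner.cpp above does).
// hypothetical_test.cu:
#define CUB_CONFIG_MAIN
#include <c2h/catch2_main.h>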
115 changes: 80 additions & 35 deletions c2h/generators.cu
@@ -44,12 +44,12 @@

#include <cstdint>

#include <c2h/custom_type.cuh>
#include <c2h/device_policy.cuh>
#include <c2h/extended_types.cuh>
#include <c2h/fill_striped.cuh>
#include <c2h/generators.cuh>
#include <c2h/vector.cuh>
#include <c2h/custom_type.h>
#include <c2h/device_policy.h>
#include <c2h/extended_types.h>
#include <c2h/fill_striped.h>
#include <c2h/generators.h>
#include <c2h/vector.h>

#if C2H_HAS_CURAND
# include <curand.h>
@@ -118,7 +118,30 @@ private:
c2h::device_vector<float> m_distribution;
};

template <typename T, cub::Category = cub::Traits<T>::CATEGORY>
// TODO(bgruber): modelled after cub::Traits. We should generalize this somewhere into libcu++.
template <typename T>
struct is_floating_point : ::cuda::std::is_floating_point<T>
{};
#ifdef _CCCL_HAS_NVFP16
template <>
struct is_floating_point<__half> : ::cuda::std::true_type
{};
#endif // _CCCL_HAS_NVFP16
#ifdef _CCCL_HAS_NVBF16
template <>
struct is_floating_point<__nv_bfloat16> : ::cuda::std::true_type
{};
#endif // _CCCL_HAS_NVBF16
#ifdef __CUDA_FP8_TYPES_EXIST__
template <>
struct is_floating_point<__nv_fp8_e4m3> : ::cuda::std::true_type
{};
template <>
struct is_floating_point<__nv_fp8_e5m2> : ::cuda::std::true_type
{};
#endif // __CUDA_FP8_TYPES_EXIST__

template <typename T, bool = is_floating_point<T>::value>
struct random_to_item_t
{
float m_min;
@@ -136,7 +159,7 @@ struct random_to_item_t
};

template <typename T>
struct random_to_item_t<T, cub::FLOATING_POINT>
struct random_to_item_t<T, true>
{
using storage_t = ::cuda::std::_If<(sizeof(T) > 4), double, float>;
storage_t m_min;
@@ -182,6 +205,7 @@ RANDOM_TO_VEC_ITEM_SPEC(0, x);
RANDOM_TO_VEC_ITEM_SPEC(1, y);
RANDOM_TO_VEC_ITEM_SPEC(2, z);
RANDOM_TO_VEC_ITEM_SPEC(3, w);
#undef RANDOM_TO_VEC_ITEM_SPEC

generator_t::generator_t()
{
@@ -413,17 +437,37 @@ void init_key_segments(const c2h::device_vector<OffsetT>& segment_offsets, KeyT*
auto d_range_dsts = thrust::make_transform_iterator(d_offsets, dst_transform_op);
auto d_range_sizes = thrust::make_transform_iterator(iota, offset_to_size_t<OffsetT>{d_offsets});

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes = 0;
// TODO(bgruber): replace by a non-CUB implementation
cub::DeviceCopy::Batched(
d_temp_storage, temp_storage_bytes, d_range_srcs, d_range_dsts, d_range_sizes, total_segments);

c2h::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// TODO(bgruber): replace by a non-CUB implementation
cub::DeviceCopy::Batched(
d_temp_storage, temp_storage_bytes, d_range_srcs, d_range_dsts, d_range_sizes, total_segments);
cudaDeviceSynchronize();
#else // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
static_assert(sizeof(OffsetT) == 0, "Need to implement a non-CUB version of cub::DeviceCopy::Batched");
// TODO(bgruber): implement and *test* a non-CUB version, here is a sketch:
// thrust::for_each(
// thrust::device,
// thrust::counting_iterator<OffsetT>{0},
// thrust::counting_iterator<OffsetT>{total_segments},
// [&](OffsetT i) {
// const auto value = d_range_srcs[i];
// const auto start = d_range_sizes[i];
// const auto end = d_range_sizes[i + 1];
// for (auto j = start; j < end; ++j)
// {
// d_range_dsts[j] = value;
// }
// });
#endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
}

template void init_key_segments(
@@ -434,15 +478,15 @@ template void
init_key_segments(const c2h::device_vector<std::uint32_t>& segment_offsets, float* out, std::size_t element_size);
template void init_key_segments(
const c2h::device_vector<std::uint32_t>& segment_offsets, custom_type_state_t* out, std::size_t element_size);
#ifdef TEST_HALF_T
#ifdef _CCCL_HAS_NVFP16
template void
init_key_segments(const c2h::device_vector<std::uint32_t>& segment_offsets, half_t* out, std::size_t element_size);
#endif
#endif // _CCCL_HAS_NVFP16

#ifdef TEST_BF_T
#ifdef _CCCL_HAS_NVBF16
template void
init_key_segments(const c2h::device_vector<std::uint32_t>& segment_offsets, bfloat16_t* out, std::size_t element_size);
#endif
#endif // _CCCL_HAS_NVBF16
} // namespace detail

template <typename T>
@@ -482,7 +526,6 @@ void gen(modulo_t mod, c2h::device_vector<T>& data)
}

#define INSTANTIATE_RND(TYPE) template void gen<TYPE>(seed_t, c2h::device_vector<TYPE> & data, TYPE min, TYPE max)

#define INSTANTIATE_MOD(TYPE) template void gen<TYPE>(modulo_t, c2h::device_vector<TYPE> & data)

#define INSTANTIATE(TYPE) \
@@ -509,14 +552,19 @@ INSTANTIATE(double);
INSTANTIATE(bool);
INSTANTIATE(char);

#ifdef TEST_HALF_T
#ifdef _CCCL_HAS_NVFP16
INSTANTIATE(half_t);
#endif
#endif // _CCCL_HAS_NVFP16

#ifdef TEST_BF_T
#ifdef _CCCL_HAS_NVBF16
INSTANTIATE(bfloat16_t);
#endif
#endif // _CCCL_HAS_NVBF16

#undef INSTANTIATE_RND
#undef INSTANTIATE_MOD
#undef INSTANTIATE

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
template <typename T, int VecItem>
struct vec_gen_helper_t;

@@ -546,14 +594,14 @@ struct vec_gen_helper_t
}
};

#define VEC_SPECIALIZATION(TYPE, SIZE) \
template <> \
void gen<TYPE##SIZE>(seed_t seed, c2h::device_vector<TYPE##SIZE> & data, TYPE##SIZE min, TYPE##SIZE max) \
{ \
generator_t& generator = generator_t::instance(); \
generator.prepare_random_generator(seed, data.size()); \
vec_gen_helper_t<TYPE##SIZE, SIZE - 1>::gen(data, min, max); \
}
# define VEC_SPECIALIZATION(TYPE, SIZE) \
template <> \
void gen<TYPE##SIZE>(seed_t seed, c2h::device_vector<TYPE##SIZE> & data, TYPE##SIZE min, TYPE##SIZE max) \
{ \
generator_t& generator = generator_t::instance(); \
generator.prepare_random_generator(seed, data.size()); \
vec_gen_helper_t<TYPE##SIZE, SIZE - 1>::gen(data, min, max); \
}

VEC_SPECIALIZATION(char, 2);
VEC_SPECIALIZATION(char, 3);
@@ -620,19 +668,16 @@ struct vec_gen_t
}
};

#define VEC_GEN_MOD_SPECIALIZATION(VEC_TYPE, SCALAR_TYPE) \
template <> \
void gen<VEC_TYPE>(modulo_t mod, c2h::device_vector<VEC_TYPE> & data) \
{ \
thrust::tabulate(c2h::device_policy, data.begin(), data.end(), vec_gen_t<VEC_TYPE, SCALAR_TYPE>{mod.get()}); \
}
# define VEC_GEN_MOD_SPECIALIZATION(VEC_TYPE, SCALAR_TYPE) \
template <> \
void gen<VEC_TYPE>(modulo_t mod, c2h::device_vector<VEC_TYPE> & data) \
{ \
thrust::tabulate(c2h::device_policy, data.begin(), data.end(), vec_gen_t<VEC_TYPE, SCALAR_TYPE>{mod.get()}); \
}

VEC_GEN_MOD_SPECIALIZATION(short2, short);

VEC_GEN_MOD_SPECIALIZATION(uchar3, unsigned char);

VEC_GEN_MOD_SPECIALIZATION(ulonglong4, unsigned long long);

VEC_GEN_MOD_SPECIALIZATION(ushort4, unsigned short);

#endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
} // namespace c2h
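As an aside on the hunk above: the pair of cub::DeviceCopy::Batched calls follows CUB's usual two-phase temporary-storage idiom, where the first call with a null d_temp_storage only reports the required byte count and the second call performs the work. A minimal self-contained sketch of that idiom follows; the buffers, sizes and function name are made up for illustration and are not part of this commit.

#include <cub/device/device_copy.cuh>

#include <thrust/device_vector.h>

#include <cstdint>
#include <vector>

void batched_copy_sketch()
{
  thrust::device_vector<int> src(6, 42);
  thrust::device_vector<int> dst(6);
  int* s = thrust::raw_pointer_cast(src.data());
  int* d = thrust::raw_pointer_cast(dst.data());

  // Two ranges of three elements each, described by begin pointers and sizes.
  thrust::device_vector<int*> srcs(std::vector<int*>{s, s + 3});
  thrust::device_vector<int*> dsts(std::vector<int*>{d, d + 3});
  thrust::device_vector<int> sizes(std::vector<int>{3, 3});

  std::uint8_t* d_temp_storage = nullptr;
  std::size_t temp_storage_bytes = 0;
  // First call: query the amount of temporary storage needed.
  cub::DeviceCopy::Batched(
    d_temp_storage,
    temp_storage_bytes,
    thrust::raw_pointer_cast(srcs.data()),
    thrust::raw_pointer_cast(dsts.data()),
    thrust::raw_pointer_cast(sizes.data()),
    2);

  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

  // Second call: perform the batched copy, then wait for it to finish.
  cub::DeviceCopy::Batched(
    d_temp_storage,
    temp_storage_bytes,
    thrust::raw_pointer_cast(srcs.data()),
    thrust::raw_pointer_cast(dsts.data()),
    thrust::raw_pointer_cast(sizes.data()),
    2);
  cudaDeviceSynchronize();
}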
14 changes: 10 additions & 4 deletions c2h/include/c2h/catch2_main.cuh → c2h/include/c2h/catch2_main.h
@@ -27,6 +27,8 @@

#pragma once

#include <thrust/detail/config/device_system.h>

#include <iostream>

//! @file
@@ -41,16 +43,19 @@
#include <catch2/catch.hpp>

#if defined(CUB_CONFIG_MAIN)
# include <c2h/catch2_runner_helper.cuh>
# if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
# include <c2h/catch2_runner_helper.h>

# if !defined(CUB_EXCLUDE_CATCH2_HELPER_IMPL)
# include "catch2_runner_helper.inl"
# endif
# ifndef CUB_EXCLUDE_CATCH2_HELPER_IMPL
# include "catch2_runner_helper.inl"
# endif // !CUB_EXCLUDE_CATCH2_HELPER_IMPL
# endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA

int main(int argc, char* argv[])
{
Catch::Session session;

# if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
int device_id{};

// Build a new parser on top of Catch's
@@ -65,6 +70,7 @@ int main(int argc, char* argv[])
}

set_device(device_id);
# endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
return session.run(argc, argv);
}
#endif
File renamed without changes.
@@ -27,29 +27,27 @@

#pragma once

#include <cub/util_compiler.cuh>
#include <cuda/std/detail/__config>

#include <cuda/std/__cccl/diagnostic.h>
#include <cuda/std/bit>
#include <cuda/std/cmath>
#include <cuda/std/type_traits>
#include <cuda/std/utility>

#include <cstdint>
#include <cstdlib>
#include <tuple>
#include <type_traits>

#if __CUDACC_VER_MAJOR__ == 11
_CCCL_NV_DIAG_SUPPRESS(177) // catch2 may contain unused variableds
_CCCL_NV_DIAG_SUPPRESS(177) // catch2 may contain unused variables
#endif // nvcc-11

#include <cuda/std/bit>
#include <cuda/std/cmath>
#include <cuda/std/type_traits>
#include <cuda/std/utility>

#include <c2h/catch2_main.cuh>
#include <c2h/device_policy.cuh>
#include <c2h/test_util_vec.cuh>
#include <c2h/utility.cuh>
#include <c2h/vector.cuh>
#include <c2h/catch2_main.h>
#include <c2h/device_policy.h>
#include <c2h/test_util_vec.h>
#include <c2h/utility.h>
#include <c2h/vector.h>

#ifndef VAR_IDX
# define VAR_IDX 0
@@ -136,10 +134,7 @@ struct bitwise_equal
template <typename T>
bool operator()(const T& a, const T& b) const
{
using bits_t = typename cub::Traits<T>::UnsignedBits;
bits_t a_bits = ::cuda::std::bit_cast<bits_t>(a);
bits_t b_bits = ::cuda::std::bit_cast<bits_t>(b);
return a_bits == b_bits;
return ::cuda::std::memcmp(&a, &b, sizeof(T)) == 0;
}
};

@@ -230,8 +225,8 @@ struct Catch::StringMaker<cudaError>
}
};

#include <c2h/custom_type.cuh>
#include <c2h/generators.cuh>
#include <c2h/custom_type.h>
#include <c2h/generators.h>

#define C2H_TEST_NAME_IMPL(NAME, PARAM) C2H_TEST_STR(NAME) "(" C2H_TEST_STR(PARAM) ")"

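For orientation, a hedged sketch of how a test translation unit typically consumes this helper. The C2H_TEST macro name comes from the C2H_TEST_NAME_IMPL definition above and the gen() overloads from generators.cu earlier in this commit; C2H_SEED and the test body are assumptions modelled on the rest of the CUB test suite, not part of this diff.

// Hypothetical test TU that includes the catch2 test helper shown above.
C2H_TEST("c2h generators fill device vectors", "[c2h][example]")
{
  c2h::device_vector<int> data(1024);

  // Fill with random values in [0, 42]; C2H_SEED(2) is assumed to rerun the
  // test body with two different seeds.
  c2h::gen(C2H_SEED(2), data, 0, 42);

  REQUIRE(data.size() == std::size_t{1024});
}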
File renamed without changes.
File renamed without changes.
@@ -27,21 +27,18 @@

#pragma once

#include <thrust/system/cuda/execution_policy.h>

#include <type_traits>
#include <thrust/execution_policy.h>

#include <c2h/checked_allocator.cuh>

namespace c2h
{

using device_policy_t =
typename std::remove_reference<decltype(thrust::cuda::par(checked_cuda_allocator<char>{}))>::type;
static const device_policy_t device_policy = thrust::cuda::par(checked_cuda_allocator<char>{});

using nosync_device_policy_t =
typename std::remove_reference<decltype(thrust::cuda::par_nosync(checked_cuda_allocator<char>{}))>::type;
static const nosync_device_policy_t nosync_device_policy = thrust::cuda::par_nosync(checked_cuda_allocator<char>{});
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
static const auto device_policy = thrust::cuda::par(checked_cuda_allocator<char>{});
static const auto nosync_device_policy = thrust::cuda::par_nosync(checked_cuda_allocator<char>{});
#else // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
static const auto device_policy = thrust::device;
static const auto nosync_device_policy = thrust::device;
#endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA

} // namespace c2h
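The net effect of the change above is that callers keep passing c2h::device_policy unconditionally, whatever backend Thrust is configured for. A minimal usage sketch; the function name, element type and fill value are arbitrary:

#include <thrust/fill.h>

#include <c2h/device_policy.h>
#include <c2h/vector.h>

void fill_with_answer(c2h::device_vector<int>& data)
{
  // Resolves to thrust::cuda::par with the checked allocator on the CUDA
  // backend, and to plain thrust::device on the host backends, per the #if above.
  thrust::fill(c2h::device_policy, data.begin(), data.end(), 42);
}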
File renamed without changes.
File renamed without changes.
15 changes: 8 additions & 7 deletions c2h/include/c2h/generators.cuh → c2h/include/c2h/generators.h
@@ -27,16 +27,16 @@

#pragma once

#include <cub/util_type.cuh> // __CUDA_FP8_TYPES_EXIST__
#include <thrust/detail/config/device_system.h>

#include <limits>

#include <c2h/custom_type.cuh>
#include <c2h/vector.cuh>

#if defined(__CUDA_FP8_TYPES_EXIST__)
# include <cuda_fp8.h>
#include <c2h/custom_type.h>
#include <c2h/vector.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
# include <cub/util_type.cuh> // for <cuda_fp8.h>
# if defined(__CUDA_FP8_TYPES_EXIST__)
namespace std
{
template <>
@@ -69,7 +69,8 @@ class numeric_limits<__nv_fp8_e5m2>
}
};
} // namespace std
#endif // defined(__CUDA_FP8_TYPES_EXIST__)
# endif // defined(__CUDA_FP8_TYPES_EXIST__)
#endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA

namespace c2h
{
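For context, the std::numeric_limits specializations above let FP8 keys be generated through the same bounds-based interface as the built-in types. A hedged sketch, assuming the usual min()/max() members (the class bodies are elided above) and a TU that already includes this header:

#if defined(__CUDA_FP8_TYPES_EXIST__)
void fp8_bounds_sketch()
{
  // Query the representable range of the e4m3 format through std::numeric_limits,
  // exactly as generic test code would do for float or int.
  const __nv_fp8_e4m3 lo = ::std::numeric_limits<__nv_fp8_e4m3>::min();
  const __nv_fp8_e4m3 hi = ::std::numeric_limits<__nv_fp8_e4m3>::max();
  (void) lo;
  (void) hi;
}
#endif // __CUDA_FP8_TYPES_EXIST__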