Avoid CUDA code in C2H (#2734)
bernhardmgruber authored Nov 11, 2024
1 parent f305eea commit 79f0ccc
Showing 115 changed files with 714 additions and 659 deletions.
2 changes: 1 addition & 1 deletion c2h/catch2_runner.cpp
@@ -31,4 +31,4 @@

#define CUB_CONFIG_MAIN
#define CUB_EXCLUDE_CATCH2_HELPER_IMPL
#include <c2h/catch2_main.cuh>
#include <c2h/catch2_main.h>
2 changes: 1 addition & 1 deletion c2h/catch2_runner_helper.inl
@@ -28,7 +28,7 @@
#pragma once

//! @file
//! This file includes implementation of CUDA-specific utilities for custom Catch2 main When CMake is configured to
//! This file includes implementation of CUDA-specific utilities for custom Catch2 main. When CMake is configured to
//! include all the tests into a single executable, this file is only included into catch2_runner_helper.cu. When CMake
//! is configured to compile each test as a separate binary, this file is included into each test.

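For context, the two inclusion modes described in the comment above look roughly as follows. This is a sketch only: the test file name and the exact CMake wiring are assumptions; only catch2_runner_helper.inl, catch2_runner_helper.h, catch2_main.h and the two CUB_* macros appear in this commit.

// Single test executable: the CUDA helpers are compiled once, in a dedicated TU.
// catch2_runner_helper.cu:
#include <c2h/catch2_runner_helper.h>
#include "catch2_runner_helper.inl"

// One binary per test: each test TU builds its own main and pulls the helper
// implementation in through catch2_main.h, because it does not define
// CUB_EXCLUDE_CATCH2_HELPER_IMPL (catch2_runner.cpp above does).
// hypothetical_test.cu:
#define CUB_CONFIG_MAIN
#include <c2h/catch2_main.h>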
115 changes: 80 additions & 35 deletions c2h/generators.cu
@@ -44,12 +44,12 @@

#include <cstdint>

#include <c2h/custom_type.cuh>
#include <c2h/device_policy.cuh>
#include <c2h/extended_types.cuh>
#include <c2h/fill_striped.cuh>
#include <c2h/generators.cuh>
#include <c2h/vector.cuh>
#include <c2h/custom_type.h>
#include <c2h/device_policy.h>
#include <c2h/extended_types.h>
#include <c2h/fill_striped.h>
#include <c2h/generators.h>
#include <c2h/vector.h>

#if C2H_HAS_CURAND
# include <curand.h>
@@ -118,7 +118,30 @@ private:
c2h::device_vector<float> m_distribution;
};

template <typename T, cub::Category = cub::Traits<T>::CATEGORY>
// TODO(bgruber): modelled after cub::Traits. We should generalize this somewhere into libcu++.
template <typename T>
struct is_floating_point : ::cuda::std::is_floating_point<T>
{};
#ifdef _CCCL_HAS_NVFP16
template <>
struct is_floating_point<__half> : ::cuda::std::true_type
{};
#endif // _CCCL_HAS_NVFP16
#ifdef _CCCL_HAS_NVBF16
template <>
struct is_floating_point<__nv_bfloat16> : ::cuda::std::true_type
{};
#endif // _CCCL_HAS_NVBF16
#ifdef __CUDA_FP8_TYPES_EXIST__
template <>
struct is_floating_point<__nv_fp8_e4m3> : ::cuda::std::true_type
{};
template <>
struct is_floating_point<__nv_fp8_e5m2> : ::cuda::std::true_type
{};
#endif // __CUDA_FP8_TYPES_EXIST__

template <typename T, bool = is_floating_point<T>::value>
struct random_to_item_t
{
float m_min;
@@ -136,7 +159,7 @@ struct random_to_item_t
};

template <typename T>
struct random_to_item_t<T, cub::FLOATING_POINT>
struct random_to_item_t<T, true>
{
using storage_t = ::cuda::std::_If<(sizeof(T) > 4), double, float>;
storage_t m_min;
@@ -182,6 +205,7 @@ RANDOM_TO_VEC_ITEM_SPEC(0, x);
RANDOM_TO_VEC_ITEM_SPEC(1, y);
RANDOM_TO_VEC_ITEM_SPEC(2, z);
RANDOM_TO_VEC_ITEM_SPEC(3, w);
#undef RANDOM_TO_VEC_ITEM_SPEC

generator_t::generator_t()
{
@@ -413,17 +437,37 @@ void init_key_segments(const c2h::device_vector<OffsetT>& segment_offsets, KeyT*
auto d_range_dsts = thrust::make_transform_iterator(d_offsets, dst_transform_op);
auto d_range_sizes = thrust::make_transform_iterator(iota, offset_to_size_t<OffsetT>{d_offsets});

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes = 0;
// TODO(bgruber): replace by a non-CUB implementation
cub::DeviceCopy::Batched(
d_temp_storage, temp_storage_bytes, d_range_srcs, d_range_dsts, d_range_sizes, total_segments);

c2h::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// TODO(bgruber): replace by a non-CUB implementation
cub::DeviceCopy::Batched(
d_temp_storage, temp_storage_bytes, d_range_srcs, d_range_dsts, d_range_sizes, total_segments);
cudaDeviceSynchronize();
#else // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
static_assert(sizeof(OffsetT) == 0, "Need to implement a non-CUB version of cub::DeviceCopy::Batched");
// TODO(bgruber): implement and *test* a non-CUB version, here is a sketch:
// thrust::for_each(
// thrust::device,
// thrust::counting_iterator<OffsetT>{0},
// thrust::counting_iterator<OffsetT>{total_segments},
// [&](OffsetT i) {
// const auto value = d_range_srcs[i];
// const auto start = d_range_sizes[i];
// const auto end = d_range_sizes[i + 1];
// for (auto j = start; j < end; ++j)
// {
// d_range_dsts[j] = value;
// }
// });
#endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
}

template void init_key_segments(
@@ -434,15 +478,15 @@ template void
init_key_segments(const c2h::device_vector<std::uint32_t>& segment_offsets, float* out, std::size_t element_size);
template void init_key_segments(
const c2h::device_vector<std::uint32_t>& segment_offsets, custom_type_state_t* out, std::size_t element_size);
#ifdef TEST_HALF_T
#ifdef _CCCL_HAS_NVFP16
template void
init_key_segments(const c2h::device_vector<std::uint32_t>& segment_offsets, half_t* out, std::size_t element_size);
#endif
#endif // _CCCL_HAS_NVFP16

#ifdef TEST_BF_T
#ifdef _CCCL_HAS_NVBF16
template void
init_key_segments(const c2h::device_vector<std::uint32_t>& segment_offsets, bfloat16_t* out, std::size_t element_size);
#endif
#endif // _CCCL_HAS_NVBF16
} // namespace detail

template <typename T>
@@ -482,7 +526,6 @@ void gen(modulo_t mod, c2h::device_vector<T>& data)
}

#define INSTANTIATE_RND(TYPE) template void gen<TYPE>(seed_t, c2h::device_vector<TYPE> & data, TYPE min, TYPE max)

#define INSTANTIATE_MOD(TYPE) template void gen<TYPE>(modulo_t, c2h::device_vector<TYPE> & data)

#define INSTANTIATE(TYPE) \
@@ -509,14 +552,19 @@ INSTANTIATE(double);
INSTANTIATE(bool);
INSTANTIATE(char);

#ifdef TEST_HALF_T
#ifdef _CCCL_HAS_NVFP16
INSTANTIATE(half_t);
#endif
#endif // _CCCL_HAS_NVFP16

#ifdef TEST_BF_T
#ifdef _CCCL_HAS_NVBF16
INSTANTIATE(bfloat16_t);
#endif
#endif // _CCCL_HAS_NVBF16

#undef INSTANTIATE_RND
#undef INSTANTIATE_MOD
#undef INSTANTIATE

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
template <typename T, int VecItem>
struct vec_gen_helper_t;

@@ -546,14 +594,14 @@ struct vec_gen_helper_t
}
};

#define VEC_SPECIALIZATION(TYPE, SIZE) \
template <> \
void gen<TYPE##SIZE>(seed_t seed, c2h::device_vector<TYPE##SIZE> & data, TYPE##SIZE min, TYPE##SIZE max) \
{ \
generator_t& generator = generator_t::instance(); \
generator.prepare_random_generator(seed, data.size()); \
vec_gen_helper_t<TYPE##SIZE, SIZE - 1>::gen(data, min, max); \
}
# define VEC_SPECIALIZATION(TYPE, SIZE) \
template <> \
void gen<TYPE##SIZE>(seed_t seed, c2h::device_vector<TYPE##SIZE> & data, TYPE##SIZE min, TYPE##SIZE max) \
{ \
generator_t& generator = generator_t::instance(); \
generator.prepare_random_generator(seed, data.size()); \
vec_gen_helper_t<TYPE##SIZE, SIZE - 1>::gen(data, min, max); \
}

VEC_SPECIALIZATION(char, 2);
VEC_SPECIALIZATION(char, 3);
@@ -620,19 +668,16 @@ struct vec_gen_t
}
};

#define VEC_GEN_MOD_SPECIALIZATION(VEC_TYPE, SCALAR_TYPE) \
template <> \
void gen<VEC_TYPE>(modulo_t mod, c2h::device_vector<VEC_TYPE> & data) \
{ \
thrust::tabulate(c2h::device_policy, data.begin(), data.end(), vec_gen_t<VEC_TYPE, SCALAR_TYPE>{mod.get()}); \
}
# define VEC_GEN_MOD_SPECIALIZATION(VEC_TYPE, SCALAR_TYPE) \
template <> \
void gen<VEC_TYPE>(modulo_t mod, c2h::device_vector<VEC_TYPE> & data) \
{ \
thrust::tabulate(c2h::device_policy, data.begin(), data.end(), vec_gen_t<VEC_TYPE, SCALAR_TYPE>{mod.get()}); \
}

VEC_GEN_MOD_SPECIALIZATION(short2, short);

VEC_GEN_MOD_SPECIALIZATION(uchar3, unsigned char);

VEC_GEN_MOD_SPECIALIZATION(ulonglong4, unsigned long long);

VEC_GEN_MOD_SPECIALIZATION(ushort4, unsigned short);

#endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
} // namespace c2h
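As an aside on the hunk above: the pair of cub::DeviceCopy::Batched calls follows CUB's usual two-phase temporary-storage idiom, where the first call with a null d_temp_storage only reports the required byte count and the second call performs the work. A minimal self-contained sketch of that idiom follows; the buffers, sizes and function name are made up for illustration and are not part of this commit.

#include <cub/device/device_copy.cuh>

#include <thrust/device_vector.h>

#include <cstdint>
#include <vector>

void batched_copy_sketch()
{
  thrust::device_vector<int> src(6, 42);
  thrust::device_vector<int> dst(6);
  int* s = thrust::raw_pointer_cast(src.data());
  int* d = thrust::raw_pointer_cast(dst.data());

  // Two ranges of three elements each, described by begin pointers and sizes.
  thrust::device_vector<int*> srcs(std::vector<int*>{s, s + 3});
  thrust::device_vector<int*> dsts(std::vector<int*>{d, d + 3});
  thrust::device_vector<int> sizes(std::vector<int>{3, 3});

  std::uint8_t* d_temp_storage = nullptr;
  std::size_t temp_storage_bytes = 0;
  // First call: query the amount of temporary storage needed.
  cub::DeviceCopy::Batched(
    d_temp_storage,
    temp_storage_bytes,
    thrust::raw_pointer_cast(srcs.data()),
    thrust::raw_pointer_cast(dsts.data()),
    thrust::raw_pointer_cast(sizes.data()),
    2);

  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

  // Second call: perform the batched copy, then wait for it to finish.
  cub::DeviceCopy::Batched(
    d_temp_storage,
    temp_storage_bytes,
    thrust::raw_pointer_cast(srcs.data()),
    thrust::raw_pointer_cast(dsts.data()),
    thrust::raw_pointer_cast(sizes.data()),
    2);
  cudaDeviceSynchronize();
}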
14 changes: 10 additions & 4 deletions c2h/include/c2h/catch2_main.cuh → c2h/include/c2h/catch2_main.h
@@ -27,6 +27,8 @@

#pragma once

#include <thrust/detail/config/device_system.h>

#include <iostream>

//! @file
@@ -41,16 +43,19 @@
#include <catch2/catch.hpp>

#if defined(CUB_CONFIG_MAIN)
# include <c2h/catch2_runner_helper.cuh>
# if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
# include <c2h/catch2_runner_helper.h>

# if !defined(CUB_EXCLUDE_CATCH2_HELPER_IMPL)
# include "catch2_runner_helper.inl"
# endif
# ifndef CUB_EXCLUDE_CATCH2_HELPER_IMPL
# include "catch2_runner_helper.inl"
# endif // !CUB_EXCLUDE_CATCH2_HELPER_IMPL
# endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA

int main(int argc, char* argv[])
{
Catch::Session session;

# if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
int device_id{};

// Build a new parser on top of Catch's
@@ -65,6 +70,7 @@ int main(int argc, char* argv[])
}

set_device(device_id);
# endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
return session.run(argc, argv);
}
#endif
File renamed without changes.
@@ -27,29 +27,27 @@

#pragma once

#include <cub/util_compiler.cuh>
#include <cuda/std/detail/__config>

#include <cuda/std/__cccl/diagnostic.h>
#include <cuda/std/bit>
#include <cuda/std/cmath>
#include <cuda/std/type_traits>
#include <cuda/std/utility>

#include <cstdint>
#include <cstdlib>
#include <tuple>
#include <type_traits>

#if __CUDACC_VER_MAJOR__ == 11
_CCCL_NV_DIAG_SUPPRESS(177) // catch2 may contain unused variableds
_CCCL_NV_DIAG_SUPPRESS(177) // catch2 may contain unused variables
#endif // nvcc-11

#include <cuda/std/bit>
#include <cuda/std/cmath>
#include <cuda/std/type_traits>
#include <cuda/std/utility>

#include <c2h/catch2_main.cuh>
#include <c2h/device_policy.cuh>
#include <c2h/test_util_vec.cuh>
#include <c2h/utility.cuh>
#include <c2h/vector.cuh>
#include <c2h/catch2_main.h>
#include <c2h/device_policy.h>
#include <c2h/test_util_vec.h>
#include <c2h/utility.h>
#include <c2h/vector.h>

#ifndef VAR_IDX
# define VAR_IDX 0
@@ -136,10 +134,7 @@ struct bitwise_equal
template <typename T>
bool operator()(const T& a, const T& b) const
{
using bits_t = typename cub::Traits<T>::UnsignedBits;
bits_t a_bits = ::cuda::std::bit_cast<bits_t>(a);
bits_t b_bits = ::cuda::std::bit_cast<bits_t>(b);
return a_bits == b_bits;
return ::cuda::std::memcmp(&a, &b, sizeof(T)) == 0;
}
};

@@ -230,8 +225,8 @@ struct Catch::StringMaker<cudaError>
}
};

#include <c2h/custom_type.cuh>
#include <c2h/generators.cuh>
#include <c2h/custom_type.h>
#include <c2h/generators.h>

#define C2H_TEST_NAME_IMPL(NAME, PARAM) C2H_TEST_STR(NAME) "(" C2H_TEST_STR(PARAM) ")"

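For orientation, a hedged sketch of how a test translation unit typically consumes this helper. The C2H_TEST macro name comes from the C2H_TEST_NAME_IMPL definition above and the gen() overloads from generators.cu earlier in this commit; C2H_SEED and the test body are assumptions modelled on the rest of the CUB test suite, not part of this diff.

// Hypothetical test TU that includes the catch2 test helper shown above.
C2H_TEST("c2h generators fill device vectors", "[c2h][example]")
{
  c2h::device_vector<int> data(1024);

  // Fill with random values in [0, 42]; C2H_SEED(2) is assumed to rerun the
  // test body with two different seeds.
  c2h::gen(C2H_SEED(2), data, 0, 42);

  REQUIRE(data.size() == std::size_t{1024});
}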
File renamed without changes.
File renamed without changes.
@@ -27,21 +27,18 @@

#pragma once

#include <thrust/system/cuda/execution_policy.h>

#include <type_traits>
#include <thrust/execution_policy.h>

#include <c2h/checked_allocator.cuh>

namespace c2h
{

using device_policy_t =
typename std::remove_reference<decltype(thrust::cuda::par(checked_cuda_allocator<char>{}))>::type;
static const device_policy_t device_policy = thrust::cuda::par(checked_cuda_allocator<char>{});

using nosync_device_policy_t =
typename std::remove_reference<decltype(thrust::cuda::par_nosync(checked_cuda_allocator<char>{}))>::type;
static const nosync_device_policy_t nosync_device_policy = thrust::cuda::par_nosync(checked_cuda_allocator<char>{});
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
static const auto device_policy = thrust::cuda::par(checked_cuda_allocator<char>{});
static const auto nosync_device_policy = thrust::cuda::par_nosync(checked_cuda_allocator<char>{});
#else // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
static const auto device_policy = thrust::device;
static const auto nosync_device_policy = thrust::device;
#endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA

} // namespace c2h
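The net effect of the change above is that callers keep passing c2h::device_policy unconditionally, whatever backend Thrust is configured for. A minimal usage sketch; the function name, element type and fill value are arbitrary:

#include <thrust/fill.h>

#include <c2h/device_policy.h>
#include <c2h/vector.h>

void fill_with_answer(c2h::device_vector<int>& data)
{
  // Resolves to thrust::cuda::par with the checked allocator on the CUDA
  // backend, and to plain thrust::device on the host backends, per the #if above.
  thrust::fill(c2h::device_policy, data.begin(), data.end(), 42);
}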
File renamed without changes.
File renamed without changes.
15 changes: 8 additions & 7 deletions c2h/include/c2h/generators.cuh → c2h/include/c2h/generators.h
@@ -27,16 +27,16 @@

#pragma once

#include <cub/util_type.cuh> // __CUDA_FP8_TYPES_EXIST__
#include <thrust/detail/config/device_system.h>

#include <limits>

#include <c2h/custom_type.cuh>
#include <c2h/vector.cuh>

#if defined(__CUDA_FP8_TYPES_EXIST__)
# include <cuda_fp8.h>
#include <c2h/custom_type.h>
#include <c2h/vector.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
# include <cub/util_type.cuh> // for <cuda_fp8.h>
# if defined(__CUDA_FP8_TYPES_EXIST__)
namespace std
{
template <>
@@ -69,7 +69,8 @@ class numeric_limits<__nv_fp8_e5m2>
}
};
} // namespace std
#endif // defined(__CUDA_FP8_TYPES_EXIST__)
# endif // defined(__CUDA_FP8_TYPES_EXIST__)
#endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA

namespace c2h
{
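For context, the std::numeric_limits specializations above let FP8 keys be generated through the same bounds-based interface as the built-in types. A hedged sketch, assuming the usual min()/max() members (the class bodies are elided above) and a TU that already includes this header:

#if defined(__CUDA_FP8_TYPES_EXIST__)
void fp8_bounds_sketch()
{
  // Query the representable range of the e4m3 format through std::numeric_limits,
  // exactly as generic test code would do for float or int.
  const __nv_fp8_e4m3 lo = ::std::numeric_limits<__nv_fp8_e4m3>::min();
  const __nv_fp8_e4m3 hi = ::std::numeric_limits<__nv_fp8_e4m3>::max();
  (void) lo;
  (void) hi;
}
#endif // __CUDA_FP8_TYPES_EXIST__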