Skip to content

Commit

Permalink
Catch2 segmented sort (NVIDIA#1484)
Browse files Browse the repository at this point in the history
* Make NullType more convenient.

- Allow explicit construction from any type to facilitate generic programming.
  A similar assignment operator already existed.
- Make ==/!= operators into friend functions. This fixes compat with thrust::device_reference
  in testing code.

* Make half_t and bfloat16_t device_reference compatible.

thrust::device_reference does not compile when operator== is a member function.
Changing to friend functions WAR the issue.

* Allow conversion of double -> half_t/bfloat116_t.

* Make half/bfloat16 wrapper ctors expliict.

This should fix some ambiguous overload issues we're seeing on CI.

* Add c2h::nosync_device_policy.

* Add simple c2h::type_name utility.

This is intended to help with Catch2's CAPTURE macro:

```
CAPTURE(c2h::type_name<KeyT>(), c2h::type_name<ValueT>);

output on failure:
  c2h::type_name<KeyT>() := "h"
  c2h::type_name<ValueT>() := "N3cub25CUB_200300_600_700_800_NS8NullTypeE"
```

ABI demangling would be a nice improvement for later.

* Improvements to c2h::cpu_timer.

- Add macros that can be enabled using `-DC2H_DEBUG_TIMING`.
- Add RAII scoped_cpu_timer + macro.
- Increase precision of output from ms -> us.

* Port DeviceSegmentedSort tests to catch2.

* Address live-review feedback.

* Use void-cast instead of cuda::std::ignore to WAR warnings.

 /home/coder/cccl/cub/test/catch2_segmented_sort_helper.cuh(503): error NVIDIA#174-D: expression has no effect

* Add support for unstable sort, address review feedback.

* Remove duplicate increment, leftover after while->for conversion.

* Update CUB_IF_CONSTEXPR to _CCCL_IF_CONSTEXPR
  • Loading branch information
alliepiper authored Mar 27, 2024
1 parent 20a4866 commit e3758cf
Show file tree
Hide file tree
Showing 14 changed files with 2,073 additions and 1,985 deletions.
26 changes: 20 additions & 6 deletions cub/cub/util_type.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -280,16 +280,30 @@ struct CUB_DEPRECATED RemoveQualifiers
*/
struct NullType
{
using value_type = NullType;
using value_type = NullType;

template <typename T>
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE NullType& operator =(const T&) { return *this; }
NullType() = default;

_CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator ==(const NullType&) { return true; }
template <typename T>
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE explicit NullType(const T&)
{}

_CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator !=(const NullType&) { return false; }
};
template <typename T>
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE NullType& operator=(const T&)
{
return *this;
}

friend _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const NullType&, const NullType&)
{
return true;
}

friend _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const NullType&, const NullType&)
{
return false;
}
};

/**
* \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)
Expand Down
25 changes: 16 additions & 9 deletions cub/test/bfloat16.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,21 +61,28 @@ struct bfloat16_t

/// Constructor from __nv_bfloat16
__host__ __device__ __forceinline__
bfloat16_t(const __nv_bfloat16 &other)
explicit bfloat16_t(const __nv_bfloat16 &other)
{
__x = reinterpret_cast<const uint16_t&>(other);
}

/// Constructor from integer
__host__ __device__ __forceinline__
bfloat16_t(int a)
explicit bfloat16_t(int a)
{
*this = bfloat16_t(float(a));
}

/// Constructor from std::size_t
__host__ __device__ __forceinline__
bfloat16_t(std::size_t a)
explicit bfloat16_t(std::size_t a)
{
*this = bfloat16_t(float(a));
}

/// Constructor from double
__host__ __device__ __forceinline__
explicit bfloat16_t(double a)
{
*this = bfloat16_t(float(a));
}
Expand All @@ -85,7 +92,7 @@ struct bfloat16_t
typename = typename ::cuda::std::enable_if<
::cuda::std::is_same<T, unsigned long long int>::value
&& (!::cuda::std::is_same<std::size_t, unsigned long long int>::value)>::type>
__host__ __device__ __forceinline__ bfloat16_t(T a)
__host__ __device__ __forceinline__ explicit bfloat16_t(T a)
{
*this = bfloat16_t(float(a));
}
Expand All @@ -95,7 +102,7 @@ struct bfloat16_t

/// Constructor from float
__host__ __device__ __forceinline__
bfloat16_t(float a)
explicit bfloat16_t(float a)
{
// Refrence:
// https://github.com/pytorch/pytorch/blob/44cc873fba5e5ffc4d4d4eef3bd370b653ce1ce1/c10/util/BFloat16.h#L51
Expand Down Expand Up @@ -142,16 +149,16 @@ struct bfloat16_t

/// Equality
__host__ __device__ __forceinline__
bool operator ==(const bfloat16_t &other) const
friend bool operator ==(const bfloat16_t &a, const bfloat16_t &b)
{
return (this->__x == other.__x);
return (a.__x == b.__x);
}

/// Inequality
__host__ __device__ __forceinline__
bool operator !=(const bfloat16_t &other) const
friend bool operator !=(const bfloat16_t &a, const bfloat16_t &b)
{
return (this->__x != other.__x);
return (a.__x != b.__x);
}

/// Assignment by sum
Expand Down
53 changes: 48 additions & 5 deletions cub/test/c2h/cpu_timer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,30 @@

#pragma once

#include <cuda/std/tuple>

#include <chrono>
#include <iostream>
#include <string>

//#define C2H_DEBUG_TIMING

#ifdef C2H_DEBUG_TIMING
# define C2H_TIME_SECTION_INIT() \
c2h::cpu_timer _c2h_timer_; \
(void) _c2h_timer_
# define C2H_TIME_SECTION_RESET() _c2h_timer_.reset()
# define C2H_TIME_SECTION(label) _c2h_timer_.print_elapsed_seconds_and_reset(label)
# define C2H_TIME_SCOPE(label) \
c2h::scoped_cpu_timer _c2h_scoped_cpu_timer_(label); \
(void) _c2h_scoped_cpu_timer_
#else
# define C2H_TIME_SECTION_INIT() /* no-op */ []() {}()
# define C2H_TIME_SECTION_RESET() /* no-op */ []() {}()
# define C2H_TIME_SECTION(label) /* no-op */ []() {}()
# define C2H_TIME_SCOPE(label) /* no-op */ []() {}()
#endif

namespace c2h
{

Expand All @@ -40,7 +60,7 @@ class cpu_timer

public:
cpu_timer()
: m_start(std::chrono::high_resolution_clock::now())
: m_start(std::chrono::high_resolution_clock::now())
{}

void reset()
Expand All @@ -51,20 +71,43 @@ public:
int elapsed_ms() const
{
auto duration = std::chrono::high_resolution_clock::now() - m_start;
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(duration);
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(duration);
return static_cast<int>(ms.count());
}

std::uint64_t elapsed_us() const
{
auto duration = std::chrono::high_resolution_clock::now() - m_start;
auto us = std::chrono::duration_cast<std::chrono::microseconds>(duration);
return static_cast<std::uint64_t>(us.count());
}

void print_elapsed_seconds(const std::string& label)
{
std::cout << label << ": " << (this->elapsed_ms() / 1000.f) << "s\n";
printf("%0.6f s: %s\n", this->elapsed_us() / 1000000.f, label.c_str());
}

void print_elapsed_seconds_and_reset(const std::string& label)
void print_elapsed_seconds_and_reset(const std::string& label)
{
this->print_elapsed_seconds(label);
this->reset();
}
};

}
class scoped_cpu_timer
{
cpu_timer m_timer;
std::string m_label;

public:
explicit scoped_cpu_timer(std::string label)
: m_label(std::move(label))
{}

~scoped_cpu_timer()
{
m_timer.print_elapsed_seconds(m_label);
}
};

} // namespace c2h
10 changes: 7 additions & 3 deletions cub/test/c2h/device_policy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

#pragma once

#include <thrust/execution_policy.h>
#include <thrust/system/cuda/execution_policy.h>

#include <type_traits>

Expand All @@ -36,8 +36,12 @@
namespace c2h
{

using device_policy_t = typename std::remove_reference<decltype(thrust::device(checked_cuda_allocator<char>{}))>::type;
using device_policy_t =
typename std::remove_reference<decltype(thrust::cuda::par(checked_cuda_allocator<char>{}))>::type;
static const device_policy_t device_policy = thrust::cuda::par(checked_cuda_allocator<char>{});

static const device_policy_t device_policy = thrust::device(checked_cuda_allocator<char>{});
using nosync_device_policy_t =
typename std::remove_reference<decltype(thrust::cuda::par_nosync(checked_cuda_allocator<char>{}))>::type;
static const nosync_device_policy_t nosync_device_policy = thrust::cuda::par_nosync(checked_cuda_allocator<char>{});

} // namespace c2h
2 changes: 2 additions & 0 deletions cub/test/c2h/generators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ class value_wrapper_t
T m_val{};

public:
using value_type = T;

explicit value_wrapper_t(T val)
: m_val(val)
{}
Expand Down
8 changes: 8 additions & 0 deletions cub/test/c2h/utility.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
#pragma once

#include <cstring>
#include <string>
#include <typeinfo>

namespace c2h
{
Expand All @@ -46,4 +48,10 @@ To bit_cast(const From& in)
return out;
}

template <typename T>
std::string type_name()
{
return typeid(T).name();
}

}
Loading

0 comments on commit e3758cf

Please sign in to comment.