diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index cfc47edcfe..2de6508384 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -440,10 +440,15 @@ struct SimdMin<__half> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __half2 operator()(__half2 a, __half2 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2half2_rn(::cuda::minimum<>{}(__half2float(a.x), __half2float(b.x)), + ::cuda::minimum<>{}(__half2float(a.y), __half2float(b.y))); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return __hmin2(a, b);), (return __halves2half2(__float2half(::cuda::minimum<>{}(__half2float(a.x), __half2float(b.x))), __float2half(::cuda::minimum<>{}(__half2float(a.y), __half2float(b.y))));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -470,11 +475,16 @@ struct SimdMin<__nv_bfloat16> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __nv_bfloat162 operator()(__nv_bfloat162 a, __nv_bfloat162 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2bfloat162_rn(::cuda::minimum<>{}(__bfloat162float(a.x), __bfloat162float(b.x)), + ::cuda::minimum<>{}(__bfloat162float(a.y), __bfloat162float(b.y))); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return __hmin2(a, b);), (return cub::internal::halves2bfloat162( __float2bfloat16(::cuda::minimum<>{}(__bfloat162float(a.x), __bfloat162float(b.x))), __float2bfloat16(::cuda::minimum<>{}(__bfloat162float(a.y), __bfloat162float(b.y))));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -521,10 +531,15 @@ struct SimdMax<__half> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __half2 operator()(__half2 a, __half2 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2half2_rn(::cuda::maximum<>{}(__half2float(a.x), __half2float(b.x)), + ::cuda::maximum<>{}(__half2float(a.y), __half2float(b.y))); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return __hmax2(a, b);), (return __halves2half2(__float2half(::cuda::maximum<>{}(__half2float(a.x), __half2float(b.x))), __float2half(::cuda::maximum<>{}(__half2float(a.y), __half2float(b.y))));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -539,11 +554,16 @@ struct SimdMax<__nv_bfloat16> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __nv_bfloat162 operator()(__nv_bfloat162 a, __nv_bfloat162 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2bfloat162_rn(::cuda::maximum<>{}(__bfloat162float(a.x), __bfloat162float(b.x)), + ::cuda::maximum<>{}(__bfloat162float(a.y), __bfloat162float(b.y))); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return __hmax2(a, b);), (return cub::internal::halves2bfloat162( __float2bfloat16(::cuda::maximum<>{}(__bfloat162float(a.x), __bfloat162float(b.x))), __float2bfloat16(::cuda::maximum<>{}(__bfloat162float(a.y), __bfloat162float(b.y))));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -566,10 +586,14 @@ struct SimdSum<__half> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __half2 operator()(__half2 a, __half2 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2half2_rn(__half2float(a.x) + __half2float(b.x), __half2float(a.y) + __half2float(b.y)); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_53, (return __hadd2(a, b);), (return __halves2half2(__float2half(__half2float(a.x) + __half2float(b.x)), __float2half(__half2float(a.y) + __half2float(b.y)));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -584,11 +608,16 @@ struct SimdSum<__nv_bfloat16> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __nv_bfloat162 operator()(__nv_bfloat162 a, __nv_bfloat162 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2bfloat162_rn( + __bfloat162float(a.x) + __bfloat162float(b.x), __bfloat162float(a.y) + __bfloat162float(b.y)); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET( NV_PROVIDES_SM_80, (return __hadd2(a, b);), (return cub::internal::halves2bfloat162(__float2bfloat16(__bfloat162float(a.x) + __bfloat162float(b.x)), __float2bfloat16(__bfloat162float(a.y) + __bfloat162float(b.y)));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -611,10 +640,14 @@ struct SimdMul<__half> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __half2 operator()(__half2 a, __half2 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2half2_rn(__half2float(a.x) * __half2float(b.x), __half2float(a.y) * __half2float(b.y)); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_53, (return __hmul2(a, b);), (return __halves2half2(__float2half(__half2float(a.x) * __half2float(b.x)), __float2half(__half2float(a.y) * __half2float(b.y)));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -629,10 +662,15 @@ struct SimdMul<__nv_bfloat16> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __nv_bfloat162 operator()(__nv_bfloat162 a, __nv_bfloat162 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2bfloat162_rn( + __bfloat162float(a.x) * __bfloat162float(b.x), __bfloat162float(a.y) * __bfloat162float(b.y)); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return __hmul2(a, b);), (return halves2bfloat162(__float2bfloat16(__bfloat162float(a.x) * __bfloat162float(b.x)), __float2bfloat16(__bfloat162float(a.y) * __bfloat162float(b.y)));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } };