diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh
index cfc47edcfe7..f720ed9c078 100644
--- a/cub/cub/thread/thread_operators.cuh
+++ b/cub/cub/thread/thread_operators.cuh
@@ -440,10 +440,18 @@ struct SimdMin<__half>
 
   _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __half2 operator()(__half2 a, __half2 b) const
   {
+    // Fallback for NVHPC used as CUDA compiler with a toolkit older than 12.0: each lane is widened to float,
+    // reduced with cuda::minimum, and the pair is rounded back to __half2 with __floats2half2_rn.
+    // NOTE(review): presumably NV_IF_TARGET / __hmin2 is unusable in that configuration -- confirm.
+#  if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC)
+    return __floats2half2_rn(::cuda::minimum<>{}(__half2float(a.x), __half2float(b.x)),
+                             ::cuda::minimum<>{}(__half2float(a.y), __half2float(b.y)));
+#  else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv
     NV_IF_TARGET(NV_PROVIDES_SM_80,
                  (return __hmin2(a, b);),
                  (return __halves2half2(__float2half(::cuda::minimum<>{}(__half2float(a.x), __half2float(b.x))),
                                         __float2half(::cuda::minimum<>{}(__half2float(a.y), __half2float(b.y))));));
+#  endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC
   }
 };
 
@@ -470,11 +478,19 @@ struct SimdMin<__nv_bfloat16>
 
   _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __nv_bfloat162 operator()(__nv_bfloat162 a, __nv_bfloat162 b) const
   {
+    // Fallback for NVHPC used as CUDA compiler with a toolkit older than 12.0: each lane is widened to float,
+    // reduced with cuda::minimum, and the pair is rounded back to __nv_bfloat162 with __floats2bfloat162_rn.
+    // NOTE(review): presumably NV_IF_TARGET / __hmin2 is unusable in that configuration -- confirm.
+#  if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC)
+    return __floats2bfloat162_rn(::cuda::minimum<>{}(__bfloat162float(a.x), __bfloat162float(b.x)),
+                                 ::cuda::minimum<>{}(__bfloat162float(a.y), __bfloat162float(b.y)));
+#  else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv
     NV_IF_TARGET(NV_PROVIDES_SM_80,
                  (return __hmin2(a, b);),
                  (return cub::internal::halves2bfloat162(
                    __float2bfloat16(::cuda::minimum<>{}(__bfloat162float(a.x), __bfloat162float(b.x))),
                    __float2bfloat16(::cuda::minimum<>{}(__bfloat162float(a.y), __bfloat162float(b.y))));));
+#  endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC
   }
 };