diff --git a/benchmark/cublas/blas3/gemm.cpp b/benchmark/cublas/blas3/gemm.cpp
index 603c01f95..61f044c03 100644
--- a/benchmark/cublas/blas3/gemm.cpp
+++ b/benchmark/cublas/blas3/gemm.cpp
@@ -96,8 +96,6 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
   cublasOperation_t c_t_a = (*t_a == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;
   cublasOperation_t c_t_b = (*t_b == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;
 
-  constexpr const bool is_half = std::is_same_v<scalar_t, sycl::half>;
-
   cuda_scalar_t alpha_cuda = *reinterpret_cast<cuda_scalar_t*>(&alpha);
   cuda_scalar_t beta_cuda = *reinterpret_cast<cuda_scalar_t*>(&beta);
 
diff --git a/benchmark/cublas/blas3/gemm_batched.cpp b/benchmark/cublas/blas3/gemm_batched.cpp
index b80d11670..e84f1956b 100644
--- a/benchmark/cublas/blas3/gemm_batched.cpp
+++ b/benchmark/cublas/blas3/gemm_batched.cpp
@@ -34,12 +34,9 @@ static inline void cublas_routine(args_t&&... args) {
     CUBLAS_CHECK(cublasSgemmBatched(std::forward<args_t>(args)...));
   } else if constexpr (std::is_same_v<scalar_t, double>) {
     CUBLAS_CHECK(cublasDgemmBatched(std::forward<args_t>(args)...));
-  }
-#ifdef BLAS_ENABLE_HALF
-  else if constexpr (std::is_same_v<scalar_t, sycl::half>) {
+  } else if constexpr (std::is_same_v<scalar_t, sycl::half>) {
     CUBLAS_CHECK(cublasHgemmBatched(std::forward<args_t>(args)...));
   }
-#endif
   return;
 }
 
diff --git a/benchmark/cublas/blas3/gemm_batched_strided.cpp b/benchmark/cublas/blas3/gemm_batched_strided.cpp
index 3f0d49a5f..484f10e72 100644
--- a/benchmark/cublas/blas3/gemm_batched_strided.cpp
+++ b/benchmark/cublas/blas3/gemm_batched_strided.cpp
@@ -34,12 +34,9 @@ static inline void cublas_routine(args_t&&... args) {
     CUBLAS_CHECK(cublasSgemmStridedBatched(std::forward<args_t>(args)...));
   } else if constexpr (std::is_same_v<scalar_t, double>) {
     CUBLAS_CHECK(cublasDgemmStridedBatched(std::forward<args_t>(args)...));
-  }
-#ifdef BLAS_ENABLE_HALF
-  else if constexpr (std::is_same_v<scalar_t, sycl::half>) {
+  } else if constexpr (std::is_same_v<scalar_t, sycl::half>) {
     CUBLAS_CHECK(cublasHgemmStridedBatched(std::forward<args_t>(args)...));
   }
-#endif
   return;
 }
 
diff --git a/benchmark/rocblas/blas3/gemm.cpp b/benchmark/rocblas/blas3/gemm.cpp
index c7a164da7..8868f44f7 100644
--- a/benchmark/rocblas/blas3/gemm.cpp
+++ b/benchmark/rocblas/blas3/gemm.cpp
@@ -34,12 +34,9 @@ static inline void rocblas_gemm_f(args_t&&... args) {
     CHECK_ROCBLAS_STATUS(rocblas_sgemm(std::forward<args_t>(args)...));
   } else if constexpr (std::is_same_v<scalar_t, double>) {
     CHECK_ROCBLAS_STATUS(rocblas_dgemm(std::forward<args_t>(args)...));
-  }
-#ifdef BLAS_ENABLE_HALF
-  else if constexpr (std::is_same_v<scalar_t, sycl::half>) {
+  } else if constexpr (std::is_same_v<scalar_t, sycl::half>) {
     CHECK_ROCBLAS_STATUS(rocblas_hgemm(std::forward<args_t>(args)...));
   }
-#endif
   return;
 }
 
@@ -59,9 +56,6 @@ template <typename scalar_t>
 void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i,
          int t_b_i, index_t m, index_t k, index_t n, scalar_t alpha,
          scalar_t beta, bool* success) {
-  // scalar_t if scalar_t!=sycl::half, float otherwise
-  using ref_scalar_t =
-      typename blas_benchmark::utils::ReferenceType<scalar_t>::type;
   // scalar_t if scalar_t!=sycl::half, rocblas_half otherwise
   using rocm_scalar_t =
       typename blas_benchmark::utils::RocblasType<scalar_t>::type;
@@ -111,55 +105,20 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i,
   blas_benchmark::utils::HIPVector<rocm_scalar_t> c_gpu(
       c_size, reinterpret_cast<rocm_scalar_t*>(c.data()));
 
-  constexpr const bool is_half = std::is_same_v<scalar_t, sycl::half>;
-
-  rocm_scalar_t alpha_rocm, beta_rocm;
-
-  if constexpr (is_half) {
-#ifdef BLAS_ENABLE_HALF
-    // sycl::half to rocblas__half
-    alpha_rocm = *reinterpret_cast<rocm_scalar_t*>(&alpha);
-    beta_rocm = *reinterpret_cast<rocm_scalar_t*>(&beta);
-  } else {
-#endif
-    alpha_rocm = alpha;
-    beta_rocm = beta;
-  }
+  rocm_scalar_t alpha_rocm = *reinterpret_cast<rocm_scalar_t*>(&alpha);
+  rocm_scalar_t beta_rocm = *reinterpret_cast<rocm_scalar_t*>(&beta);
 
 #ifdef BLAS_VERIFY_BENCHMARK
   // Reference gemm
-  std::vector<ref_scalar_t> c_ref(n * m, 0);
-  std::vector<scalar_t> c_temp(n * m, 0);
-
-  if constexpr (is_half) {
-    // Float-type variables for reference ops
-    ref_scalar_t alpha_f = alpha;
-    ref_scalar_t beta_f = beta;
-    std::vector<ref_scalar_t> a_f(m * k);
-    std::vector<ref_scalar_t> b_f(k * n);
-
-    // sycl::half to float reference type
-    std::transform(a.begin(), a.end(), a_f.begin(),
-                   [](scalar_t x) { return (static_cast<ref_scalar_t>(x)); });
-    std::transform(b.begin(), b.end(), b_f.begin(),
-                   [](scalar_t x) { return (static_cast<ref_scalar_t>(x)); });
-
-    reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha_f, a_f.data(), lda,
-                         b_f.data(), ldb, beta_f, c_ref.data(), ldc);
+  std::vector<scalar_t> c_ref = c;
+  reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha, a.data(), lda,
+                       b.data(), ldb, beta, c_ref.data(), ldc);
 
+  // Rocblas verification gemm
+  std::vector<scalar_t> c_temp = c;
+  {
     blas_benchmark::utils::HIPVector<rocm_scalar_t, true> c_temp_gpu(
-        m * n, reinterpret_cast<rocm_scalar_t*>(c_temp.data()));
-
-    rocblas_gemm_f<scalar_t>(rb_handle, trans_a_rb, trans_b_rb, m, n, k,
-                             &alpha_rocm, a_gpu, lda, b_gpu, ldb, &beta_rocm,
-                             c_temp_gpu, ldc);
-
-  } else {
-    reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha, a.data(), lda,
-                         b.data(), ldb, beta, c_ref.data(), ldc);
-
-    blas_benchmark::utils::HIPVector<scalar_t, true> c_temp_gpu(m * n,
-                                                                c_temp.data());
+        c_size, reinterpret_cast<rocm_scalar_t*>(c_temp.data()));
 
     rocblas_gemm_f<scalar_t>(rb_handle, trans_a_rb, trans_b_rb, m, n, k,
                              &alpha_rocm, a_gpu, lda, b_gpu, ldb, &beta_rocm,
                              c_temp_gpu, ldc);
diff --git a/benchmark/rocblas/blas3/gemm_batched.cpp b/benchmark/rocblas/blas3/gemm_batched.cpp
index 50dc740a1..f73c176cf 100644
--- a/benchmark/rocblas/blas3/gemm_batched.cpp
+++ b/benchmark/rocblas/blas3/gemm_batched.cpp
@@ -34,12 +34,9 @@ static inline void rocblas_gemm_batched_f(args_t&&... args) {
     CHECK_ROCBLAS_STATUS(rocblas_sgemm_batched(std::forward<args_t>(args)...));
   } else if constexpr (std::is_same_v<scalar_t, double>) {
     CHECK_ROCBLAS_STATUS(rocblas_dgemm_batched(std::forward<args_t>(args)...));
-  }
-#ifdef BLAS_ENABLE_HALF
-  else if constexpr (std::is_same_v<scalar_t, sycl::half>) {
+  } else if constexpr (std::is_same_v<scalar_t, sycl::half>) {
     CHECK_ROCBLAS_STATUS(rocblas_hgemm_batched(std::forward<args_t>(args)...));
   }
-#endif
   return;
 }
 
@@ -59,9 +56,6 @@ template <typename scalar_t>
 void run(benchmark::State& state, rocblas_handle& rb_handle, index_t t_a_i,
          index_t t_b_i, index_t m, index_t k, index_t n, scalar_t alpha,
          scalar_t beta, index_t batch_size, int batch_type_i, bool* success) {
-  // scalar_t if scalar_t!=sycl::half, float otherwise
-  using ref_scalar_t =
-      typename blas_benchmark::utils::ReferenceType<scalar_t>::type;
   // scalar_t if scalar_t!=sycl::half, rocblas_half otherwise
   using rocm_scalar_t =
       typename blas_benchmark::utils::RocblasType<scalar_t>::type;
@@ -116,68 +110,27 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, index_t t_a_i,
   blas_benchmark::utils::HIPVectorBatched<rocm_scalar_t> c_batched_gpu(
       c_size, batch_size);
 
-  constexpr const bool is_half = std::is_same_v<scalar_t, sycl::half>;
-
-  rocm_scalar_t alpha_rocm, beta_rocm;
-
-  if constexpr (is_half) {
-#ifdef BLAS_ENABLE_HALF
-    // sycl::half to rocblas__half
-    alpha_rocm = *reinterpret_cast<rocm_scalar_t*>(&alpha);
-    beta_rocm = *reinterpret_cast<rocm_scalar_t*>(&beta);
-  } else {
-#endif
-    alpha_rocm = alpha;
-    beta_rocm = beta;
-  }
-
+  rocm_scalar_t alpha_rocm = *reinterpret_cast<rocm_scalar_t*>(&alpha);
+  rocm_scalar_t beta_rocm = *reinterpret_cast<rocm_scalar_t*>(&beta);
 #ifdef BLAS_VERIFY_BENCHMARK
-  std::vector<ref_scalar_t> c_ref(c_size * batch_size, 0);
-  std::vector<scalar_t> c_temp(c_size * batch_size, 0);
-
-  if constexpr (is_half) {
-    // Float-type variables for reference ops
-    ref_scalar_t alpha_f = alpha;
-    ref_scalar_t beta_f = beta;
-    std::vector<ref_scalar_t> a_f(a_size * batch_size);
-    std::vector<ref_scalar_t> b_f(b_size * batch_size);
-
-    // sycl::half to float reference type
-    std::transform(a.begin(), a.end(), a_f.begin(),
-                   [](scalar_t x) { return (static_cast<ref_scalar_t>(x)); });
-    std::transform(b.begin(), b.end(), b_f.begin(),
-                   [](scalar_t x) { return (static_cast<ref_scalar_t>(x)); });
-
-    // Reference batched gemm
-    for (int batch = 0; batch < batch_size; batch++) {
-      reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha_f,
-                           a_f.data() + batch * a_size, lda,
-                           b_f.data() + batch * b_size, ldb, beta_f,
-                           c_ref.data() + batch * c_size, ldc);
-    }
+  // Reference batched gemm
+  std::vector<scalar_t> c_ref = c;
+  for (int batch = 0; batch < batch_size; batch++) {
+    reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha,
+                         a.data() + batch * a_size, lda,
+                         b.data() + batch * b_size, ldb, beta,
+                         c_ref.data() + batch * c_size, ldc);
+  }
 
-    // Rocblas verification gemm_batched
+  // Rocblas verification
+  // gemm_batched
+  std::vector<scalar_t> c_temp = c;
+  {
     blas_benchmark::utils::HIPVectorBatched<rocm_scalar_t, true> c_temp_gpu(
         c_size, batch_size, reinterpret_cast<rocm_scalar_t*>(c_temp.data()));
     rocblas_gemm_batched_f<scalar_t>(
         rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha_rocm, a_batched_gpu,
         lda, b_batched_gpu, ldb, &beta_rocm, c_temp_gpu, ldc, batch_size);
-
-  } else {
-    // Reference batched gemm
-    for (int batch = 0; batch < batch_size; batch++) {
-      reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha,
-                           a.data() + batch * a_size, lda,
-                           b.data() + batch * b_size, ldb, beta,
-                           c_ref.data() + batch * c_size, ldc);
-    }
-
-    // Rocblas verification gemm_batched
-    blas_benchmark::utils::HIPVectorBatched<scalar_t, true> c_temp_gpu(
-        c_size, batch_size, c_temp.data());
-    rocblas_gemm_batched_f<scalar_t>(
-        rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha_rocm, a_batched_gpu,
-        lda, b_batched_gpu, ldb, &beta_rocm, c_temp_gpu, ldc, batch_size);
   }
 
   std::ostringstream err_stream;
diff --git a/benchmark/rocblas/blas3/gemm_batched_strided.cpp b/benchmark/rocblas/blas3/gemm_batched_strided.cpp
index 24077b73b..61526b246 100644
--- a/benchmark/rocblas/blas3/gemm_batched_strided.cpp
+++ b/benchmark/rocblas/blas3/gemm_batched_strided.cpp
@@ -36,13 +36,10 @@ static inline void rocblas_gemm_strided_batched(args_t&&... args) {
   } else if constexpr (std::is_same_v<scalar_t, double>) {
     CHECK_ROCBLAS_STATUS(
         rocblas_dgemm_strided_batched(std::forward<args_t>(args)...));
-  }
-#ifdef BLAS_ENABLE_HALF
-  else if constexpr (std::is_same_v<scalar_t, sycl::half>) {
+  } else if constexpr (std::is_same_v<scalar_t, sycl::half>) {
     CHECK_ROCBLAS_STATUS(
         rocblas_hgemm_strided_batched(std::forward<args_t>(args)...));
   }
-#endif
   return;
 }
 
@@ -65,9 +62,6 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i,
          int t_b_i, index_t m, index_t k, index_t n, scalar_t alpha,
          scalar_t beta, index_t batch_size, index_t stride_a_mul,
          index_t stride_b_mul, index_t stride_c_mul, bool* success) {
-  // scalar_t if scalar_t!=sycl::half, float otherwise
-  using ref_scalar_t =
-      typename blas_benchmark::utils::ReferenceType<scalar_t>::type;
   // scalar_t if scalar_t!=sycl::half, rocblas_half otherwise
   using rocm_scalar_t =
       typename blas_benchmark::utils::RocblasType<scalar_t>::type;
@@ -131,47 +125,21 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i,
   blas_benchmark::utils::HIPVectorBatchedStrided<rocm_scalar_t> c_batched_gpu(
       c_size, batch_size, stride_c, reinterpret_cast<rocm_scalar_t*>(c.data()));
 
-  constexpr const bool is_half = std::is_same_v<scalar_t, sycl::half>;
-
-  rocm_scalar_t alpha_rocm, beta_rocm;
-
-  if constexpr (is_half) {
-#ifdef BLAS_ENABLE_HALF
-    // sycl::half to rocblas__half
-    alpha_rocm = *reinterpret_cast<rocm_scalar_t*>(&alpha);
-    beta_rocm = *reinterpret_cast<rocm_scalar_t*>(&beta);
-  } else {
-#endif
-    alpha_rocm = alpha;
-    beta_rocm = beta;
-  }
-
+  rocm_scalar_t alpha_rocm = *reinterpret_cast<rocm_scalar_t*>(&alpha);
+  rocm_scalar_t beta_rocm = *reinterpret_cast<rocm_scalar_t*>(&beta);
 #ifdef BLAS_VERIFY_BENCHMARK
-  std::vector<ref_scalar_t> c_ref(size_c_batch, 0);
-  std::vector<scalar_t> c_temp(size_c_batch, 0);
-
-  if constexpr (is_half) {
-    // Float-type variables for reference ops
-    ref_scalar_t alpha_f = alpha;
-    ref_scalar_t beta_f = beta;
-    std::vector<ref_scalar_t> a_f(size_a_batch);
-    std::vector<ref_scalar_t> b_f(size_b_batch);
-
-    // sycl::half to float reference type
-    std::transform(a.begin(), a.end(), a_f.begin(),
-                   [](scalar_t x) { return (static_cast<ref_scalar_t>(x)); });
-    std::transform(b.begin(), b.end(), b_f.begin(),
-                   [](scalar_t x) { return (static_cast<ref_scalar_t>(x)); });
-
-    // Reference batched gemm
-    for (int batch = 0; batch < batch_size; batch++) {
-      reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha_f,
-                           a_f.data() + batch * stride_a, lda,
-                           b_f.data() + batch * stride_b, ldb, beta_f,
-                           c_ref.data() + batch * stride_c, ldc);
-    }
+  // Reference gemm batched strided (strided loop of gemm)
+  std::vector<scalar_t> c_ref = c;
+  for (int batch = 0; batch < batch_size; batch++) {
+    reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha,
+                         a.data() + batch * stride_a, lda,
+                         b.data() + batch * stride_b, ldb, beta,
+                         c_ref.data() + batch * stride_c, ldc);
+  }
 
-    // Rocblas verification gemm_batched_strided
+  // Rocblas verification gemm_batched_strided
+  std::vector<scalar_t> c_temp = c;
+  {
     blas_benchmark::utils::HIPVectorBatchedStrided<rocm_scalar_t, true>
         c_temp_gpu(c_size, batch_size, stride_c,
                    reinterpret_cast<rocm_scalar_t*>(c_temp.data()));
@@ -179,23 +147,6 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i,
         rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha_rocm, a_batched_gpu,
         lda, stride_a, b_batched_gpu, ldb, stride_b, &beta_rocm, c_temp_gpu,
         ldc, stride_c, batch_size);
-
-  } else {
-    // Reference batched gemm
-    for (int batch = 0; batch < batch_size; batch++) {
-      reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha,
-                           a.data() + batch * stride_a, lda,
-                           b.data() + batch * stride_b, ldb, beta,
-                           c_ref.data() + batch * stride_c, ldc);
-    }
-
-    // Rocblas verification gemm_batched_strided
-    blas_benchmark::utils::HIPVectorBatchedStrided<scalar_t, true> c_temp_gpu(
-        c_size, batch_size, stride_c, c_temp.data());
-    rocblas_gemm_strided_batched<scalar_t>(
-        rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha_rocm, a_batched_gpu,
-        lda, stride_a, b_batched_gpu, ldb, stride_b, &beta_rocm, c_temp_gpu,
-        ldc, stride_c, batch_size);
   }
 
   std::ostringstream err_stream;
diff --git a/benchmark/rocblas/utils.hpp b/benchmark/rocblas/utils.hpp
index 82af3eba7..adc0b3a9d 100644
--- a/benchmark/rocblas/utils.hpp
+++ b/benchmark/rocblas/utils.hpp
@@ -375,31 +375,20 @@ static inline std::tuple<double, double> timef_hip(function_t func,
 }
 
 /**
- * Reference type of the underlying tests data aimed to match the reference
- * library in tests/benchmarks and random number generator APIs.
+ * Reference type of the underlying benchmark data aimed to match the
+ * rocm/rocBLAS scalar types.
  */
 template <typename T, typename Enable = void>
 struct RocblasType {
   using type = T;
 };
 
-template <typename T, typename Enable = void>
-struct ReferenceType {
-  using type = T;
-};
-
 #ifdef BLAS_ENABLE_HALF
-// When T is sycl::half, use float as type for reference BLAS implementations.
+// When T is sycl::half, use rocBLAS's rocblas_half as type.
 template <typename T>
 struct RocblasType<T, std::enable_if_t<std::is_same_v<T, sycl::half>>> {
   using type = rocblas_half;
 };
-
-template <typename T>
-struct ReferenceType<T, std::enable_if_t<std::is_same_v<T, sycl::half>>> {
-  using type = float;
-};
-
 #endif
 }  // namespace utils
 }  // namespace blas_benchmark
diff --git a/test/blas_test.hpp b/test/blas_test.hpp
index 8928eea4c..5d0193518 100644
--- a/test/blas_test.hpp
+++ b/test/blas_test.hpp
@@ -121,7 +121,8 @@ static inline scalar_t random_scalar(scalar_t rangeMin, scalar_t rangeMax) {
   static std::random_device rd;
   static std::default_random_engine gen(rd());
   using random_scalar_t =
-      std::conditional_t<std::is_same_v<scalar_t, sycl::half>, float, scalar_t>;
+      std::conditional_t<std::is_same_v<scalar_t, sycl::half>, float,
+                         scalar_t>;
   std::uniform_real_distribution<random_scalar_t> dis(rangeMin, rangeMax);
   return dis(gen);
 }
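
Note for reviewers on the conversion pattern used throughout: every per-precision `is_half` branch above collapses into one unconditional `reinterpret_cast` because `sycl::half` and the backend half types (rocBLAS's `rocblas_half`, CUDA's `__half`) are all 16-bit storage types, and the `RocblasType` trait (and the analogous mapping that produces `cuda_scalar_t` on the cuBLAS side) resolves `scalar_t` to the matching backend type. Below is a minimal, self-contained sketch of that idea; the stand-in structs and the `BackendType`/`pass_scalar` names are hypothetical, used only so the sketch compiles without the SYCL/rocBLAS headers:

#include <cstdint>
#include <type_traits>

// Hypothetical stand-ins for sycl::half and rocblas_half (the real types
// live in the SYCL and rocBLAS headers). Both are plain 16-bit storage
// types, which is the property the patch relies on.
struct half_standin { std::uint16_t bits; };
struct rocblas_half_standin { std::uint16_t data; };

// Same shape as the RocblasType trait in benchmark/rocblas/utils.hpp:
// identity by default, the backend half type when T is the half type.
template <typename T, typename Enable = void>
struct BackendType {
  using type = T;
};

template <typename T>
struct BackendType<T, std::enable_if_t<std::is_same_v<T, half_standin>>> {
  using type = rocblas_half_standin;
};

template <typename scalar_t>
void pass_scalar(scalar_t alpha) {
  using backend_scalar_t = typename BackendType<scalar_t>::type;
  // The cast is only meaningful because the two types have identical size
  // (and, in the real code, identical bit layout).
  static_assert(sizeof(backend_scalar_t) == sizeof(scalar_t));
  // One unconditional conversion instead of an if constexpr (is_half)
  // branch: for float/double this reinterprets a type as itself; for half
  // it reinterprets the 16 bits as the backend half type.
  backend_scalar_t alpha_backend =
      *reinterpret_cast<backend_scalar_t*>(&alpha);
  (void)alpha_backend;  // would be passed to the backend GEMM call
}

int main() {
  pass_scalar(1.0f);                  // backend_scalar_t == float
  pass_scalar(half_standin{0x3C00});  // 1.0 in binary16 -> backend half type
}

The `static_assert` makes the same-width assumption explicit. Strictly speaking, a `std::memcpy` would sidestep strict-aliasing concerns for the half case, but the pointer cast mirrors what the benchmarks actually do; the device-side buffers (`HIPVector` and friends) have their element pointers converted the same way.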