diff --git a/CMakeLists.txt b/CMakeLists.txt index 21340430c..0b0143e64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,7 +106,14 @@ if(IMGDNN_DIR) endif() option(BLAS_ENABLE_EXTENSIONS "Whether to enable portBLAS extensions" ON) -option(BLAS_ENABLE_COMPLEX "Whether to enable complex data type for supported operators" ON) +option(BLAS_ENABLE_COMPLEX "Whether to enable complex data type for GEMM" OFF) +option(BLAS_ENABLE_HALF "Whether to enable sycl::half data type for supported operators" OFF) + +if(((NOT INSTALL_HEADER_ONLY) AND (TUNING_TARGET STREQUAL "DEFAULT_CPU")) + OR (INSTALL_HEADER_ONLY AND (NOT TUNING_TARGET))) + set(BLAS_ENABLE_HALF OFF) + message(STATUS "FP16 operations are not supported for CPU targets. BLAS_ENABLE_HALF is disabled") +endif() # CmakeFunctionHelper has to be included after any options that it depends on are declared. # These include: @@ -117,6 +124,8 @@ option(BLAS_ENABLE_COMPLEX "Whether to enable complex data type for supported op # * BLAS_INDEX_TYPES # * NAIVE_GEMM # * BLAS_ENABLE_COMPLEX +# * BLAS_ENABLE_HALF + include(CmakeFunctionHelper) if (INSTALL_HEADER_ONLY) diff --git a/README.md b/README.md index bba1a6bc0..5186b98d0 100644 --- a/README.md +++ b/README.md @@ -462,9 +462,10 @@ Some of the supported options are: | `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` | Determines whether to enable the scratchpad memory pool for benchmark execution. `OFF` by default | | `BLAS_ENABLE_CONST_INPUT` | `ON`/`OFF` | Determines whether to enable kernel instantiation with const input buffer (`ON` by default) | | `BLAS_ENABLE_EXTENSIONS` | `ON`/`OFF` | Determines whether to enable portBLAS extensions (`ON` by default) | -| `BLAS_DATA_TYPES` | `half;float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float` | +| `BLAS_DATA_TYPES` | `float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float` | | `BLAS_INDEX_TYPES` | `int32_t;int64_t` | Determines the type(s) to use for `index_t` and `increment_t`. Default is `int` | -| `BLAS_ENABLE_COMPLEX` | `ON`/`OFF` | Determines whether to enable Complex data type support *(GEMM Operators only)* (`ON` by default) | +| `BLAS_ENABLE_COMPLEX` | `ON`/`OFF` | Determines whether to enable Complex data type support *(GEMM Operators only)* (`OFF` by default) | +| `BLAS_ENABLE_HALF` | `ON`/`OFF` | Determines whether to enable Half data type support *(Support is limited to some Level 1 operators and GEMM)* (`OFF` by default) | ## ComputeCpp Compilation *(Deprecated)*
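Since `BLAS_ENABLE_HALF` is `OFF` by default (and forced off again for CPU tuning targets by the guard in the CMakeLists hunk above), it has to be requested explicitly at configure time. A minimal sketch of a configure invocation; the tuning target is a placeholder for whichever backend you build for:

```bash
# Illustrative configure line: enables the half-precision operators added by
# this patch. BLAS_ENABLE_HALF and TUNING_TARGET are the options shown in the
# diff above; other flags follow your usual portBLAS setup.
cmake -B build -DTUNING_TARGET=NVIDIA_GPU -DBLAS_ENABLE_HALF=ON .
cmake --build build
```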
diff --git a/benchmark/cublas/CMakeLists.txt b/benchmark/cublas/CMakeLists.txt index ad3b4ed05..55c6bfa95 100644 --- a/benchmark/cublas/CMakeLists.txt +++ b/benchmark/cublas/CMakeLists.txt @@ -75,7 +75,15 @@ set(sources ) # Operators supporting COMPLEX types benchmarking -set(CPLX_OPS "gemm" "gemm_batched" "gemm_batched_strided") +set(CPLX_OPS "gemm" + "gemm_batched" + "gemm_batched_strided") + +# Operators supporting HALF type benchmarking +set(HALF_DATA_OPS "gemm" + "gemm_batched" + "gemm_batched_strided" +) # Add individual benchmarks for each method foreach(cublas_bench ${sources}) @@ -83,10 +91,11 @@ foreach(cublas_bench ${sources}) add_executable(bench_cublas_${bench_cublas_exec} ${cublas_bench} main.cpp) target_link_libraries(bench_cublas_${bench_cublas_exec} PRIVATE benchmark CUDA::toolkit CUDA::cublas CUDA::cudart portblas Clara::Clara bench_info) target_compile_definitions(bench_cublas_${bench_cublas_exec} PRIVATE -DBLAS_INDEX_T=${BLAS_BENCHMARK_INDEX_TYPE}) - if(${BLAS_ENABLE_COMPLEX}) - if("${bench_cublas_exec}" IN_LIST CPLX_OPS) - target_compile_definitions(bench_cublas_${bench_cublas_exec} PRIVATE BLAS_ENABLE_COMPLEX=1) - endif() + if((${BLAS_ENABLE_COMPLEX}) AND ("${bench_cublas_exec}" IN_LIST CPLX_OPS)) + target_compile_definitions(bench_cublas_${bench_cublas_exec} PRIVATE BLAS_ENABLE_COMPLEX=1) + endif() + if((${BLAS_ENABLE_HALF}) AND ("${bench_cublas_exec}" IN_LIST HALF_DATA_OPS)) + target_compile_definitions(bench_cublas_${bench_cublas_exec} PRIVATE BLAS_ENABLE_HALF=1) endif() add_sycl_to_target( TARGET bench_cublas_${bench_cublas_exec} diff --git a/benchmark/cublas/blas3/gemm.cpp b/benchmark/cublas/blas3/gemm.cpp index c74c9e98e..61f044c03 100644 --- a/benchmark/cublas/blas3/gemm.cpp +++ b/benchmark/cublas/blas3/gemm.cpp @@ -34,6 +34,8 @@ static inline void cublas_routine(args_t&&...
args) { CUBLAS_CHECK(cublasSgemm(std::forward(args)...)); } else if constexpr (std::is_same_v) { CUBLAS_CHECK(cublasDgemm(std::forward(args)...)); + } else if constexpr (std::is_same_v) { + CUBLAS_CHECK(cublasHgemm(std::forward(args)...)); } return; } @@ -54,6 +56,10 @@ template void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, int t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta, bool* success) { + // scalar_t if scalar_t!=sycl::half, cuda::__half otherwise + using cuda_scalar_t = + typename blas_benchmark::utils::CudaType::type; + // initialize the state label blas_benchmark::utils::set_benchmark_label(state); @@ -80,13 +86,19 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, std::vector c = blas_benchmark::utils::const_data(m * n, 0); - blas_benchmark::utils::CUDAVector a_gpu(m * k, a.data()); - blas_benchmark::utils::CUDAVector b_gpu(k * n, b.data()); - blas_benchmark::utils::CUDAVector c_gpu(n * m, c.data()); + blas_benchmark::utils::CUDAVector a_gpu( + m * k, reinterpret_cast(a.data())); + blas_benchmark::utils::CUDAVector b_gpu( + k * n, reinterpret_cast(b.data())); + blas_benchmark::utils::CUDAVector c_gpu( + n * m, reinterpret_cast(c.data())); cublasOperation_t c_t_a = (*t_a == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t c_t_b = (*t_b == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T; + cuda_scalar_t alpha_cuda = *reinterpret_cast(&alpha); + cuda_scalar_t beta_cuda = *reinterpret_cast(&beta); + #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results std::vector c_ref = c; @@ -94,10 +106,11 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, beta, c_ref.data(), ldc); std::vector c_temp = c; { - blas_benchmark::utils::CUDAVector c_temp_gpu(m * n, - c_temp.data()); - cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, a_gpu, - lda, b_gpu, ldb, &beta, c_temp_gpu, ldc); + blas_benchmark::utils::CUDAVector c_temp_gpu( + m * n, reinterpret_cast(c_temp.data())); + cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda, + a_gpu, lda, b_gpu, ldb, &beta_cuda, c_temp_gpu, + ldc); } std::ostringstream err_stream; @@ -107,9 +120,10 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, *success = false; }; #endif + auto blas_warmup = [&]() -> void { - cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, a_gpu, - lda, b_gpu, ldb, &beta, c_gpu, ldc); + cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda, + a_gpu, lda, b_gpu, ldb, &beta_cuda, c_gpu, ldc); return; }; @@ -120,8 +134,8 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, auto blas_method_def = [&]() -> std::vector { CUDA_CHECK(cudaEventRecord(start)); - cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, a_gpu, - lda, b_gpu, ldb, &beta, c_gpu, ldc); + cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda, + a_gpu, lda, b_gpu, ldb, &beta_cuda, c_gpu, ldc); CUDA_CHECK(cudaEventRecord(stop)); CUDA_CHECK(cudaEventSynchronize(stop)); return std::vector{start, stop}; diff --git a/benchmark/cublas/blas3/gemm_batched.cpp b/benchmark/cublas/blas3/gemm_batched.cpp index c0c50631f..d1a4e3ae2 100644 --- a/benchmark/cublas/blas3/gemm_batched.cpp +++ b/benchmark/cublas/blas3/gemm_batched.cpp @@ -34,6 +34,8 @@ static inline void cublas_routine(args_t&&... 
args) { CUBLAS_CHECK(cublasSgemmBatched(std::forward(args)...)); } else if constexpr (std::is_same_v) { CUBLAS_CHECK(cublasDgemmBatched(std::forward(args)...)); + } else if constexpr (std::is_same_v) { + CUBLAS_CHECK(cublasHgemmBatched(std::forward(args)...)); } return; } @@ -54,6 +56,10 @@ template void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1, index_t t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta, index_t batch_count, int batch_type_i, bool* success) { + // scalar_t if scalar_t!=sycl::half, cuda::__half otherwise + using cuda_scalar_t = + typename blas_benchmark::utils::CudaType::type; + // initialize the state label blas_benchmark::utils::set_benchmark_label(state); @@ -84,17 +90,19 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1, std::vector c = blas_benchmark::utils::const_data(m * n * batch_count, 0); - blas_benchmark::utils::CUDAVectorBatched d_A_array(m * k, - batch_count, a); - blas_benchmark::utils::CUDAVectorBatched d_B_array(k * n, - batch_count, b); - blas_benchmark::utils::CUDAVectorBatched d_C_array(m * n, - batch_count); + blas_benchmark::utils::CUDAVectorBatched d_A_array( + m * k, batch_count, reinterpret_cast(a.data())); + blas_benchmark::utils::CUDAVectorBatched d_B_array( + k * n, batch_count, reinterpret_cast(b.data())); + blas_benchmark::utils::CUDAVectorBatched d_C_array( + m * n, batch_count); cublasOperation_t c_t_a = (*t_a == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t c_t_b = (*t_b == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T; + cuda_scalar_t alpha_cuda = *reinterpret_cast(&alpha); + cuda_scalar_t beta_cuda = *reinterpret_cast(&beta); + #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results { @@ -110,13 +118,12 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1, } std::vector c_temp(m * n * batch_count); - { - blas_benchmark::utils::CUDAVectorBatched c_temp_gpu( - n * m, batch_count, c_temp); - cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, + blas_benchmark::utils::CUDAVectorBatched c_temp_gpu( + n * m, batch_count, reinterpret_cast(c_temp.data())); + cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda, d_A_array.get_batch_array(), lda, - d_B_array.get_batch_array(), ldb, &beta, + d_B_array.get_batch_array(), ldb, &beta_cuda, c_temp_gpu.get_batch_array(), ldc, batch_count); } @@ -128,14 +135,13 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1, *success = false; }; } - } // close scope for verify benchmark #endif auto blas_warmup = [&]() -> void { - cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, + cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda, d_A_array.get_batch_array(), lda, - d_B_array.get_batch_array(), ldb, &beta, + d_B_array.get_batch_array(), ldb, &beta_cuda, d_C_array.get_batch_array(), ldc, batch_count); return; }; @@ -146,9 +152,9 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1, auto blas_method_def = [&]() -> std::vector { CUDA_CHECK(cudaEventRecord(start)); - cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, + cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda, d_A_array.get_batch_array(), lda, - d_B_array.get_batch_array(), ldb, &beta, + d_B_array.get_batch_array(), ldb, &beta_cuda, d_C_array.get_batch_array(), ldc, batch_count); CUDA_CHECK(cudaEventRecord(stop)); CUDA_CHECK(cudaEventSynchronize(stop)); diff --git a/benchmark/cublas/blas3/gemm_batched_strided.cpp 
b/benchmark/cublas/blas3/gemm_batched_strided.cpp index beb81fb4c..846fd7806 100644 --- a/benchmark/cublas/blas3/gemm_batched_strided.cpp +++ b/benchmark/cublas/blas3/gemm_batched_strided.cpp @@ -34,6 +34,8 @@ static inline void cublas_routine(args_t&&... args) { CUBLAS_CHECK(cublasSgemmStridedBatched(std::forward(args)...)); } else if constexpr (std::is_same_v) { CUBLAS_CHECK(cublasDgemmStridedBatched(std::forward(args)...)); + } else if constexpr (std::is_same_v) { + CUBLAS_CHECK(cublasHgemmStridedBatched(std::forward(args)...)); } return; } @@ -55,6 +57,10 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, int t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta, index_t batch_size, index_t stride_a_mul, index_t stride_b_mul, index_t stride_c_mul, bool* success) { + // scalar_t if scalar_t!=sycl::half, cuda::__half otherwise + using cuda_scalar_t = + typename blas_benchmark::utils::CudaType::type; + // initialize the state label blas_benchmark::utils::set_benchmark_label(state); @@ -103,14 +109,19 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, std::vector c = blas_benchmark::utils::const_data(size_c_batch, 0); - blas_benchmark::utils::CUDAVector a_gpu(size_a_batch, a.data()); - blas_benchmark::utils::CUDAVector b_gpu(size_b_batch, b.data()); - blas_benchmark::utils::CUDAVector c_gpu(size_c_batch, c.data()); + blas_benchmark::utils::CUDAVector a_gpu( + size_a_batch, reinterpret_cast(a.data())); + blas_benchmark::utils::CUDAVector b_gpu( + size_b_batch, reinterpret_cast(b.data())); + blas_benchmark::utils::CUDAVector c_gpu( + size_c_batch, reinterpret_cast(c.data())); cublasOperation_t c_t_a = trA ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t c_t_b = trB ? CUBLAS_OP_N : CUBLAS_OP_T; + cuda_scalar_t alpha_cuda = *reinterpret_cast(&alpha); + cuda_scalar_t beta_cuda = *reinterpret_cast(&beta); + #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results std::vector c_ref = c; @@ -123,11 +134,11 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, std::vector c_temp = c; { - blas_benchmark::utils::CUDAVector c_temp_gpu(size_c_batch, - c_temp.data()); - cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, a_gpu, - lda, stride_a, b_gpu, ldb, stride_b, &beta, - c_temp_gpu, ldc, stride_c, batch_size); + blas_benchmark::utils::CUDAVector c_temp_gpu( + size_c_batch, reinterpret_cast(c_temp.data())); + cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda, + a_gpu, lda, stride_a, b_gpu, ldb, stride_b, + &beta_cuda, c_temp_gpu, ldc, stride_c, batch_size); } std::ostringstream err_stream; @@ -140,9 +151,9 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, #endif auto blas_warmup = [&]() -> void { - cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, a_gpu, - lda, stride_a, b_gpu, ldb, stride_b, &beta, c_gpu, - ldc, stride_c, batch_size); + cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda, + a_gpu, lda, stride_a, b_gpu, ldb, stride_b, + &beta_cuda, c_gpu, ldc, stride_c, batch_size); return; }; @@ -152,9 +163,9 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, auto blas_method_def = [&]() -> std::vector { CUDA_CHECK(cudaEventRecord(start)); - cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, a_gpu, - lda, stride_a, b_gpu, ldb, stride_b, &beta, c_gpu, - ldc, stride_c, batch_size); + cublas_routine(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda, + a_gpu, lda, stride_a, b_gpu, ldb, 
stride_b, + &beta_cuda, c_gpu, ldc, stride_c, batch_size); CUDA_CHECK(cudaEventRecord(stop)); CUDA_CHECK(cudaEventSynchronize(stop)); return std::vector<cudaEvent_t>{start, stop}; diff --git a/benchmark/cublas/utils.hpp b/benchmark/cublas/utils.hpp index 362fdce51..c658caff4 100644 --- a/benchmark/cublas/utils.hpp +++ b/benchmark/cublas/utils.hpp @@ -36,7 +36,9 @@ #include #include #include +#include #include + // Forward declare methods that we use in `benchmark.cpp`, but define in // `main.cpp` @@ -274,6 +276,21 @@ static inline std::tuple<double, double> timef_cuda(function_t func, return std::make_tuple(overall_time, static_cast<double>(elapsed_time) * 1E6); } +/** + * Reference type of the underlying benchmark data, aimed to match the + * cuda/cuBLAS scalar types. + */ +template <typename T, typename Enable = void> +struct CudaType { + using type = T; +}; + +// When T is sycl::half, use CUDA's __half as the type. +template <typename T> +struct CudaType<T, std::enable_if_t<std::is_same_v<T, sycl::half>>> { + using type = __half; +}; + } // namespace utils } // namespace blas_benchmark
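The `CudaType` trait above keeps the benchmark templates uniform: every scalar type maps to itself except `sycl::half`, which maps to cuBLAS's `__half` so scalars and device pointers can be passed straight through (the rocBLAS benchmarks add an analogous `RocblasType` mapping `sycl::half` to `rocblas_half`). A self-contained sketch of the same pattern, using dummy stand-in types so it compiles without CUDA or SYCL headers:

```cpp
#include <type_traits>

struct dummy_half {};  // stand-in for CUDA's __half in this sketch
struct sycl_half {};   // stand-in for sycl::half

// Identity mapping by default...
template <typename T, typename Enable = void>
struct CudaType {
  using type = T;
};

// ...specialised so the SYCL half type maps onto the cuBLAS half type.
template <typename T>
struct CudaType<T, std::enable_if_t<std::is_same_v<T, sycl_half>>> {
  using type = dummy_half;
};

static_assert(std::is_same_v<CudaType<float>::type, float>);
static_assert(std::is_same_v<CudaType<sycl_half>::type, dummy_half>);
```

The `reinterpret_cast`s in the gemm benchmarks rely on `sycl::half` and `__half` sharing the same 16-bit layout, which is why this mapping can be a pure type-level translation with no data conversion.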
diff --git a/benchmark/portblas/CMakeLists.txt b/benchmark/portblas/CMakeLists.txt index 8cb498ad6..daaeb4674 100644 --- a/benchmark/portblas/CMakeLists.txt +++ b/benchmark/portblas/CMakeLists.txt @@ -77,7 +77,17 @@ if(${BLAS_ENABLE_EXTENSIONS}) endif() # Operators supporting COMPLEX types benchmarking -set(CPLX_OPS "gemm" "gemm_batched" "gemm_batched_strided") +set(CPLX_OPS "gemm" + "gemm_batched" + "gemm_batched_strided") + +# Operators supporting HALF type benchmarking +set(HALF_DATA_OPS "axpy" + "scal" + "gemm" + "gemm_batched" + "gemm_batched_strided" + ) # Add individual benchmarks for each method foreach(portblas_bench ${sources}) @@ -85,10 +95,11 @@ foreach(portblas_bench ${sources}) add_executable(bench_${bench_exec} ${portblas_bench} main.cpp) target_link_libraries(bench_${bench_exec} PRIVATE benchmark Clara::Clara portblas bench_info) target_compile_definitions(bench_${bench_exec} PRIVATE -DBLAS_INDEX_T=${BLAS_BENCHMARK_INDEX_TYPE}) - if(${BLAS_ENABLE_COMPLEX}) - if("${bench_exec}" IN_LIST CPLX_OPS) - target_compile_definitions(bench_${bench_exec} PRIVATE BLAS_ENABLE_COMPLEX=1) - endif() + if((${BLAS_ENABLE_COMPLEX}) AND ("${bench_exec}" IN_LIST CPLX_OPS)) + target_compile_definitions(bench_${bench_exec} PRIVATE BLAS_ENABLE_COMPLEX=1) + endif() + if((${BLAS_ENABLE_HALF}) AND ("${bench_exec}" IN_LIST HALF_DATA_OPS)) + target_compile_definitions(bench_${bench_exec} PRIVATE BLAS_ENABLE_HALF=1) endif() add_sycl_to_target( TARGET bench_${bench_exec} diff --git a/benchmark/portblas/blas1/axpy.cpp b/benchmark/portblas/blas1/axpy.cpp index 6b24b2a60..aee6cca52 100644 --- a/benchmark/portblas/blas1/axpy.cpp +++ b/benchmark/portblas/blas1/axpy.cpp @@ -42,6 +42,11 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, blas::SB_Handle& sb_handle = *sb_handle_ptr; auto q = sb_handle.get_queue(); + if (std::is_same_v<scalar_t, cl::sycl::half> && + !q.get_device().has(cl::sycl::aspect::fp16)) { + state.SkipWithError("Unsupported fp16 (half) on this device."); + } + // Create data std::vector v1 = blas_benchmark::utils::random_data(size); std::vector v2 = blas_benchmark::utils::random_data(size); diff --git a/benchmark/portblas/blas1/scal.cpp b/benchmark/portblas/blas1/scal.cpp index f05ef7145..e9c8b6646 100644 --- a/benchmark/portblas/blas1/scal.cpp +++ b/benchmark/portblas/blas1/scal.cpp @@ -42,6 +42,11 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, blas::SB_Handle& sb_handle = *sb_handle_ptr; auto q = sb_handle.get_queue(); + if (std::is_same_v<scalar_t, cl::sycl::half> && + !q.get_device().has(cl::sycl::aspect::fp16)) { + state.SkipWithError("Unsupported fp16 (half) on this device."); + } + // Create data std::vector v1 = blas_benchmark::utils::random_data(size); auto alpha = blas_benchmark::utils::random_scalar(); @@ -120,8 +125,8 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, run(st, sb_handle_ptr, size, success); }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name( - size, mem_type).c_str(), + blas_benchmark::utils::get_name(size, mem_type) + .c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } @@ -133,7 +138,8 @@ void register_benchmark(blas_benchmark::Args& args, auto scal_params = blas_benchmark::utils::get_blas1_params(args); register_benchmark( - sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, scal_params); + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, + scal_params); #ifdef SB_ENABLE_USM register_benchmark( sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, scal_params); diff --git a/benchmark/portblas/blas3/gemm.cpp b/benchmark/portblas/blas3/gemm.cpp index 27bb90650..12352bcb3 100644 --- a/benchmark/portblas/blas3/gemm.cpp +++ b/benchmark/portblas/blas3/gemm.cpp @@ -55,6 +55,11 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, blas::SB_Handle& sb_handle = *sb_handle_ptr; auto q = sb_handle.get_queue(); + if (std::is_same_v<scalar_t, cl::sycl::half> && + !q.get_device().has(cl::sycl::aspect::fp16)) { + state.SkipWithError("Unsupported fp16 (half) on this device."); + } + // Matrices std::vector a = blas_benchmark::utils::random_data(m * k); std::vector b = blas_benchmark::utils::random_data(k * n); diff --git a/benchmark/portblas/blas3/gemm_batched.cpp b/benchmark/portblas/blas3/gemm_batched.cpp index aabd9449a..21a7c47a6 100644 --- a/benchmark/portblas/blas3/gemm_batched.cpp +++ b/benchmark/portblas/blas3/gemm_batched.cpp @@ -90,6 +90,11 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, blas::SB_Handle& sb_handle = *sb_handle_ptr; auto q = sb_handle.get_queue(); + if (std::is_same_v<scalar_t, cl::sycl::half> && + !q.get_device().has(cl::sycl::aspect::fp16)) { + state.SkipWithError("Unsupported fp16 (half) on this device."); + } + // Matrices std::vector a = blas_benchmark::utils::random_data(m * k * batch_size); diff --git a/benchmark/portblas/blas3/gemm_batched_strided.cpp b/benchmark/portblas/blas3/gemm_batched_strided.cpp index a24a2a188..eb76e01f7 100644 --- a/benchmark/portblas/blas3/gemm_batched_strided.cpp +++ b/benchmark/portblas/blas3/gemm_batched_strided.cpp @@ -60,6 +60,11 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, blas::SB_Handle& sb_handle = *sb_handle_ptr; auto q = sb_handle.get_queue(); + if (std::is_same_v<scalar_t, cl::sycl::half> && + !q.get_device().has(cl::sycl::aspect::fp16)) { + state.SkipWithError("Unsupported fp16 (half) on this device."); + } + // Data sizes // Elementary matrices const index_t a_size = m * k; diff --git a/benchmark/portblas/utils.hpp b/benchmark/portblas/utils.hpp index b16bc78b6..97f84409c 100644 --- a/benchmark/portblas/utils.hpp +++ b/benchmark/portblas/utils.hpp @@ -39,7 +39,8 @@ namespace blas_benchmark { // Forward-declaring the function that will create the benchmark -void create_benchmark(Args& args, blas::SB_Handle* sb_handle_ptr, bool* success); +void create_benchmark(Args& args, blas::SB_Handle* sb_handle_ptr, + bool* success); namespace utils { diff --git a/benchmark/rocblas/CMakeLists.txt b/benchmark/rocblas/CMakeLists.txt index 8332590c4..770ba889d 100644 --- a/benchmark/rocblas/CMakeLists.txt +++
b/benchmark/rocblas/CMakeLists.txt @@ -78,7 +78,15 @@ set(sources ) # Operators supporting COMPLEX types benchmarking -set(CPLX_OPS "gemm" "gemm_batched" "gemm_batched_strided") +set(CPLX_OPS "gemm" + "gemm_batched" + "gemm_batched_strided") + +# Operators supporting HALF type benchmarking +set(HALF_DATA_OPS "gemm" + "gemm_batched" + "gemm_batched_strided" +) # Add individual benchmarks for each method foreach(rocblas_benchmark ${sources}) @@ -87,10 +95,12 @@ foreach(rocblas_benchmark ${sources}) target_link_libraries(bench_rocblas_${rocblas_bench_exec} PRIVATE benchmark Clara::Clara roc::rocblas bench_info) target_compile_definitions(bench_rocblas_${rocblas_bench_exec} PRIVATE -DBLAS_INDEX_T=${BLAS_BENCHMARK_INDEX_TYPE}) target_include_directories(bench_rocblas_${rocblas_bench_exec} PRIVATE ${PORTBLAS_INCLUDE} ${rocblas_INCLUDE_DIRS} ${CBLAS_INCLUDE} ${BLAS_BENCH} ${PORTBLAS_COMMON_INCLUDE_DIR}) - if(${BLAS_ENABLE_COMPLEX}) - if("${rocblas_bench_exec}" IN_LIST CPLX_OPS) - target_compile_definitions(bench_rocblas_${rocblas_bench_exec} PRIVATE BLAS_ENABLE_COMPLEX=1) - endif() + if((${BLAS_ENABLE_COMPLEX}) AND ("${rocblas_bench_exec}" IN_LIST CPLX_OPS)) + target_compile_definitions(bench_rocblas_${rocblas_bench_exec} PRIVATE BLAS_ENABLE_COMPLEX=1) + endif() + + if((${BLAS_ENABLE_HALF}) AND ("${rocblas_bench_exec}" IN_LIST HALF_DATA_OPS)) + target_compile_definitions(bench_rocblas_${rocblas_bench_exec} PRIVATE BLAS_ENABLE_HALF=1) endif() # Even though rocblas does not use sycl, the common tools indirectly include sycl headers. add_sycl_to_target( diff --git a/benchmark/rocblas/blas3/gemm.cpp b/benchmark/rocblas/blas3/gemm.cpp index ca07ba2ba..8868f44f7 100644 --- a/benchmark/rocblas/blas3/gemm.cpp +++ b/benchmark/rocblas/blas3/gemm.cpp @@ -34,6 +34,8 @@ static inline void rocblas_gemm_f(args_t&&... 
args) { CHECK_ROCBLAS_STATUS(rocblas_sgemm(std::forward(args)...)); } else if constexpr (std::is_same_v) { CHECK_ROCBLAS_STATUS(rocblas_dgemm(std::forward(args)...)); + } else if constexpr (std::is_same_v) { + CHECK_ROCBLAS_STATUS(rocblas_hgemm(std::forward(args)...)); } return; } @@ -54,6 +56,10 @@ template void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i, int t_b_i, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta, bool* success) { + // scalar_t if scalar_t!=sycl::half, rocblas_half otherwise + using rocm_scalar_t = + typename blas_benchmark::utils::RocblasType::type; + // initialize the state label blas_benchmark::utils::set_benchmark_label(state); @@ -91,81 +97,85 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i, std::vector c = blas_benchmark::utils::const_data(c_size, 0); - { - // Device memory allocation & H2D copy - blas_benchmark::utils::HIPVector a_gpu(a_size, a.data()); - blas_benchmark::utils::HIPVector b_gpu(b_size, b.data()); - blas_benchmark::utils::HIPVector c_gpu(c_size, c.data()); + // Device memory allocation & H2D copy + blas_benchmark::utils::HIPVector a_gpu( + a_size, reinterpret_cast(a.data())); + blas_benchmark::utils::HIPVector b_gpu( + b_size, reinterpret_cast(b.data())); + blas_benchmark::utils::HIPVector c_gpu( + c_size, reinterpret_cast(c.data())); + + rocm_scalar_t alpha_rocm = *reinterpret_cast(&alpha); + rocm_scalar_t beta_rocm = *reinterpret_cast(&beta); #ifdef BLAS_VERIFY_BENCHMARK - // Reference gemm - std::vector c_ref = c; - reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha, a.data(), lda, - b.data(), ldb, beta, c_ref.data(), ldc); + // Reference gemm + std::vector c_ref = c; + reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha, a.data(), lda, + b.data(), ldb, beta, c_ref.data(), ldc); - // Rocblas verification gemm - std::vector c_temp = c; - { - blas_benchmark::utils::HIPVector c_temp_gpu( - c_size, c_temp.data()); - rocblas_gemm_f(rb_handle, trans_a_rb, trans_b_rb, m, n, k, - &alpha, a_gpu, lda, b_gpu, ldb, &beta, - c_temp_gpu, ldc); - } + // Rocblas verification gemm + std::vector c_temp = c; + { + blas_benchmark::utils::HIPVector c_temp_gpu( + c_size, reinterpret_cast(c_temp.data())); + rocblas_gemm_f(rb_handle, trans_a_rb, trans_b_rb, m, n, k, + &alpha_rocm, a_gpu, lda, b_gpu, ldb, &beta_rocm, + c_temp_gpu, ldc); + } - std::ostringstream err_stream; - if (!utils::compare_vectors(c_temp, c_ref, err_stream, "")) { - const std::string& err_str = err_stream.str(); - state.SkipWithError(err_str.c_str()); - *success = false; - }; + std::ostringstream err_stream; + if (!utils::compare_vectors(c_temp, c_ref, err_stream, "")) { + const std::string& err_str = err_stream.str(); + state.SkipWithError(err_str.c_str()); + *success = false; + }; #endif - auto blas_warmup = [&]() -> void { - rocblas_gemm_f(rb_handle, trans_a_rb, trans_b_rb, m, n, k, - &alpha, a_gpu, lda, b_gpu, ldb, &beta, c_gpu, - ldc); - return; - }; - - hipEvent_t start, stop; - CHECK_HIP_ERROR(hipEventCreate(&start)); - CHECK_HIP_ERROR(hipEventCreate(&stop)); - - auto blas_method_def = [&]() -> std::vector { - CHECK_HIP_ERROR(hipEventRecord(start, NULL)); - rocblas_gemm_f(rb_handle, trans_a_rb, trans_b_rb, m, n, k, - &alpha, a_gpu, lda, b_gpu, ldb, &beta, c_gpu, - ldc); - CHECK_HIP_ERROR(hipEventRecord(stop, NULL)); - CHECK_HIP_ERROR(hipEventSynchronize(stop)); - return std::vector{start, stop}; - }; - - // Warmup - blas_benchmark::utils::warmup(blas_warmup); - CHECK_HIP_ERROR(hipStreamSynchronize(NULL)); - - 
blas_benchmark::utils::init_counters(state); - - // Measure - for (auto _ : state) { - // Run - std::tuple times = - blas_benchmark::utils::timef_hip(blas_method_def); + auto blas_warmup = [&]() -> void { + rocblas_gemm_f(rb_handle, trans_a_rb, trans_b_rb, m, n, k, + &alpha_rocm, a_gpu, lda, b_gpu, ldb, &beta_rocm, + c_gpu, ldc); + return; + }; + + hipEvent_t start, stop; + CHECK_HIP_ERROR(hipEventCreate(&start)); + CHECK_HIP_ERROR(hipEventCreate(&stop)); + + auto blas_method_def = [&]() -> std::vector { + CHECK_HIP_ERROR(hipEventRecord(start, NULL)); + rocblas_gemm_f(rb_handle, trans_a_rb, trans_b_rb, m, n, k, + &alpha_rocm, a_gpu, lda, b_gpu, ldb, &beta_rocm, + c_gpu, ldc); + CHECK_HIP_ERROR(hipEventRecord(stop, NULL)); + CHECK_HIP_ERROR(hipEventSynchronize(stop)); + return std::vector{start, stop}; + }; + + // Warmup + blas_benchmark::utils::warmup(blas_warmup); + CHECK_HIP_ERROR(hipStreamSynchronize(NULL)); + + blas_benchmark::utils::init_counters(state); + + // Measure + for (auto _ : state) { + // Run + std::tuple times = + blas_benchmark::utils::timef_hip(blas_method_def); + + // Report + blas_benchmark::utils::update_counters(state, times); + } - // Report - blas_benchmark::utils::update_counters(state, times); - } + state.SetBytesProcessed(state.iterations() * + state.counters["bytes_processed"]); + state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]); - state.SetBytesProcessed(state.iterations() * - state.counters["bytes_processed"]); - state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]); - - blas_benchmark::utils::calc_avg_counters(state); + blas_benchmark::utils::calc_avg_counters(state); - CHECK_HIP_ERROR(hipEventDestroy(start)); - CHECK_HIP_ERROR(hipEventDestroy(stop)); - } // release device memory via utils::DeviceVector destructors + CHECK_HIP_ERROR(hipEventDestroy(start)); + CHECK_HIP_ERROR(hipEventDestroy(stop)); }; template diff --git a/benchmark/rocblas/blas3/gemm_batched.cpp b/benchmark/rocblas/blas3/gemm_batched.cpp index 40147d5ff..f73c176cf 100644 --- a/benchmark/rocblas/blas3/gemm_batched.cpp +++ b/benchmark/rocblas/blas3/gemm_batched.cpp @@ -34,6 +34,8 @@ static inline void rocblas_gemm_batched_f(args_t&&... 
args) { CHECK_ROCBLAS_STATUS(rocblas_sgemm_batched(std::forward(args)...)); } else if constexpr (std::is_same_v) { CHECK_ROCBLAS_STATUS(rocblas_dgemm_batched(std::forward(args)...)); + } else if constexpr (std::is_same_v) { + CHECK_ROCBLAS_STATUS(rocblas_hgemm_batched(std::forward(args)...)); } return; } @@ -54,6 +56,10 @@ template void run(benchmark::State& state, rocblas_handle& rb_handle, index_t t_a_i, index_t t_b_i, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta, index_t batch_size, int batch_type_i, bool* success) { + // scalar_t if scalar_t!=sycl::half, rocblas_half otherwise + using rocm_scalar_t = + typename blas_benchmark::utils::RocblasType::type; + // initialize the state label blas_benchmark::utils::set_benchmark_label(state); @@ -96,89 +102,90 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, index_t t_a_i, std::vector c = blas_benchmark::utils::const_data(c_size * batch_size, 0); - { - // Device memory allocation & H2D copy - blas_benchmark::utils::HIPVectorBatched a_batched_gpu( - a_size, batch_size, a.data()); - blas_benchmark::utils::HIPVectorBatched b_batched_gpu( - b_size, batch_size, b.data()); - blas_benchmark::utils::HIPVectorBatched c_batched_gpu(c_size, - batch_size); + // Device memory allocation & H2D copy + blas_benchmark::utils::HIPVectorBatched a_batched_gpu( + a_size, batch_size, reinterpret_cast(a.data())); + blas_benchmark::utils::HIPVectorBatched b_batched_gpu( + b_size, batch_size, reinterpret_cast(b.data())); + blas_benchmark::utils::HIPVectorBatched c_batched_gpu( + c_size, batch_size); + rocm_scalar_t alpha_rocm = *reinterpret_cast(&alpha); + rocm_scalar_t beta_rocm = *reinterpret_cast(&beta); #ifdef BLAS_VERIFY_BENCHMARK - // Reference batched gemm - std::vector c_ref = c; - for (int batch = 0; batch < batch_size; batch++) { - reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha, - a.data() + batch * a_size, lda, - b.data() + batch * b_size, ldb, beta, - c_ref.data() + batch * c_size, ldc); - } + // Reference batched gemm + std::vector c_ref = c; + for (int batch = 0; batch < batch_size; batch++) { + reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha, + a.data() + batch * a_size, lda, + b.data() + batch * b_size, ldb, beta, + c_ref.data() + batch * c_size, ldc); + } - // Rocblas verification gemm_batched - std::vector c_temp = c; - { - blas_benchmark::utils::HIPVectorBatched c_temp_gpu( - c_size, batch_size, c_temp.data()); - rocblas_gemm_batched_f( - rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha, a_batched_gpu, - lda, b_batched_gpu, ldb, &beta, c_temp_gpu, ldc, batch_size); - } + // Rocblas verification + // gemm_batched + std::vector c_temp = c; + { + blas_benchmark::utils::HIPVectorBatched c_temp_gpu( + c_size, batch_size, reinterpret_cast(c_temp.data())); + rocblas_gemm_batched_f( + rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha_rocm, a_batched_gpu, + lda, b_batched_gpu, ldb, &beta_rocm, c_temp_gpu, ldc, batch_size); + } - std::ostringstream err_stream; - if (!utils::compare_vectors(c_temp, c_ref, err_stream, "")) { - const std::string& err_str = err_stream.str(); - state.SkipWithError(err_str.c_str()); - *success = false; - }; + std::ostringstream err_stream; + if (!utils::compare_vectors(c_temp, c_ref, err_stream, "")) { + const std::string& err_str = err_stream.str(); + state.SkipWithError(err_str.c_str()); + *success = false; + }; #endif - auto blas_warmup = [&]() -> void { - rocblas_gemm_batched_f( - rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha, a_batched_gpu, - lda, b_batched_gpu, ldb, &beta, 
c_batched_gpu, ldc, batch_size); - return; - }; - - hipEvent_t start, stop; - CHECK_HIP_ERROR(hipEventCreate(&start)); - CHECK_HIP_ERROR(hipEventCreate(&stop)); - - auto blas_method_def = [&]() -> std::vector { - CHECK_HIP_ERROR(hipEventRecord(start, NULL)); - rocblas_gemm_batched_f( - rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha, a_batched_gpu, - lda, b_batched_gpu, ldb, &beta, c_batched_gpu, ldc, batch_size); - CHECK_HIP_ERROR(hipEventRecord(stop, NULL)); - CHECK_HIP_ERROR(hipEventSynchronize(stop)); - return std::vector{start, stop}; - }; - - // Warmup - blas_benchmark::utils::warmup(blas_warmup); - CHECK_HIP_ERROR(hipStreamSynchronize(NULL)); - - blas_benchmark::utils::init_counters(state); - - // Measure - for (auto _ : state) { - // Run - std::tuple times = - blas_benchmark::utils::timef_hip(blas_method_def); - - // Report - blas_benchmark::utils::update_counters(state, times); - } + auto blas_warmup = [&]() -> void { + rocblas_gemm_batched_f( + rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha_rocm, a_batched_gpu, + lda, b_batched_gpu, ldb, &beta_rocm, c_batched_gpu, ldc, batch_size); + return; + }; + + hipEvent_t start, stop; + CHECK_HIP_ERROR(hipEventCreate(&start)); + CHECK_HIP_ERROR(hipEventCreate(&stop)); + + auto blas_method_def = [&]() -> std::vector { + CHECK_HIP_ERROR(hipEventRecord(start, NULL)); + rocblas_gemm_batched_f( + rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha_rocm, a_batched_gpu, + lda, b_batched_gpu, ldb, &beta_rocm, c_batched_gpu, ldc, batch_size); + CHECK_HIP_ERROR(hipEventRecord(stop, NULL)); + CHECK_HIP_ERROR(hipEventSynchronize(stop)); + return std::vector{start, stop}; + }; + + // Warmup + blas_benchmark::utils::warmup(blas_warmup); + CHECK_HIP_ERROR(hipStreamSynchronize(NULL)); + + blas_benchmark::utils::init_counters(state); + + // Measure + for (auto _ : state) { + // Run + std::tuple times = + blas_benchmark::utils::timef_hip(blas_method_def); + + // Report + blas_benchmark::utils::update_counters(state, times); + } - state.SetBytesProcessed(state.iterations() * - state.counters["bytes_processed"]); - state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]); + state.SetBytesProcessed(state.iterations() * + state.counters["bytes_processed"]); + state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]); - blas_benchmark::utils::calc_avg_counters(state); + blas_benchmark::utils::calc_avg_counters(state); - CHECK_HIP_ERROR(hipEventDestroy(start)); - CHECK_HIP_ERROR(hipEventDestroy(stop)); - } // release device memory via utils::DeviceVector destructors + CHECK_HIP_ERROR(hipEventDestroy(start)); + CHECK_HIP_ERROR(hipEventDestroy(stop)); }; template diff --git a/benchmark/rocblas/blas3/gemm_batched_strided.cpp b/benchmark/rocblas/blas3/gemm_batched_strided.cpp index 3ecbff82c..61526b246 100644 --- a/benchmark/rocblas/blas3/gemm_batched_strided.cpp +++ b/benchmark/rocblas/blas3/gemm_batched_strided.cpp @@ -36,6 +36,9 @@ static inline void rocblas_gemm_strided_batched(args_t&&... 
args) { } else if constexpr (std::is_same_v) { CHECK_ROCBLAS_STATUS( rocblas_dgemm_strided_batched(std::forward(args)...)); + } else if constexpr (std::is_same_v) { + CHECK_ROCBLAS_STATUS( + rocblas_hgemm_strided_batched(std::forward(args)...)); } return; } @@ -59,6 +62,10 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i, int t_b_i, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta, index_t batch_size, index_t stride_a_mul, index_t stride_b_mul, index_t stride_c_mul, bool* success) { + // scalar_t if scalar_t!=sycl::half, rocblas_half otherwise + using rocm_scalar_t = + typename blas_benchmark::utils::RocblasType::type; + // initialize the state label blas_benchmark::utils::set_benchmark_label(state); @@ -110,93 +117,94 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i, std::vector c = blas_benchmark::utils::const_data(size_c_batch, 0); - { - // Device memory allocation & H2D copy - blas_benchmark::utils::HIPVectorBatchedStrided a_batched_gpu( - a_size, batch_size, stride_a, a.data()); - blas_benchmark::utils::HIPVectorBatchedStrided b_batched_gpu( - b_size, batch_size, stride_b, b.data()); - blas_benchmark::utils::HIPVectorBatchedStrided c_batched_gpu( - c_size, batch_size, stride_c, c.data()); + // Device memory allocation & H2D copy + blas_benchmark::utils::HIPVectorBatchedStrided a_batched_gpu( + a_size, batch_size, stride_a, reinterpret_cast(a.data())); + blas_benchmark::utils::HIPVectorBatchedStrided b_batched_gpu( + b_size, batch_size, stride_b, reinterpret_cast(b.data())); + blas_benchmark::utils::HIPVectorBatchedStrided c_batched_gpu( + c_size, batch_size, stride_c, reinterpret_cast(c.data())); + rocm_scalar_t alpha_rocm = *reinterpret_cast(&alpha); + rocm_scalar_t beta_rocm = *reinterpret_cast(&beta); #ifdef BLAS_VERIFY_BENCHMARK - // Reference gemm batched strided (strided loop of gemm) - std::vector c_ref = c; - for (int batch = 0; batch < batch_size; batch++) { - reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha, - a.data() + batch * stride_a, lda, - b.data() + batch * stride_b, ldb, beta, - c_ref.data() + batch * stride_c, ldc); - } + // Reference gemm batched strided (strided loop of gemm) + std::vector c_ref = c; + for (int batch = 0; batch < batch_size; batch++) { + reference_blas::gemm(t_a_str, t_b_str, m, n, k, alpha, + a.data() + batch * stride_a, lda, + b.data() + batch * stride_b, ldb, beta, + c_ref.data() + batch * stride_c, ldc); + } - // Rocblas verification gemm_batched_strided - std::vector c_temp = c; - { - blas_benchmark::utils::HIPVectorBatchedStrided c_temp_gpu( - c_size, batch_size, stride_c, c_temp.data()); - rocblas_gemm_strided_batched( - rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha, a_batched_gpu, - lda, stride_a, b_batched_gpu, ldb, stride_b, &beta, c_temp_gpu, ldc, - stride_c, batch_size); - } + // Rocblas verification gemm_batched_strided + std::vector c_temp = c; + { + blas_benchmark::utils::HIPVectorBatchedStrided + c_temp_gpu(c_size, batch_size, stride_c, + reinterpret_cast(c_temp.data())); + rocblas_gemm_strided_batched( + rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha_rocm, a_batched_gpu, + lda, stride_a, b_batched_gpu, ldb, stride_b, &beta_rocm, c_temp_gpu, + ldc, stride_c, batch_size); + } - std::ostringstream err_stream; - if (!utils::compare_vectors_strided(c_temp, c_ref, stride_c, c_size, - err_stream, "")) { - const std::string& err_str = err_stream.str(); - state.SkipWithError(err_str.c_str()); - *success = false; - }; + std::ostringstream err_stream; + if 
(!utils::compare_vectors_strided(c_temp, c_ref, stride_c, c_size, + err_stream, "")) { + const std::string& err_str = err_stream.str(); + state.SkipWithError(err_str.c_str()); + *success = false; + }; #endif - auto blas_warmup = [&]() -> void { - rocblas_gemm_strided_batched( - rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha, a_batched_gpu, - lda, stride_a, b_batched_gpu, ldb, stride_b, &beta, c_batched_gpu, - ldc, stride_c, batch_size); - return; - }; - - hipEvent_t start, stop; - CHECK_HIP_ERROR(hipEventCreate(&start)); - CHECK_HIP_ERROR(hipEventCreate(&stop)); - - auto blas_method_def = [&]() -> std::vector { - CHECK_HIP_ERROR(hipEventRecord(start, NULL)); - rocblas_gemm_strided_batched( - rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha, a_batched_gpu, - lda, stride_a, b_batched_gpu, ldb, stride_b, &beta, c_batched_gpu, - ldc, stride_c, batch_size); - CHECK_HIP_ERROR(hipEventRecord(stop, NULL)); - CHECK_HIP_ERROR(hipEventSynchronize(stop)); - return std::vector{start, stop}; - }; - - // Warmup - blas_benchmark::utils::warmup(blas_warmup); - CHECK_HIP_ERROR(hipStreamSynchronize(NULL)); - - blas_benchmark::utils::init_counters(state); - - // Measure - for (auto _ : state) { - // Run - std::tuple times = - blas_benchmark::utils::timef_hip(blas_method_def); - - // Report - blas_benchmark::utils::update_counters(state, times); - } + auto blas_warmup = [&]() -> void { + rocblas_gemm_strided_batched( + rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha_rocm, a_batched_gpu, + lda, stride_a, b_batched_gpu, ldb, stride_b, &beta_rocm, c_batched_gpu, + ldc, stride_c, batch_size); + return; + }; + + hipEvent_t start, stop; + CHECK_HIP_ERROR(hipEventCreate(&start)); + CHECK_HIP_ERROR(hipEventCreate(&stop)); + + auto blas_method_def = [&]() -> std::vector { + CHECK_HIP_ERROR(hipEventRecord(start, NULL)); + rocblas_gemm_strided_batched( + rb_handle, trans_a_rb, trans_b_rb, m, n, k, &alpha_rocm, a_batched_gpu, + lda, stride_a, b_batched_gpu, ldb, stride_b, &beta_rocm, c_batched_gpu, + ldc, stride_c, batch_size); + CHECK_HIP_ERROR(hipEventRecord(stop, NULL)); + CHECK_HIP_ERROR(hipEventSynchronize(stop)); + return std::vector{start, stop}; + }; + + // Warmup + blas_benchmark::utils::warmup(blas_warmup); + CHECK_HIP_ERROR(hipStreamSynchronize(NULL)); + + blas_benchmark::utils::init_counters(state); + + // Measure + for (auto _ : state) { + // Run + std::tuple times = + blas_benchmark::utils::timef_hip(blas_method_def); + + // Report + blas_benchmark::utils::update_counters(state, times); + } - state.SetBytesProcessed(state.iterations() * - state.counters["bytes_processed"]); - state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]); + state.SetBytesProcessed(state.iterations() * + state.counters["bytes_processed"]); + state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]); - blas_benchmark::utils::calc_avg_counters(state); + blas_benchmark::utils::calc_avg_counters(state); - CHECK_HIP_ERROR(hipEventDestroy(start)); - CHECK_HIP_ERROR(hipEventDestroy(stop)); - } // release device memory via utils::DeviceVector destructors + CHECK_HIP_ERROR(hipEventDestroy(start)); + CHECK_HIP_ERROR(hipEventDestroy(stop)); }; template diff --git a/benchmark/rocblas/utils.hpp b/benchmark/rocblas/utils.hpp index d88832225..e5637e1bc 100644 --- a/benchmark/rocblas/utils.hpp +++ b/benchmark/rocblas/utils.hpp @@ -29,8 +29,9 @@ #include "portblas.h" #include +#include #include -#include +#include #ifndef CHECK_HIP_ERROR #define CHECK_HIP_ERROR(error) \ @@ -254,8 +255,8 @@ class 
HIPVectorBatched : private HIPDeviceMemory { } // Decay into device array pointer wherever pointer is expected - operator T* *() { return d_batch_data_; } - operator const T* *() const { return d_batch_data_; } + operator T**() { return d_batch_data_; } + operator const T**() const { return d_batch_data_; } // Disallow copying or assigning HIPVectorBatched(const HIPVectorBatched&) = delete; @@ -373,6 +374,20 @@ static inline std::tuple timef_hip(function_t func, return std::make_tuple(overall_time, static_cast(elapsed_time) * 1E6); } +/** + * Reference type of the underlying benchmark data aimed to match the + * rocm/rocBLAS scalar types. + */ +template +struct RocblasType { + using type = T; +}; + +// When T is sycl::half, use rocBLAS's rocblas_half as type. +template +struct RocblasType>> { + using type = rocblas_half; +}; } // namespace utils } // namespace blas_benchmark diff --git a/cmake/CmakeFunctionHelper.cmake b/cmake/CmakeFunctionHelper.cmake index beae299e1..2825b3a92 100644 --- a/cmake/CmakeFunctionHelper.cmake +++ b/cmake/CmakeFunctionHelper.cmake @@ -77,7 +77,15 @@ function(sanitize_file_name output file_name) endfunction() #List of operators supporting Complex Data types -set(COMPLEX_OPS "gemm" "gemm_launcher" "scal") +set(COMPLEX_OPS "gemm" + "gemm_launcher" + "scal") + +#List of operators supporting Half Data types +set(HALF_DATA_OPS "axpy" + "scal" + "gemm" + "gemm_launcher") function(set_target_compile_def in_target) #setting compiler flag for backend @@ -118,6 +126,12 @@ function(set_target_compile_def in_target) target_compile_definitions(${in_target} PUBLIC BLAS_ENABLE_COMPLEX=1) endif() endif() + if(${BLAS_ENABLE_HALF}) + if("${in_target}" IN_LIST HALF_DATA_OPS) + message(STATUS "Half Data type support enabled for target ${in_target}") + target_compile_definitions(${in_target} PUBLIC BLAS_ENABLE_HALF=1) + endif() + endif() endfunction() # blas unary function for generating source code @@ -131,6 +145,13 @@ function(generate_blas_objects blas_level func) set_complex_list(data_list_c "${data_list}" "true") endif() endif() + # Extend data_list with 'half' if target function is + # in HALF_DATA_OPS + if(BLAS_ENABLE_HALF) + if("${func}" IN_LIST HALF_DATA_OPS) + list(APPEND data_list_c "half") + endif() + endif() foreach(data ${data_list_c}) cpp_type(cpp_data ${data}) foreach(index ${index_list}) @@ -276,6 +297,9 @@ function(add_gemm_configuration if(BLAS_ENABLE_COMPLEX) set_complex_list(data_list_c "${data_list}" "true") endif() + if(BLAS_ENABLE_HALF) + list(APPEND data_list_c "half") + endif() if(NOT ("${data}" IN_LIST data_list_c)) # Data type not enabled, skip configuration return() @@ -291,7 +315,7 @@ function(add_gemm_configuration cpp_type(cpp_data ${data}) foreach(symm_a ${boolean_list}) foreach(symm_b ${boolean_list}) - if ((${data} MATCHES "complex") AND (symm_a OR symm_b)) + if ((${data} MATCHES "half") AND (symm_a OR symm_b)) continue() endif() if (symm_a AND symm_b) @@ -374,42 +398,31 @@ if(${TUNING_TARGET} STREQUAL "INTEL_GPU") set(supported_types "float" "double" - "half" ) foreach(data ${supported_types}) - add_gemm_configuration( - "${data}" 64 "true" "false" "false" - 64 4 4 8 8 1 1 1 1 1 1 1 1 1 float float "local" "standard" "full" 4 "strided" "false") add_gemm_configuration( "${data}" 64 "false" "false" "false" - 64 4 8 16 8 1 1 1 1 1 1 1 1 1 float float "local" "standard" "full" 4 "strided" "false") + 64 4 4 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false") + add_gemm_configuration( - "${data}" 64 "false" "false" 
"false" - 64 8 8 8 8 1 1 1 1 1 1 1 1 1 float float "no_local" "standard" "partial" 4 "strided" "false") - - if (${data} STREQUAL "half") - add_gemm_configuration( - "${data}" 16 "true" "false" "false" - 64 1 1 8 8 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") - add_gemm_configuration( - "${data}" 16 "true" "false" "false" - 64 2 2 8 8 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") - else() - add_gemm_configuration( - "${data}" 16 "true" "false" "false" - 64 1 1 4 4 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") - add_gemm_configuration( - "${data}" 16 "true" "false" "false" - 64 2 2 4 4 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") - endif() - + "${data}" 32 "true" "true" "true" + 64 2 1 8 4 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") + add_gemm_configuration( + "${data}" 16 "true" "false" "false" + 64 1 1 4 4 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") + add_gemm_configuration( + "${data}" 32 "true" "true" "true" + 64 2 2 8 4 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") + add_gemm_configuration( + "${data}" 16 "true" "false" "false" + 64 2 2 4 4 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") add_gemm_configuration( "${data}" 64 "true" "true" "true" 64 2 2 8 8 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") add_gemm_configuration( - "${data}" 64 "true" "true" "true" - 64 4 4 8 8 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") - + "${data}" 64 "true" "false" "false" + 64 4 4 8 8 1 1 1 1 1 1 1 1 1 float float "local" "standard" "full" 4 "strided" "false") + if (${data} STREQUAL "double") add_gemm_configuration( "${data}" 256 "true" "true" "true" @@ -421,16 +434,27 @@ if(${TUNING_TARGET} STREQUAL "INTEL_GPU") endif() add_gemm_configuration( - "${data}" 32 "true" "true" "true" - 64 2 1 8 4 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") + "${data}" 64 "true" "true" "true" + 64 4 4 8 8 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") add_gemm_configuration( - "${data}" 32 "true" "true" "true" - 64 2 2 8 4 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") - + "${data}" 64 "false" "false" "false" + 64 8 8 8 8 1 1 1 1 1 1 1 1 1 float float "no_local" "standard" "partial" 4 "strided" "false") add_gemm_configuration( "${data}" 64 "false" "false" "false" - 64 4 4 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false") + 64 4 8 16 8 1 1 1 1 1 1 1 1 1 float float "local" "standard" "full" 4 "strided" "false") endforeach() + if(BLAS_ENABLE_HALF) + add_gemm_configuration( + "half" 64 "false" "false" "false" + 64 4 4 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false") + add_gemm_configuration( + "half" 16 "true" "false" "false" + 64 2 2 8 8 1 1 1 1 1 1 1 1 1 float float "local" "tall_skinny" "none" 4 "strided" "false") + add_gemm_configuration( + "half" 64 "false" "false" "false" + 64 4 8 16 8 1 1 1 1 1 1 1 1 1 float float "local" "standard" "full" 4 "strided" "false") + endif() + if(BLAS_ENABLE_COMPLEX) # Extract list of complex for each data in supported_types # list for complex specific gemm configurations @@ -486,8 +510,10 @@ elseif(${TUNING_TARGET} STREQUAL "AMD_GPU") # need investigation set(supported_types 
"float" "double" - "half" ) + if(BLAS_ENABLE_HALF) + list(APPEND supported_types "half") + endif() set(workgroup_float 16) set(workgroup_double 8) set(workgroup_half 32) @@ -634,6 +660,19 @@ elseif(${TUNING_TARGET} STREQUAL "NVIDIA_GPU") 64 2 2 16 16 1 1 2 2 1 1 1 1 1 float float "local" "standard" "full" 1 "strided" "false") endforeach() endif() # BLAS_ENABLE_COMPLEX + + if(BLAS_ENABLE_HALF) + add_gemm_configuration( + "half" 64 "false" "false" "false" + 64 2 2 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false") + add_gemm_configuration( + "half" 256 "false" "true" "true" + 128 4 4 16 16 1 1 1 1 1 1 1 1 1 float float "local" "standard" "full" 1 "strided" "false") + add_gemm_configuration( + "half" 256 "false" "true" "true" + 128 8 8 16 16 1 1 1 1 1 1 1 1 1 float float "local" "standard" "full" 1 "strided" "false") + endif() # BLAS_ENABLE_HALF + else() # default cpu backend set(supported_types "float" @@ -663,8 +702,8 @@ else() # default cpu backend add_gemm_configuration( "${data}" 64 "false" "false" "false" 64 2 2 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false" "false") - endforeach() + if(BLAS_ENABLE_COMPLEX) # Extract list of complex for each data in supported_types # list for complex specific gemm configurations diff --git a/cmake/Modules/ConfigurePORTBLAS.cmake b/cmake/Modules/ConfigurePORTBLAS.cmake index 5112e9d02..a66eebfed 100644 --- a/cmake/Modules/ConfigurePORTBLAS.cmake +++ b/cmake/Modules/ConfigurePORTBLAS.cmake @@ -49,15 +49,6 @@ if("double" IN_LIST BLAS_DATA_TYPES) add_definitions(-DBLAS_DATA_TYPE_DOUBLE) endif() -if("half" IN_LIST BLAS_DATA_TYPES) - add_definitions(-DBLAS_DATA_TYPE_HALF) -endif() - -# If the user has specified a specific workgroup size for tests, pass that on to the compiler -if(WG_SIZE) - add_definitions(-DWG_SIZE=${WG_SIZE}) -endif() - # If the user has specified that we should use naive gemm, enable that option(NAIVE_GEMM "Default to naive GEMM implementations" off) if(NAIVE_GEMM) diff --git a/common/include/common/common_utils.hpp b/common/include/common/common_utils.hpp index 9a1354346..cc9496581 100644 --- a/common/include/common/common_utils.hpp +++ b/common/include/common/common_utils.hpp @@ -1626,6 +1626,11 @@ inline std::string get_type_name() { return "double"; } +template <> +inline std::string get_type_name() { + return "half"; +} + #ifdef BLAS_ENABLE_COMPLEX template <> inline std::string get_type_name>() { @@ -1656,7 +1661,10 @@ template static inline scalar_t random_scalar(scalar_t rangeMin, scalar_t rangeMax) { static std::random_device rd; static std::default_random_engine gen(rd()); - std::uniform_real_distribution dis(rangeMin, rangeMax); + using random_scalar_t = + std::conditional_t, float, + scalar_t>; + std::uniform_real_distribution dis(rangeMin, rangeMax); return dis(gen); } @@ -1929,7 +1937,7 @@ static inline void calc_avg_counters(benchmark::State& state) { #define BLAS_REGISTER_BENCHMARK_DOUBLE(args, sb_handle_ptr, success) #endif // BLAS_DATA_TYPE_DOUBLE -#ifdef BLAS_DATA_TYPE_HALF +#ifdef BLAS_ENABLE_HALF /** Registers benchmark for the cl::sycl::half data type * @see BLAS_REGISTER_BENCHMARK */ @@ -1937,7 +1945,7 @@ static inline void calc_avg_counters(benchmark::State& state) { register_benchmark(args, sb_handle_ptr, success) #else #define BLAS_REGISTER_BENCHMARK_HALF(args, sb_handle_ptr, success) -#endif // BLAS_DATA_TYPE_HALF +#endif // BLAS_ENABLE_HALF #ifdef BLAS_ENABLE_COMPLEX /** Registers benchmark for the float complex data type diff --git 
a/common/include/common/float_comparison.hpp b/common/include/common/float_comparison.hpp index 1222ccc41..3cfc0885e 100644 --- a/common/include/common/float_comparison.hpp +++ b/common/include/common/float_comparison.hpp @@ -32,7 +32,6 @@ #include #endif -#ifdef BLAS_DATA_TYPE_HALF #if SYCL_LANGUAGE_VERSION < 202000 #include inline std::ostream& operator<<(std::ostream& os, const cl::sycl::half& value) { @@ -49,7 +48,6 @@ class numeric_limits { }; } // namespace std #endif // SYCL_LANGUAGE_VERSION -#endif // BLAS_DATA_TYPE_HALF namespace utils { @@ -85,7 +83,6 @@ scalar_t abs(std::complex value) noexcept { } #endif -#ifdef BLAS_DATA_TYPE_HALF template <> inline bool isnan(cl::sycl::half value) noexcept { return std::isnan(static_cast(value)); @@ -101,8 +98,6 @@ inline cl::sycl::half abs(cl::sycl::half value) noexcept { return std::abs(static_cast(value)); } -#endif // BLAS_DATA_TYPE_HALF - template scalar_t clamp_to_limits(scalar_t v) { constexpr auto min_value = std::numeric_limits::min(); @@ -139,14 +134,12 @@ inline double getRelativeErrorMargin() { return 0.0000000001; // 10^-10 } -#ifdef BLAS_DATA_TYPE_HALF - template <> inline cl::sycl::half getRelativeErrorMargin() { // Measured empirically with gemm return 0.05f; } -#endif + /** * Indicates the tolerated margin for absolute differences (used in case the * scalars are close to 0) @@ -168,14 +161,12 @@ inline double getAbsoluteErrorMargin() { */ return 0.0000000001; // 10^-10 } -#ifdef BLAS_DATA_TYPE_HALF template <> inline cl::sycl::half getAbsoluteErrorMargin() { // Measured empirically with gemm. return 1.0f; } -#endif /** * Compare two scalars and returns false if the difference is not acceptable. @@ -208,7 +199,7 @@ inline bool almost_equal(scalar_t const& scalar1, scalar_t const& scalar2) { * Compare two vectors and returns false if the difference is not acceptable. * The second vector is considered the reference. * @tparam scalar_t the type of data present in the input vectors - * @tparam epilon_t the type used as tolerance. Lower precision types + * @tparam epsilon_t the type used as tolerance. Lower precision types * (cl::sycl::half) will have a higher tolerance for errors */ template @@ -238,7 +229,7 @@ inline bool compare_vectors(std::vector const& vec, * not acceptable. The second vector is considered the reference. * @tparam scalar_t the type of complex underying data present in the input * vectors - * @tparam epilon_t the type used as tolerance. + * @tparam epsilon_t the type used as tolerance. 
*/ template <typename scalar_t, typename epsilon_t = scalar_t> inline bool compare_vectors(std::vector<std::complex<scalar_t>> const& vec, diff --git a/common/include/common/set_benchmark_label.hpp b/common/include/common/set_benchmark_label.hpp index 9495a3195..c684b3bc5 100644 --- a/common/include/common/set_benchmark_label.hpp +++ b/common/include/common/set_benchmark_label.hpp @@ -174,13 +174,13 @@ inline void add_datatype_info( } #endif -#ifdef BLAS_DATA_TYPE_HALF +#ifdef BLAS_ENABLE_HALF template <> inline void add_datatype_info<cl::sycl::half>( std::map<std::string, std::string>& key_value_map) { key_value_map["@datatype"] = "half"; } -#endif // BLAS_DATA_TYPE_HALF +#endif // BLAS_ENABLE_HALF #ifdef BLAS_ENABLE_COMPLEX template <> diff --git a/common/include/common/system_reference_blas.hpp b/common/include/common/system_reference_blas.hpp index cd07e27cf..dc98f120e 100644 --- a/common/include/common/system_reference_blas.hpp +++ b/common/include/common/system_reference_blas.hpp @@ -151,8 +151,22 @@ scalar_t asum(const int n, scalar_t x[], const int incX) { template void axpy(const int n, scalar_t alpha, const scalar_t x[], const int incX, scalar_t y[], const int incY) { - auto func = blas_system_function(&cblas_saxpy, &cblas_daxpy); - func(n, alpha, x, incX, y, incY); + if constexpr (!std::is_same_v<scalar_t, cl::sycl::half>) { + auto func = blas_system_function(&cblas_saxpy, &cblas_daxpy); + func(n, alpha, x, incX, y, incY); + } else { + // Casting scalar half values to float in order to call the reference library. + int x_size = n * std::abs(incX); + int y_size = n * std::abs(incY); + float alpha_f = alpha; + std::vector<float> x_f(x_size); + std::vector<float> y_f(y_size); + for (int i = 0; i < x_size; ++i) x_f[i] = static_cast<float>(x[i]); + for (int i = 0; i < y_size; ++i) y_f[i] = static_cast<float>(y[i]); + + cblas_saxpy(n, alpha_f, x_f.data(), incX, y_f.data(), incY); + for (int i = 0; i < y_size; ++i) y[i] = static_cast<scalar_t>(y_f[i]); + } } template @@ -224,8 +238,18 @@ void rotmg(scalar_t *d1, scalar_t *d2, scalar_t *x1, scalar_t *y1, template void scal(const int n, const scalar_t alpha, scalar_t x[], const int incX) { - auto func = blas_system_function(&cblas_sscal, &cblas_dscal); - func(n, alpha, x, incX); + if constexpr (!std::is_same_v<scalar_t, cl::sycl::half>) { + auto func = blas_system_function(&cblas_sscal, &cblas_dscal); + func(n, alpha, x, incX); + } else { + // Casting scalar half values to float in order to call the reference library. + int size = n * std::abs(incX); + float alpha_f = alpha; + std::vector<float> x_f(size); + for (int i = 0; i < size; ++i) x_f[i] = static_cast<float>(x[i]); + cblas_sscal(n, alpha_f, x_f.data(), incX); + for (int i = 0; i < size; ++i) x[i] = static_cast<scalar_t>(x_f[i]); + } } template @@ -379,9 +403,31 @@ template void gemm(const char *transA, const char *transB, int m, int n, int k, scalar_t alpha, const scalar_t a[], int lda, const scalar_t b[], int ldb, scalar_t beta, scalar_t c[], int ldc) { - auto func = blas_system_function(&cblas_sgemm, &cblas_dgemm); - func(CblasColMajor, c_trans(*transA), c_trans(*transB), m, n, k, alpha, a, - lda, b, ldb, beta, c, ldc); + if constexpr (!std::is_same_v<scalar_t, cl::sycl::half>) { + auto func = blas_system_function(&cblas_sgemm, &cblas_dgemm); + func(CblasColMajor, c_trans(*transA), c_trans(*transB), m, n, k, alpha, a, + lda, b, ldb, beta, c, ldc); + } else { + // Casting scalar half values to float in order to call the reference library. + int a_size = (transA[0] != 'n') ? lda * m : lda * k; + int b_size = (transB[0] != 'n') ? ldb * k : ldb * n; + int c_size = ldc * n; + float alpha_f = alpha; + float beta_f = beta; + + std::vector<float> a_f(a_size); + std::vector<float> b_f(b_size); + std::vector<float> c_f(c_size); + + for (int i = 0; i < a_size; ++i) a_f[i] = static_cast<float>(a[i]); + for (int i = 0; i < b_size; ++i) b_f[i] = static_cast<float>(b[i]); + for (int i = 0; i < c_size; ++i) c_f[i] = static_cast<float>(c[i]); + + cblas_sgemm(CblasColMajor, c_trans(*transA), c_trans(*transB), m, n, k, + alpha_f, a_f.data(), lda, b_f.data(), ldb, beta_f, c_f.data(), + ldc); + for (int i = 0; i < c_size; ++i) c[i] = static_cast<scalar_t>(c_f[i]); + } } template
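The axpy/scal/gemm fallbacks above all follow one pattern: the reference BLAS has no half-precision kernels, so half data is widened to `float`, the single-precision `cblas_*` routine is run, and the output is narrowed back. A minimal sketch of that round-trip for `scal`, assuming a CBLAS implementation providing `cblas.h` and a `half_t` type (e.g. `cl::sycl::half`) that converts implicitly to and from `float`:

```cpp
#include <cblas.h>
#include <cstdlib>
#include <vector>

template <typename half_t>
void scal_half_reference(int n, half_t alpha, half_t x[], int incX) {
  const int size = n * std::abs(incX);
  // Widen: half -> float, element by element on the host.
  std::vector<float> x_f(x, x + size);
  // Run the single-precision reference kernel.
  cblas_sscal(n, static_cast<float>(alpha), x_f.data(), incX);
  // Narrow the result back: float -> half.
  for (int i = 0; i < size; ++i) x[i] = static_cast<half_t>(x_f[i]);
}
```

This is also why the verification tolerances for half in `float_comparison.hpp` above are much looser than for `float`/`double`: results are produced in fp16 but checked against this fp32 reference.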
diff --git a/include/blas_meta.h b/include/blas_meta.h
index f2f641ca2..b80639128 100644
--- a/include/blas_meta.h
+++ b/include/blas_meta.h
@@ -171,7 +171,7 @@ int append_vector(vector_t &lhs_vector, vector_t const &rhs_vector) {
 template <typename first_vector_t, typename... other_vector_t>
 first_vector_t concatenate_vectors(first_vector_t first_vector,
-                                   other_vector_t &&... other_vectors) {
+                                   other_vector_t &&...other_vectors) {
   int first_Vector_size = static_cast<int>(first_vector.size());
   int s[] = {vec_total_size(first_Vector_size, other_vectors)..., 0};
   first_vector.reserve(first_Vector_size);
@@ -199,6 +199,10 @@ struct is_sycl_scalar<std::complex<float>> : std::false_type {};
 template <>
 struct is_sycl_scalar<std::complex<double>> : std::false_type {};
 
+template <typename scalar_t>
+struct is_half
+    : std::integral_constant<bool, std::is_same_v<scalar_t, cl::sycl::half>> {};
+
 #ifdef BLAS_ENABLE_COMPLEX
 // SYCL Complex type alias
 template
diff --git a/include/operations/blas_constants.h b/include/operations/blas_constants.h
index 976e76284..229cd6721 100644
--- a/include/operations/blas_constants.h
+++ b/include/operations/blas_constants.h
@@ -212,7 +212,6 @@ struct constant<std::complex<value_t>, Indicator> {
 };
 #endif
 
-#ifdef BLAS_DATA_TYPE_HALF
 template <>
 struct constant<cl::sycl::half, const_val::zero>
     : constant<float, const_val::zero> {};
@@ -252,7 +251,6 @@ struct constant<cl::sycl::half, const_val::abs_max>
 template <>
 struct constant<cl::sycl::half, const_val::collapse>
     : constant<float, const_val::collapse> {};
-#endif  // BLAS_DATA_TYPE_HALF
 
 template
 struct constant_pair {
diff --git a/src/interface/blas3/backend/default_cpu.hpp b/src/interface/blas3/backend/default_cpu.hpp
index e0b519e61..33f50539f 100644
--- a/src/interface/blas3/backend/default_cpu.hpp
+++ b/src/interface/blas3/backend/default_cpu.hpp
@@ -33,7 +33,8 @@
 namespace backend {
 template <bool _t_a, bool _t_b, bool s_a, bool s_b, bool is_beta_zero,
           typename sb_handle_t, typename container_0_t, typename container_1_t,
           typename container_2_t, typename element_t, typename index_t>
-typename std::enable_if<!is_complex_sycl<element_t>::value,
+typename std::enable_if<!is_complex_sycl<element_t>::value &&
+                            !is_half<element_t>::value,
                         typename sb_handle_t::event_t>::type
 _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
       element_t _alpha, container_0_t _a, index_t _lda, index_t _stridea,
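The `!is_half<element_t>` term added to the generic `_gemm`'s `enable_if`, paired with an `is_half`-constrained overload (see intel_gpu.hpp below), makes the two declarations mutually exclusive, so exactly one survives substitution for any element type. A stripped-down model of that dispatch, with toy types standing in for the portBLAS ones:

```cpp
#include <iostream>
#include <type_traits>

struct half {};  // stand-in for cl::sycl::half

template <typename T>
struct is_half : std::is_same<T, half> {};

// Generic configurations: removed from the overload set when T is half.
template <typename T>
typename std::enable_if<!is_half<T>::value, void>::type gemm(T) {
  std::cout << "generic configuration\n";
}

// Half configurations: only viable when T is half, so no call is ambiguous.
template <typename T>
typename std::enable_if<is_half<T>::value, void>::type gemm(T) {
  std::cout << "half configuration\n";
}

int main() {
  gemm(1.0f);    // picks the generic overload
  gemm(half{});  // picks the half overload
}
```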
diff --git a/src/interface/blas3/backend/intel_gpu.hpp b/src/interface/blas3/backend/intel_gpu.hpp
index 77b7e232f..bf56e684e 100644
--- a/src/interface/blas3/backend/intel_gpu.hpp
+++ b/src/interface/blas3/backend/intel_gpu.hpp
@@ -32,7 +32,8 @@
 namespace backend {
 template <bool _t_a, bool _t_b, bool s_a, bool s_b, bool is_beta_zero,
           typename sb_handle_t, typename container_0_t, typename container_1_t,
           typename container_2_t, typename element_t, typename index_t>
-typename std::enable_if<!is_complex_sycl<element_t>::value,
+typename std::enable_if<!is_complex_sycl<element_t>::value &&
+                            !is_half<element_t>::value,
                         typename sb_handle_t::event_t>::type
 _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
       element_t _alpha, container_0_t _a, index_t _lda, index_t _stridea,
@@ -73,12 +74,9 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
                            _stridea, _b, _ldb, _strideb, _beta, _c, _ldc,
                            _stridec, batch_size, _dependencies);
   } else if (_M <= 4 || _N <= 4) {
-    // Need to increase the work group size for cl::sycl::half for the
-    // launcher to be instancianted
-    constexpr int wg_size = sizeof(element_t) == 2 ? 8 : 4;
     return blas::Gemm_Launcher<
         container_0_t, container_1_t, container_2_t, 16, true, false,
-        false, 64, Tile<1, 1, wg_size, wg_size>, _t_a, _t_b, s_a, s_b,
+        false, 64, Tile<1, 1, 4, 4>, _t_a, _t_b, s_a, s_b,
         static_cast<int>(gemm_memory_t::local),
         static_cast<int>(gemm_algorithm_t::tall_skinny),
         static_cast<int>(gemm_vectorization_t::none), is_beta_zero, 4,
@@ -98,12 +96,9 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
                            _stridea, _b, _ldb, _strideb, _beta, _c, _ldc,
                            _stridec, batch_size, _dependencies);
   } else if (_M <= 8 || _N <= 8) {
-    // Need to increase the work group size for cl::sycl::half for the
-    // launcher to be instancianted
-    constexpr int wg_size = sizeof(element_t) == 2 ? 8 : 4;
     return blas::Gemm_Launcher<
         container_0_t, container_1_t, container_2_t, 16, true, false,
-        false, 64, Tile<2, 2, wg_size, wg_size>, _t_a, _t_b, s_a, s_b,
+        false, 64, Tile<2, 2, 4, 4>, _t_a, _t_b, s_a, s_b,
         static_cast<int>(gemm_memory_t::local),
         static_cast<int>(gemm_algorithm_t::tall_skinny),
         static_cast<int>(gemm_vectorization_t::none), is_beta_zero, 4,
@@ -134,6 +129,8 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
                            _stridea, _b, _ldb, _strideb, _beta, _c, _ldc,
                            _stridec, batch_size, _dependencies);
   } else {
+    // Need to increase the work group size for double for the
+    // launcher to be instantiated
     constexpr int wg_size = sizeof(element_t) == 8 ? 8 : 16;
     return blas::Gemm_Launcher<
         container_0_t, container_1_t, container_2_t, 256, true, true,
@@ -213,6 +210,66 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
   }
 }
 
+// Half Configurations
+template <bool _t_a, bool _t_b, bool s_a, bool s_b, bool is_beta_zero,
+          typename sb_handle_t, typename container_0_t, typename container_1_t,
+          typename container_2_t, typename element_t, typename index_t>
+typename std::enable_if<is_half<element_t>::value,
+                        typename sb_handle_t::event_t>::type
+_gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
+      element_t _alpha, container_0_t _a, index_t _lda, index_t _stridea,
+      container_1_t _b, index_t _ldb, index_t _strideb, element_t _beta,
+      container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size,
+      gemm_batch_type_t batch_type,
+      const typename sb_handle_t::event_t& _dependencies) {
+  // Unused configuration cases
+  if constexpr (s_a && s_b || ((s_a && _t_b) || (s_b && _t_a))) {
+    return _dependencies;
+  } else {
+    if (batch_type == gemm_batch_type_t::interleaved) {
+      return blas::Gemm_Launcher<
+          container_0_t, container_1_t, container_2_t, 64, false, false, false,
+          64, Tile<4, 4, 4, 4, 1, 1, 1, 1, 4, 4>, _t_a, _t_b, s_a, s_b,
+          static_cast<int>(gemm_memory_t::no_local),
+          static_cast<int>(gemm_algorithm_t::standard),
+          static_cast<int>(gemm_vectorization_t::full), is_beta_zero, 4,
+          static_cast<int>(gemm_batch_type_t::interleaved)>::
+          template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda,
+                                _stridea, _b, _ldb, _strideb, _beta, _c, _ldc,
+                                _stridec, batch_size, _dependencies);
+    } else {
+#ifdef GEMM_TALL_SKINNY_SUPPORT
+      if (!s_a && !s_b) {
+        // Tall & Skinny matrices
+        if (batch_size == 1 && ((_K >= 4096 && _M * _N <= 16384) ||
+                                (_K >= 1024 && _M * _N <= 4096))) {
+          return blas::Gemm_Launcher<
+              container_0_t, container_1_t, container_2_t, 16, true, false,
+              false, 64, Tile<2, 2, 8, 8>, _t_a, _t_b, s_a, s_b,
+              static_cast<int>(gemm_memory_t::local),
+              static_cast<int>(gemm_algorithm_t::tall_skinny),
+              static_cast<int>(gemm_vectorization_t::none), is_beta_zero, 4,
+              static_cast<int>(gemm_batch_type_t::strided)>::
+              template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda,
+                                    _stridea, _b, _ldb, _strideb, _beta, _c,
+                                    _ldc, _stridec, batch_size, _dependencies);
+        }
+      }
+#endif
+      return blas::Gemm_Launcher<
+          container_0_t, container_1_t, container_2_t, 64, false, false, false,
+          64, Tile<4, 8, 16, 8>, _t_a, _t_b, s_a, s_b,
+          static_cast<int>(gemm_memory_t::local),
+          static_cast<int>(gemm_algorithm_t::standard),
+          static_cast<int>(gemm_vectorization_t::full), is_beta_zero, 4,
+          static_cast<int>(gemm_batch_type_t::strided)>::
+          template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda,
+                                _stridea, _b, _ldb, _strideb, _beta, _c, _ldc,
+                                _stridec, batch_size, _dependencies);
+    }
+  }
+}
+
 // Complex Configurations
 #ifdef BLAS_ENABLE_COMPLEX
 template
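Both half `_gemm` overloads branch on `gemm_batch_type_t` before anything else. As a mental model, a strided batch stores whole matrices end to end, while an interleaved batch makes the batch index the fastest-moving dimension. The index arithmetic below is one common convention, not lifted from portBLAS, whose layouts are defined by its kernels:

```cpp
#include <cstddef>

// Column-major element (i, j) of batch b under the two batch layouts.
// Illustrative index math only.
inline std::size_t strided_index(std::size_t i, std::size_t j, std::size_t b,
                                 std::size_t ld, std::size_t stride) {
  return b * stride + j * ld + i;  // matrices are `stride` elements apart
}

inline std::size_t interleaved_index(std::size_t i, std::size_t j,
                                     std::size_t b, std::size_t ld,
                                     std::size_t batch_size) {
  return (j * ld + i) * batch_size + b;  // batch index varies fastest
}
```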
-typename std::enable_if<!is_complex_sycl<element_t>::value,
+typename std::enable_if<!is_complex_sycl<element_t>::value &&
+                            !is_half<element_t>::value,
                         typename sb_handle_t::event_t>::type
 _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
       element_t _alpha, container_0_t _a, index_t _lda, index_t _stridea,
@@ -175,6 +176,63 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
   }
 }
 
+// Half Configurations
+template <bool _t_a, bool _t_b, bool s_a, bool s_b, bool is_beta_zero,
+          typename sb_handle_t, typename container_0_t, typename container_1_t,
+          typename container_2_t, typename element_t, typename index_t>
+typename std::enable_if<is_half<element_t>::value,
+                        typename sb_handle_t::event_t>::type
+_gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
+      element_t _alpha, container_0_t _a, index_t _lda, index_t _stridea,
+      container_1_t _b, index_t _ldb, index_t _strideb, element_t _beta,
+      container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size,
+      gemm_batch_type_t batch_type,
+      const typename sb_handle_t::event_t& _dependencies) {
+  // Unused configuration cases
+  if constexpr (s_a && s_b || ((s_a && _t_b) || (s_b && _t_a))) {
+    return _dependencies;
+  } else {
+    if (batch_type == gemm_batch_type_t::interleaved) {
+      return blas::Gemm_Launcher<
+          container_0_t, container_1_t, container_2_t, 64, false, false, false,
+          64, Tile<2, 2, 4, 4, 1, 1, 1, 1, 4, 4, 1, 1, 1, float, float>, _t_a,
+          _t_b, s_a, s_b, static_cast<int>(gemm_memory_t::no_local),
+          static_cast<int>(gemm_algorithm_t::standard),
+          static_cast<int>(gemm_vectorization_t::full), is_beta_zero, 4,
+          static_cast<int>(gemm_batch_type_t::interleaved)>::
+          template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda,
+                                _stridea, _b, _ldb, _strideb, _beta, _c, _ldc,
+                                _stridec, batch_size, _dependencies);
+    } else {
+      if (_M <= 1024 && _N <= 1024) {
+        return blas::Gemm_Launcher<
+            container_0_t, container_1_t, container_2_t, 256, false, true, true,
+            128, Tile<4, 4, 16, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, float, float>,
+            _t_a, _t_b, s_a, s_b, static_cast<int>(gemm_memory_t::local),
+            static_cast<int>(gemm_algorithm_t::standard),
+            static_cast<int>(gemm_vectorization_t::full), is_beta_zero, 1,
+            static_cast<int>(gemm_batch_type_t::strided),
+            false>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a,
+                                          _lda, _stridea, _b, _ldb, _strideb,
+                                          _beta, _c, _ldc, _stridec, batch_size,
+                                          _dependencies);
+      } else {
+        return blas::Gemm_Launcher<
+            container_0_t, container_1_t, container_2_t, 256, false, true, true,
+            128, Tile<8, 8, 16, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, float, float>,
+            _t_a, _t_b, s_a, s_b, static_cast<int>(gemm_memory_t::local),
+            static_cast<int>(gemm_algorithm_t::standard),
+            static_cast<int>(gemm_vectorization_t::full), is_beta_zero, 1,
+            static_cast<int>(gemm_batch_type_t::strided),
+            false>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a,
+                                          _lda, _stridea, _b, _ldb, _strideb,
+                                          _beta, _c, _ldc, _stridec, batch_size,
+                                          _dependencies);
+      }
+    }
+  }
+}
+
 // Complex Configurations
 #ifdef BLAS_ENABLE_COMPLEX
 template
 #include
 #include
@@ -48,7 +48,7 @@ namespace blas {
  */
 namespace internal {
 
-// Check whether value is zero (complex & float/double)
+// Check whether value is zero (complex & half/float/double)
 template <typename T>
 inline typename std::enable_if<is_sycl_scalar<T>::value, bool>::type isZero(
     const T& value) {
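Only the comment changes in the `isZero` hunk above: the scalar overload is already type-generic, because `cl::sycl::half` is constructible from `0` and comparable with `==`. A minimal sketch of such a check, assuming the body compares against a zero cast into the element type (the hunk does not show it):

```cpp
// Sketch of a type-generic zero test that works for half, float and double.
template <typename T>
bool is_zero_sketch(const T& value) {
  // static_cast puts 0 into the element type first, so the comparison is
  // performed as half == half (or float == float, and so on).
  return value == static_cast<T>(0);
}
```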
b/src/operations/blas1_trees.hpp
index 5dfcefa7d..01b094ebc 100644
--- a/src/operations/blas1_trees.hpp
+++ b/src/operations/blas1_trees.hpp
@@ -80,7 +80,6 @@ struct DetectScalar {
   static element_t get_scalar(element_t &scalar) { return scalar; }
 };
 
-#ifdef BLAS_DATA_TYPE_HALF
 /*! DetectScalar.
  * @brief See Detect Scalar.
  */
@@ -89,7 +88,6 @@ struct DetectScalar<cl::sycl::half> {
   using element_t = cl::sycl::half;
   static element_t get_scalar(element_t &scalar) { return scalar; }
 };
-#endif  // BLAS_DATA_TYPE_HALF
 
 #ifdef BLAS_ENABLE_COMPLEX
 /*! DetectScalar (for sycl::complex)
diff --git a/src/operations/blas3/gemm_local.hpp b/src/operations/blas3/gemm_local.hpp
index 0ca182918..ff6cb9f7a 100644
--- a/src/operations/blas3/gemm_local.hpp
+++ b/src/operations/blas3/gemm_local.hpp
@@ -147,7 +147,7 @@ class Gemm
   static_assert((VectorSize == 1 && is_complex_sycl<element_t>::value) ||
-                    is_sycl_scalar<element_t>::value,
+                    !is_complex_sycl<element_t>::value,
                 "Vector size should be equal to 1 for Complex Data types");
 #endif
 
@@ -763,7 +763,7 @@ class Gemm
-              (reg_a[l], reg_b, reg_res[j * item_rows + l]);
+              mul_add(reg_a[l], reg_b, reg_res[j * item_rows + l]);
         }
       }
       A = A + ldsa;
diff --git a/src/operations/blas3/gemm_no_local_full_vec.hpp b/src/operations/blas3/gemm_no_local_full_vec.hpp
index 77cbafbbf..fc447baa2 100644
--- a/src/operations/blas3/gemm_no_local_full_vec.hpp
+++ b/src/operations/blas3/gemm_no_local_full_vec.hpp
@@ -109,7 +109,7 @@ class Gemm
   static_assert((VectorSize == 1 && is_complex_sycl<element_t>::value) ||
-                    is_sycl_scalar<element_t>::value,
+                    !is_complex_sycl<element_t>::value,
                 "Vector size should be equal to 1 for Complex Data types");
 #endif
 
@@ -857,7 +857,7 @@ class Gemm
-              (reg_a[j], reg_b[i], reg_res[i * item_rows + j]);
+              mul_add(reg_a[j], reg_b[i], reg_res[i * item_rows + j]);
         }
       }
     }
@@ -880,7 +880,7 @@ class Gemm
-      (reg_a[j], *reg_b, reg_res[j]);
+      reg_res[j] = mul_add(reg_a[j], *reg_b, reg_res[j]);
     }
   }
 
diff --git a/src/operations/blas3/gemm_no_local_partial_vec.hpp b/src/operations/blas3/gemm_no_local_partial_vec.hpp
index ba26ef67f..66b51bd9d 100644
--- a/src/operations/blas3/gemm_no_local_partial_vec.hpp
+++ b/src/operations/blas3/gemm_no_local_partial_vec.hpp
@@ -105,7 +105,7 @@ class Gemm
   static_assert((VectorSize == 1 && is_complex_sycl<element_t>::value) ||
-                    is_sycl_scalar<element_t>::value,
+                    !is_complex_sycl<element_t>::value,
                 "Vector size should be equal to 1 for Complex Data types");
 #endif
 
@@ -500,7 +500,7 @@ class Gemm
-              (reg_a[j], reg_b[i], reg_res[i * item_rows + j]);
+              mul_add(reg_a[j], reg_b[i], reg_res[i * item_rows + j]);
         }
       }
     }
diff --git a/src/operations/blas3/gemm_partial_local.hpp b/src/operations/blas3/gemm_partial_local.hpp
index a6f8bf30a..6e61ba3c4 100644
--- a/src/operations/blas3/gemm_partial_local.hpp
+++ b/src/operations/blas3/gemm_partial_local.hpp
@@ -189,8 +189,8 @@ class GemmPartial
   PORTBLAS_INLINE void eval(local_memory_t scratch,
-                             cl::sycl::nd_item<1> id) noexcept {
+                            cl::sycl::nd_item<1> id) noexcept {
     /* Pointers to the scratch memory (lhs and rhs) */
     value_t* scratch_ptr = scratch.localAcc.get_pointer();
     value_t* rhs_scratch_ptr = scratch_ptr + rhs_scratch_offset;
@@ -309,8 +309,8 @@ class GemmPartial
-          (
-              privateLhs, privateRhs, private_res[wLPTM + idx]);
+          private_res[wLPTM + idx] =
+              mul_add(privateLhs, privateRhs, private_res[wLPTM + idx]);
 
           lhs_index += tile_type::wg_rows;
         }
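All the kernel hunks above converge on the same shape: the accumulation is written as a plain `mul_add(...)` call with no explicit template arguments, so argument deduction selects the right implementation for real, half and complex element types alike, and the call sites never need to name the type. A toy overload set showing why that works (a sketch, not the library's definitions):

```cpp
#include <complex>

// Real (and half) path: the fused multiply-add shape a * b + c.
template <typename T>
T mul_add_sketch(T a, T b, T c) {
  return a * b + c;
}

// Complex path: same call syntax; partial ordering prefers this overload
// whenever the arguments are std::complex, so callers stay uniform.
template <typename T>
std::complex<T> mul_add_sketch(std::complex<T> a, std::complex<T> b,
                               std::complex<T> c) {
  return a * b + c;
}
```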
      bool, std::is_floating_point<stripped_t<value_t>>::value ||
                std::is_same<stripped_t<value_t>, cl::sycl::half>::value>;
@@ -118,7 +118,7 @@ struct AbsoluteValue {
 #else
   template <typename value_t>
   using is_floating_point = std::is_floating_point<stripped_t<value_t>>;
-#endif  // BLAS_DATA_TYPE_HALF
+#endif  // BLAS_ENABLE_HALF
 
   template <typename value_t>
   static PORTBLAS_INLINE value_t eval(
diff --git a/test/blas_test.hpp b/test/blas_test.hpp
index d159109db..5d0193518 100644
--- a/test/blas_test.hpp
+++ b/test/blas_test.hpp
@@ -120,7 +120,10 @@ template <typename scalar_t>
 static inline scalar_t random_scalar(scalar_t rangeMin, scalar_t rangeMax) {
   static std::random_device rd;
   static std::default_random_engine gen(rd());
-  std::uniform_real_distribution<scalar_t> dis(rangeMin, rangeMax);
+  using random_scalar_t =
+      std::conditional_t<std::is_same_v<scalar_t, cl::sycl::half>, float,
+                         scalar_t>;
+  std::uniform_real_distribution<random_scalar_t> dis(rangeMin, rangeMax);
   return dis(gen);
 }
 
diff --git a/test/blas_test_macros.hpp b/test/blas_test_macros.hpp
index 89e733e60..64a8e4b32 100644
--- a/test/blas_test_macros.hpp
+++ b/test/blas_test_macros.hpp
@@ -72,7 +72,7 @@
                                   combination, name_generator)
 #endif  // BLAS_DATA_TYPE_DOUBLE
 
-#ifdef BLAS_DATA_TYPE_HALF
+#ifdef BLAS_ENABLE_HALF
 /** Registers test for the cl::sycl::half type
  * @see BLAS_REGISTER_TEST_CUSTOM_NAME
  */
@@ -91,7 +91,7 @@
 #define BLAS_REGISTER_TEST_HALF_CUSTOM_NAME(test_suite, class_name,       \
                                             test_function, combination_t, \
                                             combination, name_generator)
-#endif  // BLAS_DATA_TYPE_HALF
+#endif  // BLAS_ENABLE_HALF
 
 #ifdef BLAS_ENABLE_COMPLEX
 #define BLAS_REGISTER_TEST_CPLX_S_CUSTOM_NAME(test_suite, class_name,     \
diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt
index 4cdcf7197..a871040d4 100644
--- a/test/unittest/CMakeLists.txt
+++ b/test/unittest/CMakeLists.txt
@@ -108,6 +108,12 @@ if(GEMM_TALL_SKINNY_SUPPORT)
   list(APPEND SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/blas3/blas3_gemm_tall_skinny_test.cpp)
 endif()
 
+set(HALF_DATA_OPS "blas1_axpy_test"
+                  "blas1_scal_test"
+                  "blas3_gemm_test"
+                  "blas3_gemm_batched_test"
+    )
+
 foreach(blas_test ${SYCL_UNITTEST_SRCS})
   get_filename_component(test_exec ${blas_test} NAME_WE)
   add_executable(${test_exec} main.cpp ${blas_test})
@@ -122,6 +128,9 @@ foreach(blas_test ${SYCL_UNITTEST_SRCS})
       target_compile_definitions(${test_exec} PRIVATE BLAS_ENABLE_COMPLEX=1)
     endif()
   endif()
+  if((${BLAS_ENABLE_HALF}) AND (${test_exec} IN_LIST HALF_DATA_OPS))
+    target_compile_definitions(${test_exec} PRIVATE BLAS_ENABLE_HALF=1)
+  endif()
   target_compile_definitions(${test_exec} PRIVATE -DBLAS_INDEX_T=${BLAS_TEST_INDEX_TYPE})
   target_link_libraries(${test_exec} PRIVATE gtest_main Clara::Clara blas::blas portblas)
   target_include_directories(${test_exec} PRIVATE ${CBLAS_INCLUDE} ${PORTBLAS_COMMON_INCLUDE_DIR})
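The half test sources gate on the device's fp16 aspect before doing any work, as the hunks below show. Standalone, the same query looks like this (SYCL 2020 spelling; the tests themselves use the `cl::sycl` namespace):

```cpp
#include <sycl/sycl.hpp>

#include <iostream>

int main() {
  sycl::queue q;
  // aspect::fp16 reports whether the device can run kernels that use
  // 16-bit floating point; the half tests are skipped when it is absent.
  const bool fp16 = q.get_device().has(sycl::aspect::fp16);
  std::cout << "fp16 " << (fp16 ? "supported" : "unsupported") << '\n';
}
```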
diff --git a/test/unittest/blas1/blas1_axpy_test.cpp b/test/unittest/blas1/blas1_axpy_test.cpp
index 618fa629b..2759cc6c3 100644
--- a/test/unittest/blas1/blas1_axpy_test.cpp
+++ b/test/unittest/blas1/blas1_axpy_test.cpp
@@ -47,11 +47,17 @@ void run_test(const combination_t<scalar_t> combi) {
   std::vector<scalar_t> y_v(y_size, 10.0);
   std::vector<scalar_t> y_cpu_v(y_size, 10.0);
 
+  auto q = make_queue();
+
+  if (std::is_same_v<scalar_t, cl::sycl::half> &&
+      !q.get_device().has(cl::sycl::aspect::fp16)) {
+    GTEST_SKIP() << "Unsupported fp16 (half) on this device.";
+  }
+
   // Reference implementation
   reference_blas::axpy(size, alpha, x_v.data(), incX, y_cpu_v.data(), incY);
 
   // SYCL implementation
-  auto q = make_queue();
   blas::SB_Handle sb_handle(q);
 
   // Iterators
diff --git a/test/unittest/blas1/blas1_scal_test.cpp b/test/unittest/blas1/blas1_scal_test.cpp
index 421a9090a..bf8c07240 100644
--- a/test/unittest/blas1/blas1_scal_test.cpp
+++ b/test/unittest/blas1/blas1_scal_test.cpp
@@ -40,11 +40,17 @@ void run_test(const combination_t<scalar_t> combi) {
   std::vector<scalar_t> x_v(size * incX);
   std::vector<scalar_t> x_cpu_v(x_v);
 
+  auto q = make_queue();
+
+  if (std::is_same_v<scalar_t, cl::sycl::half> &&
+      !q.get_device().has(cl::sycl::aspect::fp16)) {
+    GTEST_SKIP() << "Unsupported fp16 (half) on this device.";
+  }
+
   // Reference implementation
   reference_blas::scal(size, alpha, x_cpu_v.data(), incX);
 
   // SYCL implementation
-  auto q = make_queue();
   blas::SB_Handle sb_handle(q);
 
   // Iterators
diff --git a/test/unittest/blas3/blas3_gemm_common.hpp b/test/unittest/blas3/blas3_gemm_common.hpp
index 2cd832a99..5308d70e5 100644
--- a/test/unittest/blas3/blas3_gemm_common.hpp
+++ b/test/unittest/blas3/blas3_gemm_common.hpp
@@ -109,6 +109,11 @@ inline void verify_gemm(const gemm_arguments_t<scalar_t> arguments) {
   const char tb_str[2] = {transb, '\0'};
 
   auto q = make_queue();
+
+  if (std::is_same_v<scalar_t, cl::sycl::half> &&
+      !q.get_device().has(cl::sycl::aspect::fp16)) {
+    GTEST_SKIP() << "Unsupported fp16 (half) on this device.";
+  }
   blas::SB_Handle sb_handle(q);
 
   const index_t lda = ((transa != 'n') ? k : m) * lda_mul;
@@ -266,6 +271,12 @@ inline void verify_gemm(
   const char tb_str[2] = {transb, '\0'};
 
   auto q = make_queue();
+
+  if (std::is_same_v<scalar_t, cl::sycl::half> &&
+      !q.get_device().has(cl::sycl::aspect::fp16)) {
+    GTEST_SKIP() << "Unsupported fp16 (half) on this device.";
+  }
+
   blas::SB_Handle sb_handle(q);
 
   const index_t lda = ((transa != 'n') ? k : m) * lda_mul;