From 9d515e3c01eaab1bdf53c31234ded16157cc3ce4 Mon Sep 17 00:00:00 2001 From: pgorlani <92453485+pgorlani@users.noreply.github.com> Date: Thu, 25 Jan 2024 15:45:55 +0000 Subject: [PATCH] Introduce temporary memory pool allocations within SB_Handler (#478) This patch aims to improve performance by removing excessive memory allocation overhead and re-using the existing memory from the pool of allocations available. --- CMakeLists.txt | 2 + README.md | 1 + benchmark/portblas/CMakeLists.txt | 4 + benchmark/portblas/main.cpp | 6 + include/sb_handle/portblas_handle.h | 43 ++++++- include/sb_handle/temp_memory_pool.h | 112 +++++++++++++++++ src/interface/blas1_interface.hpp | 9 +- src/interface/blas2_interface.hpp | 108 +++++++++------- src/interface/trsm_interface.hpp | 25 ++-- src/sb_handle/portblas_handle.hpp | 89 ++++++++++--- src/sb_handle/temp_memory_pool.hpp | 118 ++++++++++++++++++ tools/auto_tuner/CMakeLists.txt | 3 + tools/auto_tuner/README.md | 11 ++ tools/auto_tuner/gen/generate_combinations.py | 2 +- tools/auto_tuner/include/gemm_tuner.hpp | 15 +-- tools/auto_tuner/include/tune.hpp | 2 +- tools/auto_tuner/include/tune_impl.hpp | 3 +- tools/auto_tuner/include/utils.hpp | 10 +- tools/auto_tuner/src/tune_all.cpp | 19 ++- tools/auto_tuner/src/tune_nn.cpp | 12 +- tools/auto_tuner/src/tune_nt.cpp | 12 +- tools/auto_tuner/src/tune_tn.cpp | 12 +- tools/auto_tuner/src/tune_tt.cpp | 12 +- 23 files changed, 521 insertions(+), 109 deletions(-) create mode 100644 include/sb_handle/temp_memory_pool.h create mode 100644 src/sb_handle/temp_memory_pool.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 09785078f..21340430c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,7 @@ endif() option(BLAS_ENABLE_CONST_INPUT "Whether to enable kernel instantiation with const input buffer" ON) option(BLAS_ENABLE_BENCHMARK "Whether to enable benchmarking" ON) option(BLAS_VERIFY_BENCHMARK "Whether to verify the results of benchmarks" ON) +option(BLAS_MEMPOOL_BENCHMARK "Whether to use the memory pool in benchmarks" OFF) option(BUILD_CLBLAST_BENCHMARKS "Whether to build clBLAST benchmarks" OFF) option(BUILD_CLBLAS_BENCHMARKS "Whether to build clBLAS benchmarks" OFF) option(BUILD_CUBLAS_BENCHMARKS "Whether to build cuBLAS benchmarks" OFF) @@ -240,6 +241,7 @@ if (BLAS_BUILD_SAMPLES) endif() option(BLAS_ENABLE_AUTO_TUNERS "Whether to enable building GEMM auto tuners" OFF) +option(BLAS_ENABLE_AUTO_TUNER_MEMPOOL "Whether to enable memory pool for GEMM auto tuners" OFF) if(${BLAS_ENABLE_AUTO_TUNERS}) # Note that the auto tuners are very slow to compile, so we avoid adding # them to the ALL target. diff --git a/README.md b/README.md index 97476e994..bba1a6bc0 100644 --- a/README.md +++ b/README.md @@ -459,6 +459,7 @@ Some of the supported options are: | `BUILD_SHARED_LIBS` | `ON`/`OFF` | Build as shared library (`ON` by default) | | `ENABLE_EXPRESSION_TESTS` | `ON`/`OFF` | Build additional tests that use the header-only framework (e.g to test expression trees); `OFF` by default | | `BLAS_VERIFY_BENCHMARK` | `ON`/`OFF` | Verify the results of the benchmarks instead of only measuring the performance. See the documentation of the benchmarks for more details. `ON` by default | +| `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` | Determines whether to enable the scratchpad memory pool for benchmark execution. `OFF` by default | | `BLAS_ENABLE_CONST_INPUT` | `ON`/`OFF` | Determines whether to enable kernel instantiation with const input buffer (`ON` by default) | | `BLAS_ENABLE_EXTENSIONS` | `ON`/`OFF` | Determines whether to enable portBLAS extensions (`ON` by default) | | `BLAS_DATA_TYPES` | `half;float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float` | diff --git a/benchmark/portblas/CMakeLists.txt b/benchmark/portblas/CMakeLists.txt index 87fc58eaf..8cb498ad6 100644 --- a/benchmark/portblas/CMakeLists.txt +++ b/benchmark/portblas/CMakeLists.txt @@ -96,6 +96,10 @@ foreach(portblas_bench ${sources}) ) target_include_directories(bench_${bench_exec} PRIVATE ${PORTBLAS_INCLUDE} ${CBLAS_INCLUDE} ${PORTBLAS_COMMON_INCLUDE_DIR}) + if(BLAS_MEMPOOL_BENCHMARK) + target_compile_definitions(bench_${bench_exec} PRIVATE BLAS_MEMPOOL_BENCHMARK) + endif() + if(BLAS_VERIFY_BENCHMARK) target_compile_definitions(bench_${bench_exec} PRIVATE BLAS_VERIFY_BENCHMARK) target_link_libraries(bench_${bench_exec} PRIVATE blas::blas) diff --git a/benchmark/portblas/main.cpp b/benchmark/portblas/main.cpp index a5a2e6813..46ed313f1 100644 --- a/benchmark/portblas/main.cpp +++ b/benchmark/portblas/main.cpp @@ -66,8 +66,14 @@ int main(int argc, char** argv) { utils::print_queue_information(q); +#ifdef BLAS_MEMPOOL_BENCHMARK + blas::Temp_Mem_Pool mp(q); + // Create a portBLAS sb_handle from the memory pool + blas::SB_Handle sb_handle(&mp); +#else // Create a portBLAS sb_handle from the queue blas::SB_Handle sb_handle(q); +#endif // This will be set to false by a failing benchmark bool success = true; diff --git a/include/sb_handle/portblas_handle.h b/include/sb_handle/portblas_handle.h index 08b9a1b61..f7104a0cc 100644 --- a/include/sb_handle/portblas_handle.h +++ b/include/sb_handle/portblas_handle.h @@ -31,6 +31,8 @@ #include "operations/blas3_trees.h" #include "operations/extension/reduction.h" #include "portblas_helper.h" +#include "temp_memory_pool.h" + namespace blas { /** SB_Handle. @@ -47,11 +49,49 @@ class SB_Handle { public: using event_t = std::vector; inline SB_Handle(queue_t q) - : q_(q), + : tempMemPool_(nullptr), + q_(q), workGroupSize_(helper::get_work_group_size(q)), localMemorySupport_(helper::has_local_memory(q)), computeUnits_(helper::get_num_compute_units(q)) {} + inline SB_Handle(Temp_Mem_Pool* tmp) + : tempMemPool_(tmp), + q_(tmp->get_queue()), + workGroupSize_(helper::get_work_group_size(q_)), + localMemorySupport_(helper::has_local_memory(q_)), + computeUnits_(helper::get_num_compute_units(q_)) {} + + template + typename std::enable_if< + alloc == helper::AllocType::buffer, + typename helper::AllocHelper::type>::type + acquire_temp_mem(size_t size); + + template + typename std::enable_if< + std::is_same::type, + helper::AllocType::buffer>::type>::value, + typename SB_Handle::event_t>::type + release_temp_mem(const typename SB_Handle::event_t&, const container_t&); + +#ifdef SB_ENABLE_USM + template + typename std::enable_if< + alloc == helper::AllocType::usm, + typename helper::AllocHelper::type>::type + acquire_temp_mem(size_t size); + + template + typename std::enable_if< + std::is_same::type, + helper::AllocType::usm>::type>::value, + typename SB_Handle::event_t>::type + release_temp_mem(const typename SB_Handle::event_t&, const container_t&); +#endif + template event_t execute(expression_tree_t tree, const event_t& dependencies = {}); @@ -151,6 +191,7 @@ class SB_Handle { const size_t workGroupSize_; const bool localMemorySupport_; const size_t computeUnits_; + Temp_Mem_Pool* tempMemPool_; }; } // namespace blas diff --git a/include/sb_handle/temp_memory_pool.h b/include/sb_handle/temp_memory_pool.h new file mode 100644 index 000000000..1cb2a5d59 --- /dev/null +++ b/include/sb_handle/temp_memory_pool.h @@ -0,0 +1,112 @@ +/*************************************************************************** + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * portBLAS: BLAS implementation using SYCL + * + * @filename temp_memory_pool.h + * + **************************************************************************/ +#ifndef TEMP_MEMORY_POOL_H +#define TEMP_MEMORY_POOL_H + +#include +#include + +namespace blas { +class Temp_Mem_Pool { + using queue_t = cl::sycl::queue; + using event_t = std::vector; + using temp_usm_map_t = std::multimap; + using temp_usm_size_map_t = std::map; + using temp_buffer_map_t = std::multimap>; + + public: + Temp_Mem_Pool(queue_t q) + : q_(q), + temp_buffer_map_tot_byte_size_(0), + temp_usm_map_tot_byte_size_(0) {} + Temp_Mem_Pool(const Temp_Mem_Pool& h) = delete; + Temp_Mem_Pool operator=(Temp_Mem_Pool) = delete; + + ~Temp_Mem_Pool() { + // Wait for the completion of all the host tasks + q_.wait(); + +#ifdef VERBOSE + std::cout << "# buffers destroyed on memory pool destruction: " + << temp_buffer_map_.size() << " (" + << temp_buffer_map_tot_byte_size_ << " bytes)" << std::endl; +#endif + +#ifdef SB_ENABLE_USM +#ifdef VERBOSE + std::cout << "# USM allocations freed on memory pool destruction: " + << temp_usm_map_.size() << " (" << temp_usm_map_tot_byte_size_ + << " bytes)" << std::endl; +#endif + for (const temp_usm_map_t::value_type& p : temp_usm_map_) + cl::sycl::free(p.second, q_); +#endif + } + + inline queue_t get_queue() const { return q_; } + + template + typename helper::AllocHelper::type + acquire_buff_mem(size_t size); + + template + typename Temp_Mem_Pool::event_t release_buff_mem( + const typename Temp_Mem_Pool::event_t&, const container_t&); + +#ifdef SB_ENABLE_USM + template + typename helper::AllocHelper::type + acquire_usm_mem(size_t size); + + template + typename Temp_Mem_Pool::event_t release_usm_mem( + const typename Temp_Mem_Pool::event_t&, const container_t&); +#endif + + private: + static_assert(sizeof(temp_buffer_map_t::mapped_type::value_type) == 1); + + static constexpr size_t max_size_temp_mem_ = 1e9; + queue_t q_; + + std::mutex temp_buffer_map_mutex_; + size_t temp_buffer_map_tot_byte_size_; + temp_buffer_map_t temp_buffer_map_; + + template + void release_usm_mem_(const container_t& mem); + +#ifdef SB_ENABLE_USM + std::mutex temp_usm_map_mutex_; + size_t temp_usm_map_tot_byte_size_; + temp_usm_map_t temp_usm_map_; + temp_usm_size_map_t temp_usm_size_map_; + + template + void release_buff_mem_(const container_t& mem); +#endif +}; +} // namespace blas +#endif diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 16c0c5741..059b9cc27 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -321,9 +321,10 @@ typename sb_handle_t::event_t _iamax_iamin_impl( localMemSize == 0 ? _nWG * (static_cast(localSize) / min_sg_size) : _nWG; - auto gpu_res = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - tuple_t > (memory_size, q); + auto gpu_res = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + tuple_t > (memory_size); auto gpu_res_vec = make_vector_view(gpu_res, static_cast(1), memory_size); auto step0 = make_index_max_min(gpu_res_vec, tupOp); @@ -355,7 +356,7 @@ typename sb_handle_t::event_t _iamax_iamin_impl( static_cast(localSize), static_cast(localMemSize), ret)); } - blas::helper::enqueue_deallocate(ret, gpu_res, q); + sb_handle.template release_temp_mem({*ret.rbegin()}, gpu_res); } return ret; } diff --git a/src/interface/blas2_interface.hpp b/src/interface/blas2_interface.hpp index 96340af0b..ac07cf499 100644 --- a/src/interface/blas2_interface.hpp +++ b/src/interface/blas2_interface.hpp @@ -87,16 +87,17 @@ typename sb_handle_t::event_t _gemv_impl( constexpr bool is_usm = std::is_pointer::value; typename sb_handle_t::event_t ret; + typename sb_handle_t::event_t lastEvent; // Non-local memory kernel if (memory_type != gemv_memory_t::local) { // Leading dimension for dot products matrix const auto ld = is_transposed ? _N : _M; constexpr index_t one = 1; - auto dot_products_buffer = blas::helper::allocate < is_usm + auto dot_products_buffer = sb_handle.template acquire_temp_mem < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, - element_t > (ld, sb_handle.get_queue()); + element_t > (ld); auto dot_products_matrix = make_matrix_view(dot_products_buffer, ld, one, ld); @@ -126,18 +127,20 @@ typename sb_handle_t::event_t _gemv_impl( // exectutes the above expression tree to yield the final GEMV result ret = concatenate_vectors( - gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); + gemvEvent, + lastEvent = sb_handle.execute(assignOp, local_range, gemvEvent)); } else { auto alphaMulDotsOp = make_op(_alpha, dot_products_matrix); auto assignOp = make_op(vy, alphaMulDotsOp); ret = concatenate_vectors( - gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); + gemvEvent, + lastEvent = sb_handle.execute(assignOp, local_range, gemvEvent)); } - blas::helper::enqueue_deallocate(ret, dot_products_buffer, - sb_handle.get_queue()); + sb_handle.template release_temp_mem(lastEvent, dot_products_buffer); + } else // Local memory kernel { // Calculate number of work groups per each dimension based on the local @@ -159,10 +162,10 @@ typename sb_handle_t::event_t _gemv_impl( const auto dot_products_buffer_size = ld * WGs_per_C; // Create the dot products buffer and matrix view - auto dot_products_buffer = blas::helper::allocate < is_usm + auto dot_products_buffer = sb_handle.template acquire_temp_mem < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, - element_t > (dot_products_buffer_size, sb_handle.get_queue()); + element_t > (dot_products_buffer_size); auto dot_products_matrix = make_matrix_view(dot_products_buffer, ld, WGs_per_C, ld); @@ -196,17 +199,18 @@ typename sb_handle_t::event_t _gemv_impl( // exectutes the above expression tree to yield the final GEMV result ret = concatenate_vectors( - gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); + gemvEvent, + lastEvent = sb_handle.execute(assignOp, local_range, gemvEvent)); } else { auto alphaMulDotsOp = make_op(_alpha, sumColsOp); auto assignOp = make_op(vy, alphaMulDotsOp); ret = concatenate_vectors( - gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); + gemvEvent, + lastEvent = sb_handle.execute(assignOp, local_range, gemvEvent)); } - blas::helper::enqueue_deallocate(ret, dot_products_buffer, - sb_handle.get_queue()); + sb_handle.template release_temp_mem(lastEvent, dot_products_buffer); } return ret; } @@ -263,9 +267,10 @@ typename sb_handle_t::event_t _trmv_impl( using element_t = typename ValueType::type; constexpr bool is_usm = std::is_pointer::value; - auto valT1 = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (N * scratchSize, sb_handle.get_queue()); + auto valT1 = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (N * scratchSize); auto mat1 = make_matrix_view(valT1, N, scratchSize, scratchSize); if (data_layout_t::is_col_major()) { @@ -330,10 +335,12 @@ typename sb_handle_t::event_t _trmv_impl( } } auto addMOp = make_sum_matrix_columns(mat1); + typename sb_handle_t::event_t lastEvent; auto assignOp = make_op(vx, addMOp); - ret = concatenate_vectors(ret, sb_handle.execute(assignOp, localSize, ret)); + ret = concatenate_vectors( + ret, lastEvent = sb_handle.execute(assignOp, localSize, ret)); - blas::helper::enqueue_deallocate(ret, valT1, sb_handle.get_queue()); + sb_handle.template release_temp_mem(lastEvent, valT1); return ret; } @@ -373,10 +380,10 @@ typename sb_handle_t::event_t _trsv_impl( auto queue = sb_handle.get_queue(); constexpr bool is_usm = std::is_pointer::value; - auto sync_buffer = blas::helper::allocate < is_usm + auto sync_buffer = sb_handle.template acquire_temp_mem < is_usm ? blas::helper::AllocType::usm : blas::helper::AllocType::buffer, - int32_t > (sync_vec.size(), queue); + int32_t > (sync_vec.size()); auto copy_sync = blas::helper::copy_to_device( queue, sync_vec.data(), sync_buffer, sync_vec.size()); sb_handle.wait(copy_sync); @@ -395,7 +402,7 @@ typename sb_handle_t::event_t _trsv_impl( static_cast(subgroup_size * (subgroup_size + 2 + sub_num)), _dependencies); - blas::helper::enqueue_deallocate(ret, sync_buffer, queue); + sb_handle.template release_temp_mem(ret, sync_buffer); return ret; #endif @@ -465,17 +472,19 @@ typename sb_handle_t::event_t _symv_impl( ((scratchPadSize == 0) ? std::min(N, localSize) : 1) * nWGPerCol_R; constexpr bool is_usm = std::is_pointer::value; - auto valTR = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (N * scratchSize_R, sb_handle.get_queue()); + auto valTR = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (N * scratchSize_R); auto matR = make_matrix_view(valTR, N, scratchSize_R, scratchSize_R); const index_t scratchSize_C = nWGPerCol_C; - auto valTC = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (N * scratchSize_C, sb_handle.get_queue()); + auto valTC = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (N * scratchSize_C); auto matC = make_matrix_view(valTC, N, scratchSize_C, scratchSize_C); @@ -510,10 +519,13 @@ typename sb_handle_t::event_t _symv_impl( auto scalOp2 = make_op(_alpha, addMOp); auto addOp = make_op(scalOp1, scalOp2); auto assignOp = make_op(vy, addOp); - ret = concatenate_vectors(ret, sb_handle.execute(assignOp, localSize, ret)); - blas::helper::enqueue_deallocate(ret, valTR, sb_handle.get_queue()); - blas::helper::enqueue_deallocate(ret, valTC, sb_handle.get_queue()); + typename sb_handle_t::event_t lastEvent; + ret = concatenate_vectors( + ret, lastEvent = sb_handle.execute(assignOp, localSize, ret)); + + sb_handle.template release_temp_mem(lastEvent, valTR); + sb_handle.template release_temp_mem(lastEvent, valTC); return ret; } @@ -646,9 +658,10 @@ typename sb_handle_t::event_t _tbmv_impl( constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; auto x_vector_size = _N; - auto res_buffer = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (x_vector_size, sb_handle.get_queue()); + auto res_buffer = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (x_vector_size); typename MatrixViewType::type mA = make_matrix_view(_mA, _K + 1, _N, _lda); @@ -666,7 +679,7 @@ typename sb_handle_t::event_t _tbmv_impl( auto assignEvent = sb_handle.execute(assignOp, local_range, tbmvEvent); auto ret = concatenate_vectors(tbmvEvent, assignEvent); - blas::helper::enqueue_deallocate(ret, res_buffer, sb_handle.get_queue()); + sb_handle.template release_temp_mem(assignEvent, res_buffer); return ret; } @@ -692,9 +705,10 @@ typename sb_handle_t::event_t _tpmv_impl( using element_t = typename ValueType::type; constexpr bool is_usm = std::is_pointer::value; - auto res_buffer = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (vector_size, sb_handle.get_queue()); + auto res_buffer = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (vector_size); typename MatrixViewType::type mA = make_matrix_view(_mA, one, matrix_size, matrix_size); @@ -716,10 +730,11 @@ typename sb_handle_t::event_t _tpmv_impl( _dependencies); auto assignOp = make_op(vx, vres); - auto ret = - concatenate_vectors(tpmvEvent, sb_handle.execute(assignOp, tpmvEvent)); + typename sb_handle_t::event_t lastEvent; + auto ret = concatenate_vectors( + tpmvEvent, lastEvent = sb_handle.execute(assignOp, tpmvEvent)); - blas::helper::enqueue_deallocate(ret, res_buffer, sb_handle.get_queue()); + sb_handle.template release_temp_mem(lastEvent, res_buffer); return ret; } @@ -761,10 +776,10 @@ typename sb_handle_t::event_t _tbsv_impl( constexpr bool is_usm = std::is_pointer::value; auto queue = sb_handle.get_queue(); - auto sync_buffer = blas::helper::allocate < is_usm + auto sync_buffer = sb_handle.template acquire_temp_mem < is_usm ? blas::helper::AllocType::usm : blas::helper::AllocType::buffer, - int32_t > (sync_vec.size(), queue); + int32_t > (sync_vec.size()); auto copy_sync = blas::helper::copy_to_device( queue, sync_vec.data(), sync_buffer, sync_vec.size()); sb_handle.wait(copy_sync); @@ -782,7 +797,7 @@ typename sb_handle_t::event_t _tbsv_impl( static_cast(subgroup_size * (subgroup_size + 2 + sub_num)), _dependencies); - blas::helper::enqueue_deallocate(ret, sync_buffer, queue); + sb_handle.template release_temp_mem(ret, sync_buffer); return ret; #endif @@ -825,11 +840,10 @@ typename sb_handle_t::event_t _tpsv_impl( constexpr bool is_usm = std::is_pointer::value; auto queue = sb_handle.get_queue(); - auto sync_buffer = blas::helper::allocate < is_usm + auto sync_buffer = sb_handle.template acquire_temp_mem < is_usm ? blas::helper::AllocType::usm : blas::helper::AllocType::buffer, - int32_t > (sync_vec.size(), queue); - + int32_t > (sync_vec.size()); auto copy_sync = blas::helper::copy_to_device( queue, sync_vec.data(), sync_buffer, sync_vec.size()); sb_handle.wait(copy_sync); @@ -847,7 +861,9 @@ typename sb_handle_t::event_t _tpsv_impl( roundUp(sub_num * _N, sub_num * subgroup_size), static_cast(subgroup_size * (subgroup_size + 2 + sub_num)), _dependencies); - blas::helper::enqueue_deallocate(ret, sync_buffer, queue); + + sb_handle.template release_temp_mem(ret, sync_buffer); + return ret; #endif } diff --git a/src/interface/trsm_interface.hpp b/src/interface/trsm_interface.hpp index 75996cb1d..1f5b4cb55 100644 --- a/src/interface/trsm_interface.hpp +++ b/src/interface/trsm_interface.hpp @@ -147,9 +147,10 @@ typename sb_handle_t::event_t _trsm( // filled with zeroes const index_t invASize = roundUp(K, blockSize) * blockSize; constexpr bool is_usm = std::is_pointer::value; - auto invA = helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (invASize, sb_handle.get_queue()); + auto invA = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (invASize); typename sb_handle_t::event_t event = {blas::helper::fill( sb_handle.get_queue(), invA, element_t{0}, invASize, _dependencies)}; trsmEvents = concatenate_vectors(trsmEvents, event); @@ -197,9 +198,10 @@ typename sb_handle_t::event_t _trsm( // output X will hold the TRSM result and will be copied to B at the end const index_t BSize = ldb * (N - 1) + M; const index_t ldx = ldb; - auto X = helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (BSize, sb_handle.get_queue()); + auto X = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (BSize); trsmEvents = concatenate_vectors( trsmEvents, internal::_copy( @@ -380,14 +382,15 @@ typename sb_handle_t::event_t _trsm( } // Copy bufferX to bufferB as the TRSM result + typename sb_handle_t::event_t lastEvent; trsmEvents = concatenate_vectors( - trsmEvents, - internal::_copy( - sb_handle, BSize, X, 1, B, 1, trsmEvents)); + trsmEvents, lastEvent = internal::_copy( + sb_handle, BSize, X, 1, B, 1, trsmEvents)); - helper::enqueue_deallocate(trsmEvents, invA, sb_handle.get_queue()); + sb_handle.template release_temp_mem(lastEvent, invA); - helper::enqueue_deallocate(trsmEvents, X, sb_handle.get_queue()); + sb_handle.template release_temp_mem(lastEvent, X); return trsmEvents; } diff --git a/src/sb_handle/portblas_handle.hpp b/src/sb_handle/portblas_handle.hpp index 2ad56f56f..c03b7b277 100644 --- a/src/sb_handle/portblas_handle.hpp +++ b/src/sb_handle/portblas_handle.hpp @@ -32,13 +32,70 @@ #include "operations/blas1_trees.hpp" #include "operations/blas2_trees.hpp" #include "operations/blas_operators.hpp" +#include "portblas_helper.h" #include "sb_handle/kernel_constructor.h" #include "sb_handle/portblas_handle.h" -#include "portblas_helper.h" +#include "sb_handle/temp_memory_pool.hpp" #include "views/view.h" - namespace blas { +template +typename std::enable_if< + alloc == helper::AllocType::buffer, + typename helper::AllocHelper::type>::type +SB_Handle::acquire_temp_mem(size_t size) { + if (tempMemPool_ != nullptr) + return tempMemPool_->acquire_buff_mem(size); + else + return make_sycl_iterator_buffer(size); +} + +template +typename std::enable_if< + std::is_same::type, + helper::AllocType::buffer>::type>::value, + typename SB_Handle::event_t>::type +SB_Handle::release_temp_mem(const typename SB_Handle::event_t& dependencies, + const container_t& mem) { + if (tempMemPool_ != nullptr) + return tempMemPool_->release_buff_mem(dependencies, mem); + else + return {}; +} + +#ifdef SB_ENABLE_USM +template +typename std::enable_if< + alloc == helper::AllocType::usm, + typename helper::AllocHelper::type>::type +SB_Handle::acquire_temp_mem(size_t size) { + if (tempMemPool_ != nullptr) + return tempMemPool_->acquire_usm_mem(size); + else + return cl::sycl::malloc_device(size, q_); +} + +template +typename std::enable_if< + std::is_same::type, + helper::AllocType::usm>::type>::value, + typename SB_Handle::event_t>::type +SB_Handle::release_temp_mem(const typename SB_Handle::event_t& dependencies, + const container_t& mem) { + if (tempMemPool_ != nullptr) + return tempMemPool_->release_usm_mem(dependencies, mem); + else { + cl::sycl::context context = q_.get_context(); + return {q_.submit([&](cl::sycl::handler& cgh) { + cgh.depends_on(dependencies); + cgh.host_task([=]() { cl::sycl::free(mem, context); }); + })}; + } +} +#endif + /*! * @brief Executes the tree without defining required shared memory. */ @@ -114,12 +171,12 @@ inline typename SB_Handle::event_t SB_Handle::execute( // Two accessors to local memory auto sharedSize = ((nWG < localSize) ? localSize : nWG); constexpr bool is_usm = std::is_pointer::value; - auto shMem1 = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - typename lhs_t::value_t > (sharedSize, q_); - auto shMem2 = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - typename lhs_t::value_t > (sharedSize, q_); + auto shMem1 = acquire_temp_mem < is_usm ? helper::AllocType::usm + : helper::AllocType::buffer, + typename lhs_t::value_t > (sharedSize); + auto shMem2 = acquire_temp_mem < is_usm ? helper::AllocType::usm + : helper::AllocType::buffer, + typename lhs_t::value_t > (sharedSize); auto opShMem1 = make_vector_view(shMem1, typename lhs_t::increment_t(1), sharedSize); @@ -150,9 +207,9 @@ inline typename SB_Handle::event_t SB_Handle::execute( even = !even; } while (_N > 1); - blas::helper::enqueue_deallocate(event, shMem1, q_); + release_temp_mem({*event.rbegin()}, shMem1); - blas::helper::enqueue_deallocate(event, shMem2, q_); + release_temp_mem({*event.rbegin()}, shMem2); return event; } @@ -273,9 +330,9 @@ inline typename SB_Handle::event_t SB_Handle::execute( /* First step: partial gemm */ /* Create the cube buffer that will hold the output of the partial gemm */ - auto cube_buffer = helper::allocate < is_usm ? helper::AllocType::usm + auto cube_buffer = acquire_temp_mem < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, - element_t > (rows * cols * depth, q_); + element_t > (rows * cols * depth); /* Create a first matrix view used for the partial gemm */ auto cube_gemm = @@ -309,9 +366,9 @@ inline typename SB_Handle::event_t SB_Handle::execute( /* Otherwise we reduce to a temporary buffer */ else { /* Create a temporary buffer to hold alpha * A * B */ - auto temp_buffer = helper::allocate < is_usm ? helper::AllocType::usm + auto temp_buffer = acquire_temp_mem < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, - element_t > (rows * cols, q_); + element_t > (rows * cols); auto temp = make_matrix_view(temp_buffer, rows, cols, rows); /* Execute the reduction */ @@ -333,10 +390,10 @@ inline typename SB_Handle::event_t SB_Handle::execute( events = concatenate_vectors(events, execute(assignOp, events)); } - helper::enqueue_deallocate(events, temp_buffer, q_); + release_temp_mem(events, temp_buffer); } - helper::enqueue_deallocate(events, cube_buffer, q_); + release_temp_mem(events, cube_buffer); return events; } diff --git a/src/sb_handle/temp_memory_pool.hpp b/src/sb_handle/temp_memory_pool.hpp new file mode 100644 index 000000000..4e3a68c43 --- /dev/null +++ b/src/sb_handle/temp_memory_pool.hpp @@ -0,0 +1,118 @@ +#ifndef TEMP_MEMORY_POOL_HPP +#define TEMP_MEMORY_POOL_HPP + +#include "portblas_helper.h" + +namespace blas { +template +typename helper::AllocHelper::type +Temp_Mem_Pool::acquire_buff_mem(size_t size) { + const size_t pad = sizeof(double) / sizeof(value_t); + // Adjust the requested size in order to reinterpret for double's + size += (pad - size % pad); + const size_t byteSize = size * sizeof(value_t); + temp_buffer_map_mutex_.lock(); // lock + auto found = temp_buffer_map_.lower_bound(byteSize); + if (found != temp_buffer_map_.end()) { + cl::sycl::buffer buff = + found->second; + temp_buffer_map_tot_byte_size_ -= found->first; + temp_buffer_map_.erase(found); + temp_buffer_map_mutex_.unlock(); // unlock + return blas::BufferIterator{buff.reinterpret( + cl::sycl::range<1>(buff.byte_size() / sizeof(value_t)))}; + } else { + temp_buffer_map_mutex_.unlock(); // unlock +#ifdef VERBOSE + std::cout << "Create a temporary buffer of " << byteSize << " bytes." + << std::endl; +#endif + return make_sycl_iterator_buffer(size); + } +} + +template +void Temp_Mem_Pool::release_buff_mem_(const container_t& mem) { + const size_t byteSize = mem.get_buffer().byte_size(); + auto rebuff = + mem.get_buffer() + .template reinterpret( + cl::sycl::range<1>( + byteSize / + sizeof(temp_buffer_map_t::mapped_type::value_type))); + temp_buffer_map_mutex_.lock(); // lock + if (temp_buffer_map_tot_byte_size_ + byteSize <= max_size_temp_mem_) { + temp_buffer_map_tot_byte_size_ += byteSize; + temp_buffer_map_.emplace(byteSize, rebuff); + } + temp_buffer_map_mutex_.unlock(); // unlock +} + +template +typename Temp_Mem_Pool::event_t Temp_Mem_Pool::release_buff_mem( + const typename Temp_Mem_Pool::event_t& dependencies, + const container_t& mem) { + return {q_.submit([&](cl::sycl::handler& cgh) { + cgh.depends_on(dependencies); + cgh.host_task([&, mem]() { release_buff_mem_(mem); }); + })}; +} + +#ifdef SB_ENABLE_USM +template +typename helper::AllocHelper::type +Temp_Mem_Pool::acquire_usm_mem(size_t size) { + const size_t byteSize = size * sizeof(value_t); + temp_usm_map_mutex_.lock(); // lock + auto found = temp_usm_map_.lower_bound(byteSize); + if (found != temp_usm_map_.end()) { + temp_usm_map_tot_byte_size_ -= found->first; + value_t* tmp = reinterpret_cast(found->second); + temp_usm_map_.erase(found); + temp_usm_map_mutex_.unlock(); // unlock + return tmp; + } else { + temp_usm_map_mutex_.unlock(); // unlock +#ifdef VERBOSE + std::cout << "Create a temporary USM allocation of " << byteSize + << " bytes." << std::endl; +#endif + value_t* tmp = cl::sycl::malloc_device(size, q_); + temp_usm_map_mutex_.lock(); // lock + temp_usm_size_map_.emplace( + reinterpret_cast(tmp), byteSize); + temp_usm_map_mutex_.unlock(); // unlock + return tmp; + } +} + +template +void Temp_Mem_Pool::release_usm_mem_(const container_t& mem) { + temp_usm_map_mutex_.lock(); // lock + auto found = temp_usm_size_map_.find( + reinterpret_cast(mem)); + const size_t byteSize = found->second; + if (temp_usm_map_tot_byte_size_ + byteSize > max_size_temp_mem_) { + temp_usm_size_map_.erase(found); + temp_usm_map_mutex_.unlock(); // unlock + cl::sycl::free(mem, q_); + } else { + temp_usm_map_tot_byte_size_ += byteSize; + temp_usm_map_.emplace(byteSize, + reinterpret_cast(mem)); + temp_usm_map_mutex_.unlock(); // unlock + } +} + +template +typename Temp_Mem_Pool::event_t Temp_Mem_Pool::release_usm_mem( + const typename Temp_Mem_Pool::event_t& dependencies, + const container_t& mem) { + return {q_.submit([&](cl::sycl::handler& cgh) { + cgh.depends_on(dependencies); + cgh.host_task([&, mem]() { release_usm_mem_(mem); }); + })}; +} +} +#endif +#endif diff --git a/tools/auto_tuner/CMakeLists.txt b/tools/auto_tuner/CMakeLists.txt index 234ad9ae2..bd39ad9f3 100644 --- a/tools/auto_tuner/CMakeLists.txt +++ b/tools/auto_tuner/CMakeLists.txt @@ -117,6 +117,9 @@ foreach(blas_tuner ${SYCL_AUTO_TUNNER_SRCS}) include/ ${CMAKE_CURRENT_BINARY_DIR} ) + if(BLAS_ENABLE_AUTO_TUNER_MEMPOOL) + target_compile_definitions(${tuner_exec} PRIVATE BLAS_ENABLE_AUTO_TUNER_MEMPOOL) + endif() add_dependencies(${tuner_exec} tuner_generate_def_file) if(is_dpcpp) target_link_libraries(${tuner_exec} PRIVATE DPCPP::DPCPP) diff --git a/tools/auto_tuner/README.md b/tools/auto_tuner/README.md index 35633ec88..08155c52a 100644 --- a/tools/auto_tuner/README.md +++ b/tools/auto_tuner/README.md @@ -21,6 +21,17 @@ $ ninja See the Setup section in this repository's main readme for more details. +Make options +------------ + +CMake options are given using `-D` immediately followed by the option name, the +symbol `=` and a value (`ON` and `OFF` can be used for boolean options and are +equivalent to 1 and 0). Example: `-DBLAS_ENABLE_TESTING=OFF` + +| name | value | description | +|---|---|---| +| `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` | Enable the scratchpad memory pool, useful just in case of tall skinny matrices. `OFF` by default | + Usage ----- diff --git a/tools/auto_tuner/gen/generate_combinations.py b/tools/auto_tuner/gen/generate_combinations.py index 4c10f250b..083d84bde 100644 --- a/tools/auto_tuner/gen/generate_combinations.py +++ b/tools/auto_tuner/gen/generate_combinations.py @@ -393,7 +393,7 @@ def write_source_files(config_list, config_source, output_dir): #define INSTANTIATE_TUNE(DTYPE, TRA, TRB, MEM, ALGO, BATCH, VEC, ...) \ template TestResultEntry \ tune<__VA_ARGS__, GemmConfig, DTYPE>( \ - int r, GemmArgs a); + portblas_handle_t &sb_handle, int r, GemmArgs a); #define BENCH_PARAMS(MEM, ALGO, BATCH, VEC, ...) \ INSTANTIATE_TUNE(float, false, false, MEM, ALGO, BATCH, VEC, __VA_ARGS__) \ diff --git a/tools/auto_tuner/include/gemm_tuner.hpp b/tools/auto_tuner/include/gemm_tuner.hpp index b3e67b702..e3ad3e391 100644 --- a/tools/auto_tuner/include/gemm_tuner.hpp +++ b/tools/auto_tuner/include/gemm_tuner.hpp @@ -67,11 +67,10 @@ inline std::vector interleaved_to_strided( } template -static TestResultEntry tune_portblas(int r, char transA, char transB, - GemmArgs a, +static TestResultEntry tune_portblas(portblas_handle_t &sb_handle, int r, + char transA, char transB, GemmArgs a, ::blas::gemm_batch_type_t batch_type) { TestResultEntry result("portBLAS gemm"); - auto sb_handle = get_portblas_handle(); { auto event = blas::helper::copy_to_device( sb_handle.get_queue(), a.init_c.data(), a.c, a.init_c.size()); @@ -98,7 +97,8 @@ static TestResultEntry tune_portblas(int r, char transA, char transB, } template -void run_tune_gemm(int seed, int m, int k, int n, int batch_size, int rep, +void run_tune_gemm(portblas_handle_t &sb_handle, int seed, int m, int k, int n, + int batch_size, int rep, ::blas::gemm_batch_type_t batch_type) { std::cout << std::scientific; @@ -157,7 +157,8 @@ void run_tune_gemm(int seed, int m, int k, int n, int batch_size, int rep, device_c, result_c, ldc, batch_size, expected_c}; { - auto result = tune_portblas(rep, *ta_str, *tb_str, args, batch_type); + auto result = + tune_portblas(sb_handle, rep, *ta_str, *tb_str, args, batch_type); results.push_back(result); } @@ -165,7 +166,7 @@ void run_tune_gemm(int seed, int m, int k, int n, int batch_size, int rep, do { \ auto result = \ tune<__VA_ARGS__, GemmConfig, \ - DataType>(rep, args); \ + DataType>(sb_handle, rep, args); \ results.push_back(result); \ } while (0); @@ -173,7 +174,7 @@ void run_tune_gemm(int seed, int m, int k, int n, int batch_size, int rep, #undef BENCH_PARAMS std::cout << "SIZE : " << results.size() << std::endl; - get_portblas_handle().wait(); + sb_handle.wait(); std::sort(results.begin(), results.end()); results.print_all(); } diff --git a/tools/auto_tuner/include/tune.hpp b/tools/auto_tuner/include/tune.hpp index ea6e75ea8..723e03284 100644 --- a/tools/auto_tuner/include/tune.hpp +++ b/tools/auto_tuner/include/tune.hpp @@ -30,6 +30,6 @@ template -TestResultEntry tune(int r, GemmArgs a); +TestResultEntry tune(portblas_handle_t &sb_handle, int r, GemmArgs a); #endif // PORTBLAS_TOOLS_AUTO_TUNER_TUNE_HPP_ diff --git a/tools/auto_tuner/include/tune_impl.hpp b/tools/auto_tuner/include/tune_impl.hpp index bd2f15685..f456ff9df 100644 --- a/tools/auto_tuner/include/tune_impl.hpp +++ b/tools/auto_tuner/include/tune_impl.hpp @@ -33,7 +33,7 @@ template -TestResultEntry tune(int r, GemmArgs a) { +TestResultEntry tune(portblas_handle_t &sb_handle, int r, GemmArgs a) { using Gemm = ::blas::Gemm< MatrixContainer, MatrixContainer, DoubleBuffer, Nbca, Nbcb, Cls, Tile, Config::TransA, Config::TransB, Config::SymmA, Config::SymmB, T, @@ -41,7 +41,6 @@ TestResultEntry tune(int r, GemmArgs a) { static_cast(Config::ShapeMode), static_cast(Config::VecType), VecSize, static_cast(Config::BatchType)>; TestResultEntry result(Gemm::get_type_string()); - auto sb_handle = get_portblas_handle(); { { auto event = blas::helper::copy_to_device( diff --git a/tools/auto_tuner/include/utils.hpp b/tools/auto_tuner/include/utils.hpp index a7d0e25c9..4bcbebc04 100644 --- a/tools/auto_tuner/include/utils.hpp +++ b/tools/auto_tuner/include/utils.hpp @@ -34,7 +34,7 @@ #include #include -inline portblas_handle_t make_portblas_handle() { +inline cl::sycl::queue make_sycl_queue() { cl::sycl::queue q( [=](cl::sycl::exception_list ex_list) { try { @@ -50,13 +50,7 @@ inline portblas_handle_t make_portblas_handle() { << q.get_device().get_info() << std::endl; - portblas_handle_t sb_handle(q); - return sb_handle; -} - -inline portblas_handle_t &get_portblas_handle() { - static portblas_handle_t sb_handle = make_portblas_handle(); - return sb_handle; + return q; } template diff --git a/tools/auto_tuner/src/tune_all.cpp b/tools/auto_tuner/src/tune_all.cpp index 499f8235b..1e32ac595 100644 --- a/tools/auto_tuner/src/tune_all.cpp +++ b/tools/auto_tuner/src/tune_all.cpp @@ -53,15 +53,26 @@ int main(int argc, char *argv[]) { return -1; } } + +#ifdef BLAS_ENABLE_AUTO_TUNER_MEMPOOL + Temp_Mem_Pool mem_pool(make_sycl_queue()); + portblas_handle_t sb_handle(&mem_pool); +#else + portblas_handle_t sb_handle(make_sycl_queue()); +#endif + std::cout << "======= testing nn ======" << std::endl; - run_tune_gemm(seed, m, k, n, batch_size, rep, + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, rep, batch_type); std::cout << "======= testing nt ======" << std::endl; - run_tune_gemm(seed, m, k, n, batch_size, rep, batch_type); + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, rep, + batch_type); std::cout << "======= testing tn ======" << std::endl; - run_tune_gemm(seed, m, k, n, batch_size, rep, batch_type); + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, rep, + batch_type); std::cout << "======= testing tt ======" << std::endl; - run_tune_gemm(seed, m, k, n, batch_size, rep, batch_type); + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, rep, + batch_type); return 0; } diff --git a/tools/auto_tuner/src/tune_nn.cpp b/tools/auto_tuner/src/tune_nn.cpp index 36265a9d3..5b40bf296 100644 --- a/tools/auto_tuner/src/tune_nn.cpp +++ b/tools/auto_tuner/src/tune_nn.cpp @@ -56,8 +56,16 @@ int main(int argc, char *argv[]) { return -1; } } - run_tune_gemm(seed, m, k, n, batch_size, rep, - batch_type); + +#ifdef BLAS_ENABLE_AUTO_TUNER_MEMPOOL + Temp_Mem_Pool mem_pool(make_sycl_queue()); + portblas_handle_t sb_handle(&mem_pool); +#else + portblas_handle_t sb_handle(make_sycl_queue()); +#endif + + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, + rep, batch_type); return 0; } diff --git a/tools/auto_tuner/src/tune_nt.cpp b/tools/auto_tuner/src/tune_nt.cpp index 7fc3a0b14..c98eb3caf 100644 --- a/tools/auto_tuner/src/tune_nt.cpp +++ b/tools/auto_tuner/src/tune_nt.cpp @@ -56,8 +56,16 @@ int main(int argc, char *argv[]) { return -1; } } - run_tune_gemm(seed, m, k, n, batch_size, rep, - batch_type); + +#ifdef BLAS_ENABLE_AUTO_TUNER_MEMPOOL + Temp_Mem_Pool mem_pool(make_sycl_queue()); + portblas_handle_t sb_handle(&mem_pool); +#else + portblas_handle_t sb_handle(make_sycl_queue()); +#endif + + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, + rep, batch_type); return 0; } diff --git a/tools/auto_tuner/src/tune_tn.cpp b/tools/auto_tuner/src/tune_tn.cpp index d19845339..811ea7b04 100644 --- a/tools/auto_tuner/src/tune_tn.cpp +++ b/tools/auto_tuner/src/tune_tn.cpp @@ -56,8 +56,16 @@ int main(int argc, char *argv[]) { return -1; } } - run_tune_gemm(seed, m, k, n, batch_size, rep, - batch_type); + +#ifdef BLAS_ENABLE_AUTO_TUNER_MEMPOOL + Temp_Mem_Pool mem_pool(make_sycl_queue()); + portblas_handle_t sb_handle(&mem_pool); +#else + portblas_handle_t sb_handle(make_sycl_queue()); +#endif + + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, + rep, batch_type); return 0; } diff --git a/tools/auto_tuner/src/tune_tt.cpp b/tools/auto_tuner/src/tune_tt.cpp index 245878c42..5fe2b7290 100644 --- a/tools/auto_tuner/src/tune_tt.cpp +++ b/tools/auto_tuner/src/tune_tt.cpp @@ -56,8 +56,16 @@ int main(int argc, char *argv[]) { return -1; } } - run_tune_gemm(seed, m, k, n, batch_size, rep, - batch_type); + +#ifdef BLAS_ENABLE_AUTO_TUNER_MEMPOOL + Temp_Mem_Pool mem_pool(make_sycl_queue()); + portblas_handle_t sb_handle(&mem_pool); +#else + portblas_handle_t sb_handle(make_sycl_queue()); +#endif + + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, + rep, batch_type); return 0; }