Skip to content

Commit

Permalink
Introduce temporary memory pool allocations within SB_Handler (#478)
Browse files Browse the repository at this point in the history
This patch aims to improve performance  by removing excessive memory
allocation overhead and re-using the existing memory from the pool of
allocations available.
  • Loading branch information
pgorlani authored Jan 25, 2024
1 parent 15fc7be commit 9d515e3
Show file tree
Hide file tree
Showing 23 changed files with 521 additions and 109 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ endif()
option(BLAS_ENABLE_CONST_INPUT "Whether to enable kernel instantiation with const input buffer" ON)
option(BLAS_ENABLE_BENCHMARK "Whether to enable benchmarking" ON)
option(BLAS_VERIFY_BENCHMARK "Whether to verify the results of benchmarks" ON)
option(BLAS_MEMPOOL_BENCHMARK "Whether to use the memory pool in benchmarks" OFF)
option(BUILD_CLBLAST_BENCHMARKS "Whether to build clBLAST benchmarks" OFF)
option(BUILD_CLBLAS_BENCHMARKS "Whether to build clBLAS benchmarks" OFF)
option(BUILD_CUBLAS_BENCHMARKS "Whether to build cuBLAS benchmarks" OFF)
Expand All @@ -240,6 +241,7 @@ if (BLAS_BUILD_SAMPLES)
endif()

option(BLAS_ENABLE_AUTO_TUNERS "Whether to enable building GEMM auto tuners" OFF)
option(BLAS_ENABLE_AUTO_TUNER_MEMPOOL "Whether to enable memory pool for GEMM auto tuners" OFF)
if(${BLAS_ENABLE_AUTO_TUNERS})
# Note that the auto tuners are very slow to compile, so we avoid adding
# them to the ALL target.
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,7 @@ Some of the supported options are:
| `BUILD_SHARED_LIBS` | `ON`/`OFF` | Build as shared library (`ON` by default) |
| `ENABLE_EXPRESSION_TESTS` | `ON`/`OFF` | Build additional tests that use the header-only framework (e.g to test expression trees); `OFF` by default |
| `BLAS_VERIFY_BENCHMARK` | `ON`/`OFF` | Verify the results of the benchmarks instead of only measuring the performance. See the documentation of the benchmarks for more details. `ON` by default |
| `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` | Determines whether to enable the scratchpad memory pool for benchmark execution. `OFF` by default |
| `BLAS_ENABLE_CONST_INPUT` | `ON`/`OFF` | Determines whether to enable kernel instantiation with const input buffer (`ON` by default) |
| `BLAS_ENABLE_EXTENSIONS` | `ON`/`OFF` | Determines whether to enable portBLAS extensions (`ON` by default) |
| `BLAS_DATA_TYPES` | `half;float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float` |
Expand Down
4 changes: 4 additions & 0 deletions benchmark/portblas/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ foreach(portblas_bench ${sources})
)
target_include_directories(bench_${bench_exec} PRIVATE ${PORTBLAS_INCLUDE} ${CBLAS_INCLUDE} ${PORTBLAS_COMMON_INCLUDE_DIR})

if(BLAS_MEMPOOL_BENCHMARK)
target_compile_definitions(bench_${bench_exec} PRIVATE BLAS_MEMPOOL_BENCHMARK)
endif()

if(BLAS_VERIFY_BENCHMARK)
target_compile_definitions(bench_${bench_exec} PRIVATE BLAS_VERIFY_BENCHMARK)
target_link_libraries(bench_${bench_exec} PRIVATE blas::blas)
Expand Down
6 changes: 6 additions & 0 deletions benchmark/portblas/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,14 @@ int main(int argc, char** argv) {

utils::print_queue_information(q);

#ifdef BLAS_MEMPOOL_BENCHMARK
blas::Temp_Mem_Pool mp(q);
// Create a portBLAS sb_handle from the memory pool
blas::SB_Handle sb_handle(&mp);
#else
// Create a portBLAS sb_handle from the queue
blas::SB_Handle sb_handle(q);
#endif

// This will be set to false by a failing benchmark
bool success = true;
Expand Down
43 changes: 42 additions & 1 deletion include/sb_handle/portblas_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#include "operations/blas3_trees.h"
#include "operations/extension/reduction.h"
#include "portblas_helper.h"
#include "temp_memory_pool.h"

namespace blas {

/** SB_Handle.
Expand All @@ -47,11 +49,49 @@ class SB_Handle {
public:
using event_t = std::vector<cl::sycl::event>;
inline SB_Handle(queue_t q)
: q_(q),
: tempMemPool_(nullptr),
q_(q),
workGroupSize_(helper::get_work_group_size(q)),
localMemorySupport_(helper::has_local_memory(q)),
computeUnits_(helper::get_num_compute_units(q)) {}

inline SB_Handle(Temp_Mem_Pool* tmp)
: tempMemPool_(tmp),
q_(tmp->get_queue()),
workGroupSize_(helper::get_work_group_size(q_)),
localMemorySupport_(helper::has_local_memory(q_)),
computeUnits_(helper::get_num_compute_units(q_)) {}

template <helper::AllocType alloc, typename value_t>
typename std::enable_if<
alloc == helper::AllocType::buffer,
typename helper::AllocHelper<value_t, alloc>::type>::type
acquire_temp_mem(size_t size);

template <typename container_t>
typename std::enable_if<
std::is_same<container_t, typename helper::AllocHelper<
typename ValueType<container_t>::type,
helper::AllocType::buffer>::type>::value,
typename SB_Handle::event_t>::type
release_temp_mem(const typename SB_Handle::event_t&, const container_t&);

#ifdef SB_ENABLE_USM
template <helper::AllocType alloc, typename value_t>
typename std::enable_if<
alloc == helper::AllocType::usm,
typename helper::AllocHelper<value_t, alloc>::type>::type
acquire_temp_mem(size_t size);

template <typename container_t>
typename std::enable_if<
std::is_same<container_t, typename helper::AllocHelper<
typename ValueType<container_t>::type,
helper::AllocType::usm>::type>::value,
typename SB_Handle::event_t>::type
release_temp_mem(const typename SB_Handle::event_t&, const container_t&);
#endif

template <typename expression_tree_t>
event_t execute(expression_tree_t tree, const event_t& dependencies = {});

Expand Down Expand Up @@ -151,6 +191,7 @@ class SB_Handle {
const size_t workGroupSize_;
const bool localMemorySupport_;
const size_t computeUnits_;
Temp_Mem_Pool* tempMemPool_;
};

} // namespace blas
Expand Down
112 changes: 112 additions & 0 deletions include/sb_handle/temp_memory_pool.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/***************************************************************************
*
* @license
* Copyright (C) Codeplay Software Limited
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* For your convenience, a copy of the License has been included in this
* repository.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* portBLAS: BLAS implementation using SYCL
*
* @filename temp_memory_pool.h
*
**************************************************************************/
#ifndef TEMP_MEMORY_POOL_H
#define TEMP_MEMORY_POOL_H

#include <map>
#include <mutex>

namespace blas {
class Temp_Mem_Pool {
using queue_t = cl::sycl::queue;
using event_t = std::vector<cl::sycl::event>;
using temp_usm_map_t = std::multimap<size_t, void*>;
using temp_usm_size_map_t = std::map<void*, size_t>;
using temp_buffer_map_t = std::multimap<size_t, cl::sycl::buffer<int8_t, 1>>;

public:
Temp_Mem_Pool(queue_t q)
: q_(q),
temp_buffer_map_tot_byte_size_(0),
temp_usm_map_tot_byte_size_(0) {}
Temp_Mem_Pool(const Temp_Mem_Pool& h) = delete;
Temp_Mem_Pool operator=(Temp_Mem_Pool) = delete;

~Temp_Mem_Pool() {
// Wait for the completion of all the host tasks
q_.wait();

#ifdef VERBOSE
std::cout << "# buffers destroyed on memory pool destruction: "
<< temp_buffer_map_.size() << " ("
<< temp_buffer_map_tot_byte_size_ << " bytes)" << std::endl;
#endif

#ifdef SB_ENABLE_USM
#ifdef VERBOSE
std::cout << "# USM allocations freed on memory pool destruction: "
<< temp_usm_map_.size() << " (" << temp_usm_map_tot_byte_size_
<< " bytes)" << std::endl;
#endif
for (const temp_usm_map_t::value_type& p : temp_usm_map_)
cl::sycl::free(p.second, q_);
#endif
}

inline queue_t get_queue() const { return q_; }

template <typename value_t>
typename helper::AllocHelper<value_t, helper::AllocType::buffer>::type
acquire_buff_mem(size_t size);

template <typename container_t>
typename Temp_Mem_Pool::event_t release_buff_mem(
const typename Temp_Mem_Pool::event_t&, const container_t&);

#ifdef SB_ENABLE_USM
template <typename value_t>
typename helper::AllocHelper<value_t, helper::AllocType::usm>::type
acquire_usm_mem(size_t size);

template <typename container_t>
typename Temp_Mem_Pool::event_t release_usm_mem(
const typename Temp_Mem_Pool::event_t&, const container_t&);
#endif

private:
static_assert(sizeof(temp_buffer_map_t::mapped_type::value_type) == 1);

static constexpr size_t max_size_temp_mem_ = 1e9;
queue_t q_;

std::mutex temp_buffer_map_mutex_;
size_t temp_buffer_map_tot_byte_size_;
temp_buffer_map_t temp_buffer_map_;

template <typename container_t>
void release_usm_mem_(const container_t& mem);

#ifdef SB_ENABLE_USM
std::mutex temp_usm_map_mutex_;
size_t temp_usm_map_tot_byte_size_;
temp_usm_map_t temp_usm_map_;
temp_usm_size_map_t temp_usm_size_map_;

template <typename container_t>
void release_buff_mem_(const container_t& mem);
#endif
};
} // namespace blas
#endif
9 changes: 5 additions & 4 deletions src/interface/blas1_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,9 +321,10 @@ typename sb_handle_t::event_t _iamax_iamin_impl(
localMemSize == 0
? _nWG * (static_cast<index_t>(localSize) / min_sg_size)
: _nWG;
auto gpu_res = blas::helper::allocate < is_usm ? helper::AllocType::usm
: helper::AllocType::buffer,
tuple_t > (memory_size, q);
auto gpu_res = sb_handle.template acquire_temp_mem < is_usm
? helper::AllocType::usm
: helper::AllocType::buffer,
tuple_t > (memory_size);
auto gpu_res_vec =
make_vector_view(gpu_res, static_cast<increment_t>(1), memory_size);
auto step0 = make_index_max_min<is_max, true>(gpu_res_vec, tupOp);
Expand Down Expand Up @@ -355,7 +356,7 @@ typename sb_handle_t::event_t _iamax_iamin_impl(
static_cast<index_t>(localSize),
static_cast<index_t>(localMemSize), ret));
}
blas::helper::enqueue_deallocate(ret, gpu_res, q);
sb_handle.template release_temp_mem({*ret.rbegin()}, gpu_res);
}
return ret;
}
Expand Down
Loading

0 comments on commit 9d515e3

Please sign in to comment.