diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a0dc42e6f5..e4a3b0211e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -208,10 +208,8 @@
 TiledArray/util/vector.h
 )
 if(CUDA_FOUND)
-  list(APPEND TILEDARRAY_HEADER_FILES TiledArray/external/cuda.h
-      TiledArray/external/librett.h
 TiledArray/cuda/cublas.h
 TiledArray/cuda/btas_cublas.h
 TiledArray/cuda/btas_um_tensor.h
@@ -225,9 +223,18 @@ if(CUDA_FOUND)
 TiledArray/cuda/thrust.h
 TiledArray/cuda/allocators.h
 TiledArray/cuda/um_storage.h)
+endif(CUDA_FOUND)
+if(HIP_FOUND)
+  list(APPEND TILEDARRAY_HEADER_FILES
+      TiledArray/external/hip.h)
-endif(CUDA_FOUND)
+endif(HIP_FOUND)
 
+if(HIP_FOUND OR CUDA_FOUND)
+  list(APPEND TILEDARRAY_HEADER_FILES
+      TiledArray/external/librett.h)
+endif()
+
 set(TILEDARRAY_SOURCE_FILES
 TiledArray/tiledarray.cpp
 TiledArray/tensor/tensor.cpp
@@ -275,10 +282,19 @@ if(CUDA_FOUND)
 LANGUAGE CUDA)
 
 # the list of libraries on which TiledArray depends on
-  list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cudart CUDA::cublas CUDA::nvToolsExt TiledArray_LIBRETT)
+  list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cudart CUDA::cublas CUDA::nvToolsExt)
 endif(CUDA_FOUND)
 
+if (HIP_FOUND)
+  list(APPEND _TILEDARRAY_DEPENDENCIES hip::host)
+endif()
+
+# LibreTT needed for either CUDA or HIP
+if(CUDA_FOUND OR HIP_FOUND)
+  list(APPEND _TILEDARRAY_DEPENDENCIES TiledArray_LIBRETT)
+endif()
+
 if( TARGET TiledArray_SCALAPACK )
 list(APPEND _TILEDARRAY_DEPENDENCIES TiledArray_SCALAPACK)
 endif()
diff --git a/src/TiledArray/external/hip.h b/src/TiledArray/external/hip.h
new file mode 100644
index 0000000000..75dbfc6955
--- /dev/null
+++ b/src/TiledArray/external/hip.h
@@ -0,0 +1,466 @@
+/*
+ * This file is a part of TiledArray.
+ * Copyright (C) 2018 Virginia Tech
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Chong Peng + * Department of Chemistry, Virginia Tech + * July 23, 2018 + * + */ + +#ifndef TILEDARRAY_EXTERNAL_HIP_H__INCLUDED +#define TILEDARRAY_EXTERNAL_HIP_H__INCLUDED + +#include +#include +#include + +#include + +#ifdef TILEDARRAY_HAS_HIP + +#include + +#include + +#include +#include +#include +#include + +#include + +#define HipSafeCall(err) __hipSafeCall(err, __FILE__, __LINE__) +#define HipSafeCallNoThrow(err) __hipSafeCallNoThrow(err, __FILE__, __LINE__) +#define HipCheckError() __hipCheckError(__FILE__, __LINE__) + +inline void __hipSafeCall(hipError_t err, const char* file, const int line) { + if (hipSuccess != err) { + std::stringstream ss; + ss << "hipSafeCall() failed at: " << file << ":" << line << ": "; + ss << hipGetErrorString(err); + throw std::runtime_error(ss.str()); + } +} + +inline void __hipSafeCallNoThrow(hipError_t err, const char* file, + const int line) { + if (hipSuccess != err) { + madness::print_error("hipSafeCallNoThrow() failed at: ", file, ":", line, + ": ", hipGetErrorString(err)); + } +} + +inline void __hipCheckError(const char* file, const int line) { + auto err = hipGetLastError(); + if (hipSuccess != err) { + std::stringstream ss; + ss << "hipCheckError() failed at: " << file << ":" << line << ": "; + ss << hipGetErrorString(err); + throw std::runtime_error(ss.str()); + } +} + +namespace TiledArray { + +namespace detail { + +inline int num_streams() { + int num_streams = -1; + char* num_stream_char = std::getenv("TA_HIP_NUM_STREAMS"); + /// default num of streams is 3 + if (num_stream_char) { + num_streams = std::atoi(num_stream_char); + } else { + 
    num_streams = 3;
+  }
+  return num_streams;
+}
+
+inline int num_devices() {
+  int num_devices = -1;
+  HipSafeCall(hipGetDeviceCount(&num_devices));
+  return num_devices;
+}
+
+inline int current_device_id(World& world) {
+  int mpi_local_size = -1;
+  int mpi_local_rank = -1;
+  std::tie(mpi_local_rank, mpi_local_size) = mpi_local_rank_size(world);
+
+  int num_devices = detail::num_devices();
+
+  int device_id = -1;
+  // devices may already be pre-mapped
+  // if mpi_local_size <= num_devices : all ranks are in same resource set, map
+  // round robin
+  if (mpi_local_size <= num_devices) {
+    device_id = mpi_local_rank % num_devices;
+  } else {  // mpi_local_size > num_devices
+    char* cvd_cstr = std::getenv("HIP_VISIBLE_DEVICES");
+    if (cvd_cstr) {  // HIP_VISIBLE_DEVICES is set, assume that pre-mapped
+      // make sure that there is only 1 device available here
+      if (num_devices != 1) {
+        throw std::runtime_error(
+            std::string(
+                "HIP_VISIBLE_DEVICES environment variable is set, hence using "
+                "the provided device-to-rank mapping; BUT TiledArray found ") +
+            std::to_string(num_devices) +
+            " HIP devices; only 1 HIP device / MPI process is supported");
+      }
+      device_id = 0;
+    } else {  // not enough devices + devices are not pre-mapped
+      throw std::runtime_error(
+          std::string("TiledArray found ") + std::to_string(mpi_local_size) +
+          " MPI ranks on a node with " + std::to_string(num_devices) +
+          " HIP devices; only 1 MPI process / HIP device model is currently "
+          "supported");
+    }
+  }
+
+  return device_id;
+}
+
+inline void HIPRT_CB hip_readyflag_callback(void* userData) {
+  // convert void * to std::atomic<bool>
+  std::atomic<bool>* flag = static_cast<std::atomic<bool>*>(userData);
+  // set the flag to be true
+  flag->store(true);
+}
+
+struct ProbeFlag {
+  ProbeFlag(std::atomic<bool>* f) : flag(f) {}
+
+  bool operator()() const { return flag->load(); }
+
+  std::atomic<bool>* flag;
+};
+
+inline void thread_wait_stream(const hipStream_t& stream) {
+  std::atomic<bool>* flag = new std::atomic<bool>(false);
+
+  
HipSafeCall(hipLaunchHostFunc(stream, detail::hip_readyflag_callback, flag));
+
+  detail::ProbeFlag probe(flag);
+
+  // wait with sleep and do not do work
+  madness::ThreadPool::await(probe, false, true);
+  // madness::ThreadPool::await(probe, true, true);
+
+  delete flag;
+}
+
+}  // namespace detail
+
+inline const hipStream_t*& tls_stream_accessor() {
+  static thread_local const hipStream_t* thread_local_stream_ptr{nullptr};
+  return thread_local_stream_ptr;
+}
+
+inline void synchronize_stream(const hipStream_t* stream) {
+  tls_stream_accessor() = stream;
+}
+
+/**
+ * hipEnv maintains the HIP-related part of the runtime environment,
+ * such as HIP-specific memory allocators
+ *
+ * \note this is a Singleton
+ */
+class hipEnv {
+ public:
+  ~hipEnv() {
+    // destroy streams on current device
+    for (auto& stream : streams_) {
+      HipSafeCallNoThrow(hipStreamDestroy(stream));
+    }
+  }
+
+  hipEnv(const hipEnv&) = delete;
+  hipEnv(hipEnv&&) = delete;
+  hipEnv& operator=(const hipEnv&) = delete;
+  hipEnv& operator=(hipEnv&&) = delete;
+
+  /// access the singleton instance; if not initialized will be
+  /// initialized via hipEnv::initialize() with the default params
+  static std::unique_ptr<hipEnv>& instance() {
+    if (!instance_accessor()) {
+      initialize();
+    }
+    return instance_accessor();
+  }
+
+  // clang-format off
+  /// initialize the instance using explicit params
+  /// \param world the world to use for initialization
+  /// \param page_size memory added to the pools supporting `this->um_allocator()`, `this->device_allocator()`, and `this->pinned_allocator()` in chunks of at least
+  /// this size (bytes) [default=2^25]
+  /// \param pinned_alloc_limit the maximum total amount of memory (in bytes) that
+  /// allocator returned by `this->pinned_allocator()` can allocate;
+  /// this allocator is not used by default [default=2^40]
+  // clang-format on
+  static void initialize(World& world = TiledArray::get_default_world(),
+                         const std::uint64_t page_size = (1ul << 25),
+                         const 
std::uint64_t pinned_alloc_limit = (1ul << 40)) {
+    static std::mutex mtx;  // to make initialize() reentrant
+    std::scoped_lock lock{mtx};
+    // only the winner of the lock race gets to initialize
+    if (instance_accessor() == nullptr) {
+      int num_streams = detail::num_streams();
+      int num_devices = detail::num_devices();
+      int device_id = detail::current_device_id(world);
+      // set device for current MPI process .. will be set in the ctor as well
+      HipSafeCall(hipSetDevice(device_id));
+      HipSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared));
+
+      // uncomment to debug umpire ops
+      //
+      // umpire::util::Logger::getActiveLogger()->setLoggingMsgLevel(
+      //     umpire::util::message::Debug);
+
+      // make Thread Safe UM Dynamic POOL
+
+      auto& rm = umpire::ResourceManager::getInstance();
+
+      auto mem_total_free = hipEnv::memory_total_and_free_device();
+
+      // turn off Umpire introspection for non-Debug builds
+#ifndef NDEBUG
+      constexpr auto introspect = true;
+#else
+      constexpr auto introspect = false;
+#endif
+
+      // allocate all currently-free memory for UM pool
+      auto um_dynamic_pool =
+          rm.makeAllocator<umpire::strategy::QuickPool, introspect>(
+              "UMDynamicPool", rm.getAllocator("UM"), mem_total_free.second,
+              pinned_alloc_limit);
+
+      // allocate zero memory for device pool
+      auto dev_size_limited_alloc =
+          rm.makeAllocator<umpire::strategy::SizeLimiter, introspect>(
+              "size_limited_alloc", rm.getAllocator("DEVICE"),
+              mem_total_free.first);
+      auto dev_dynamic_pool =
+          rm.makeAllocator<umpire::strategy::QuickPool, introspect>(
+              "HIPDynamicPool", dev_size_limited_alloc, 0, pinned_alloc_limit);
+
+      // allocate pinned_alloc_limit in pinned memory
+      auto pinned_size_limited_alloc =
+          rm.makeAllocator<umpire::strategy::SizeLimiter, introspect>(
+              "SizeLimited_PINNED", rm.getAllocator("PINNED"),
+              pinned_alloc_limit);
+      auto pinned_dynamic_pool =
+          rm.makeAllocator<umpire::strategy::QuickPool, introspect>(
+              "QuickPool_SizeLimited_PINNED", pinned_size_limited_alloc,
+              page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE);
+
+      auto hip_env = std::unique_ptr<hipEnv>(
+          new hipEnv(world, num_devices, device_id, num_streams,
+                     um_dynamic_pool, dev_dynamic_pool, 
pinned_dynamic_pool));
+      instance_accessor() = std::move(hip_env);
+    }
+  }
+
+  World& world() const { return *world_; }
+
+  int num_devices() const { return num_devices_; }
+
+  int current_device_id() const { return current_device_id_; }
+
+  int num_streams() const { return num_streams_; }
+
+  bool concurrent_managed_access() const {
+    return device_concurrent_managed_access_;
+  }
+
+  size_t stream_id(const hipStream_t& stream) const {
+    auto it = std::find(streams_.begin(), streams_.end(), stream);
+    if (it == streams_.end()) abort();
+    return it - streams_.begin();
+  }
+
+  /// @return the total size of all and free device memory on the current device
+  static std::pair<size_t, size_t> memory_total_and_free_device() {
+    std::pair<size_t, size_t> result;
+    // N.B. hipMemGetInfo returns {free,total}
+    HipSafeCall(hipMemGetInfo(&result.second, &result.first));
+    return result;
+  }
+
+  /// Collective call to probe HIP {total,free} memory
+
+  /// @return the total size of all and free device memory on every rank's
+  /// device
+  std::vector<std::pair<size_t, size_t>> memory_total_and_free() const {
+    auto world_size = world_->size();
+    std::vector<size_t> total_memory(world_size, 0), free_memory(world_size, 0);
+    auto rank = world_->rank();
+    std::tie(total_memory.at(rank), free_memory.at(rank)) =
+        hipEnv::memory_total_and_free_device();
+    world_->gop.sum(total_memory.data(), total_memory.size());
+    world_->gop.sum(free_memory.data(), free_memory.size());
+    std::vector<std::pair<size_t, size_t>> result(world_size);
+    for (int r = 0; r != world_size; ++r) {
+      result.at(r) = {total_memory.at(r), free_memory.at(r)};
+    }
+    return result;
+  }
+
+  const hipStream_t& stream(std::size_t i) const { return streams_.at(i); }
+
+  const hipStream_t& stream_h2d() const { return streams_[num_streams_]; }
+
+  const hipStream_t& stream_d2h() const { return streams_[num_streams_ + 1]; }
+
+  /// @return a (non-thread-safe) Umpire allocator for device UM
+  umpire::Allocator& um_allocator() { return um_allocator_; }
+
+  // clang-format off
+  /// @return the max actual amount of 
memory held by um_allocator()
+  /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()`
+  /// @note if there is only 1 Umpire allocator using UM memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("UM").getHighWatermark()`
+  // clang-format on
+  std::size_t um_allocator_getActualHighWatermark() {
+    TA_ASSERT(dynamic_cast<umpire::strategy::QuickPool*>(
+                  um_allocator_.getAllocationStrategy()) != nullptr);
+    return dynamic_cast<umpire::strategy::QuickPool*>(
+               um_allocator_.getAllocationStrategy())
+        ->getActualHighwaterMark();
+  }
+
+  /// @return a (non-thread-safe) Umpire allocator for device memory
+  umpire::Allocator& device_allocator() { return device_allocator_; }
+
+  // clang-format off
+  /// @return the max actual amount of memory held by device_allocator()
+  /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()`
+  /// @note if there is only 1 Umpire allocator using DEVICE memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("DEVICE").getHighWatermark()`
+  // clang-format on
+  std::size_t device_allocator_getActualHighWatermark() {
+    TA_ASSERT(dynamic_cast<umpire::strategy::QuickPool*>(
+                  device_allocator_.getAllocationStrategy()) != nullptr);
+    return dynamic_cast<umpire::strategy::QuickPool*>(
+               device_allocator_.getAllocationStrategy())
+        ->getActualHighwaterMark();
+  }
+
+  /// @return an Umpire allocator that allocates from a
+  /// pinned memory pool
+  /// @warning this is not a thread-safe allocator, should be only used when
+  /// wrapped into umpire_allocator_impl
+  umpire::Allocator& pinned_allocator() { return pinned_allocator_; }
+
+  // clang-format off
+  /// @return the max actual amount of memory held by pinned_allocator()
+  /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()`
+  /// @note if there is only 1 Umpire allocator using PINNED memory this should be identical to the value returned by 
`umpire::ResourceManager::getInstance().getAllocator("PINNED").getHighWatermark()`
+  // clang-format on
+  std::size_t pinned_allocator_getActualHighWatermark() {
+    TA_ASSERT(dynamic_cast<umpire::strategy::QuickPool*>(
+                  pinned_allocator_.getAllocationStrategy()) != nullptr);
+    return dynamic_cast<umpire::strategy::QuickPool*>(
+               pinned_allocator_.getAllocationStrategy())
+        ->getActualHighwaterMark();
+  }
+
+ protected:
+  hipEnv(World& world, int num_devices, int device_id, int num_streams,
+         umpire::Allocator um_alloc, umpire::Allocator device_alloc,
+         umpire::Allocator pinned_alloc)
+      : world_(&world),
+        um_allocator_(um_alloc),
+        device_allocator_(device_alloc),
+        pinned_allocator_(pinned_alloc),
+        num_devices_(num_devices),
+        current_device_id_(device_id),
+        num_streams_(num_streams) {
+    if (num_devices <= 0) {
+      throw std::runtime_error("No HIP-Enabled GPUs Found!\n");
+    }
+
+    // set device for current MPI process
+    HipSafeCall(hipSetDevice(current_device_id_));
+
+    /// check the capability of HIP device
+    hipDeviceProp_t prop;
+    HipSafeCall(hipGetDeviceProperties(&prop, device_id));
+    if (!prop.managedMemory) {
+      throw std::runtime_error("HIP Device doesn't support managedMemory\n");
+    }
+    int concurrent_managed_access;
+    HipSafeCall(hipDeviceGetAttribute(&concurrent_managed_access,
+                                      hipDeviceAttributeConcurrentManagedAccess,
+                                      device_id));
+    device_concurrent_managed_access_ = concurrent_managed_access;
+    if (!device_concurrent_managed_access_) {
+      std::cout << "\nWarning: HIP Device doesn't support "
+                   "ConcurrentManagedAccess!\n\n";
+    }
+
+    // creates streams on current device
+    streams_.resize(num_streams_ + 2);
+    for (auto& stream : streams_) {
+      HipSafeCall(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+    }
+    std::cout << "created " << num_streams_ << " HIP streams + 2 I/O streams"
+              << std::endl;
+  }
+
+ private:
+  // the world used to initialize this
+  World* world_;
+
+  /// allocator backed by a (non-thread-safe) dynamically-sized pool for UM
+  umpire::Allocator um_allocator_;
+  /// allocator backed 
by a (non-thread-safe) dynamically-sized pool for device
+  /// memory
+  umpire::Allocator device_allocator_;
+  // allocates from a dynamic, size-limited pinned memory pool
+  // N.B. not thread safe, so must be wrapped into umpire_allocator_impl
+  umpire::Allocator pinned_allocator_;
+
+  int num_devices_;
+  int current_device_id_;
+  bool device_concurrent_managed_access_;
+
+  int num_streams_;
+  std::vector<hipStream_t> streams_;
+
+  inline static std::unique_ptr<hipEnv>& instance_accessor() {
+    static std::unique_ptr<hipEnv> instance_{nullptr};
+    return instance_;
+  }
+};
+
+namespace detail {
+
+template <typename Range>
+const hipStream_t& get_stream_based_on_range(const Range& range) {
+  // TODO better way to get stream based on the id of tensor
+  auto stream_id = range.offset() % hipEnv::instance()->num_streams();
+  auto& stream = hipEnv::instance()->stream(stream_id);
+  return stream;
+}
+
+}  // namespace detail
+
+}  // namespace TiledArray
+
+#endif  // TILEDARRAY_HAS_HIP
+
+#endif  // TILEDARRAY_EXTERNAL_HIP_H__INCLUDED