From 8beeb6e546da36483a2d3feb3a1fa3071cb5938c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 18 Sep 2023 17:05:56 -0400 Subject: [PATCH] phase 1 of cuda -> device conversion - cuda.h -> device.h - {cuda,hip} -> device for many API elements --- examples/cuda/cuda_librett.cpp | 2 +- examples/cuda/cuda_task.cpp | 12 +- examples/cuda/ta_cc_abcd_cuda.cpp | 4 +- examples/cuda/ta_dense_cuda.cpp | 42 +- examples/cuda/ta_reduce_cuda.cpp | 10 +- examples/cuda/ta_vector_cuda.cpp | 12 +- src/CMakeLists.txt | 54 +- src/TiledArray/config.h.in | 11 + src/TiledArray/cuda/btas_um_tensor.cpp | 51 -- src/TiledArray/{cuda => device}/allocators.h | 53 +- src/TiledArray/{cuda => device}/btas_cublas.h | 288 +++++----- src/TiledArray/device/btas_um_tensor.cpp | 52 ++ .../{cuda => device}/btas_um_tensor.h | 210 +++---- .../{cuda => device}/cpu_cuda_vector.cu | 2 +- .../{cuda => device}/cpu_cuda_vector.h | 12 +- src/TiledArray/{cuda => device}/cublas.h | 0 .../device_task_fn.h} | 287 +++++----- .../{cuda => device}/kernel/mult_kernel.cu | 4 +- .../{cuda => device}/kernel/mult_kernel.h | 0 .../kernel/mult_kernel_impl.h | 4 +- .../{cuda => device}/kernel/reduce_kernel.cu | 4 +- .../{cuda => device}/kernel/reduce_kernel.h | 0 .../kernel/reduce_kernel_impl.h | 8 +- src/TiledArray/{cuda => device}/platform.h | 8 +- src/TiledArray/{cuda => device}/thrust.h | 0 src/TiledArray/{cuda => device}/um_storage.cu | 16 +- src/TiledArray/{cuda => device}/um_storage.h | 39 +- src/TiledArray/dist_eval/binary_eval.h | 8 +- src/TiledArray/dist_eval/contraction_eval.h | 19 +- src/TiledArray/dist_eval/dist_eval.h | 2 +- src/TiledArray/dist_eval/unary_eval.h | 6 +- src/TiledArray/expressions/expr.h | 22 +- src/TiledArray/external/cuda.h | 512 +----------------- src/TiledArray/external/{hip.h => device.h} | 371 ++++++++++--- src/TiledArray/external/umpire.h | 32 +- src/TiledArray/fwd.h | 39 +- src/TiledArray/host/allocator.h | 9 +- src/TiledArray/host/env.h | 4 +- src/TiledArray/reduce_task.h | 111 ++-- src/TiledArray/tensor/type_traits.h | 10 +- src/TiledArray/tiledarray.cpp | 40 +- tests/CMakeLists.txt | 6 +- tests/expressions_cuda_um.cpp | 6 +- tests/librett.cpp | 18 +- tests/tensor_um.cpp | 2 +- 45 files changed, 1071 insertions(+), 1331 deletions(-) delete mode 100644 src/TiledArray/cuda/btas_um_tensor.cpp rename src/TiledArray/{cuda => device}/allocators.h (63%) rename src/TiledArray/{cuda => device}/btas_cublas.h (66%) create mode 100644 src/TiledArray/device/btas_um_tensor.cpp rename src/TiledArray/{cuda => device}/btas_um_tensor.h (81%) rename src/TiledArray/{cuda => device}/cpu_cuda_vector.cu (98%) rename src/TiledArray/{cuda => device}/cpu_cuda_vector.h (96%) rename src/TiledArray/{cuda => device}/cublas.h (100%) rename src/TiledArray/{cuda/cuda_task_fn.h => device/device_task_fn.h} (72%) rename src/TiledArray/{cuda => device}/kernel/mult_kernel.cu (96%) rename src/TiledArray/{cuda => device}/kernel/mult_kernel.h (100%) rename src/TiledArray/{cuda => device}/kernel/mult_kernel_impl.h (95%) rename src/TiledArray/{cuda => device}/kernel/reduce_kernel.cu (98%) rename src/TiledArray/{cuda => device}/kernel/reduce_kernel.h (100%) rename src/TiledArray/{cuda => device}/kernel/reduce_kernel_impl.h (95%) rename src/TiledArray/{cuda => device}/platform.h (93%) rename src/TiledArray/{cuda => device}/thrust.h (100%) rename src/TiledArray/{cuda => device}/um_storage.cu (66%) rename src/TiledArray/{cuda => device}/um_storage.h (78%) rename src/TiledArray/external/{hip.h => device.h} (53%) diff --git a/examples/cuda/cuda_librett.cpp b/examples/cuda/cuda_librett.cpp index c513f41af1..1460f54117 100644 --- a/examples/cuda/cuda_librett.cpp +++ b/examples/cuda/cuda_librett.cpp @@ -23,7 +23,7 @@ #ifdef TILEDARRAY_HAS_CUDA -#include +#include #include #include diff --git a/examples/cuda/cuda_task.cpp b/examples/cuda/cuda_task.cpp index a019523b6e..f2b0b2ab1b 100644 --- a/examples/cuda/cuda_task.cpp +++ b/examples/cuda/cuda_task.cpp @@ -2,8 +2,8 @@ // Created by Chong Peng on 11/14/18. // -#include -#include +#include +#include #include using value_type = double; @@ -28,8 +28,8 @@ void verify(const tile_type& tile, value_type value, std::size_t index) { tile_type scale(const tile_type& arg, value_type a, const cudaStream_t* stream, std::size_t index) { - CudaSafeCall( - cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice( + TiledArray::deviceEnv::instance()->current_device_id())); /// make result Tensor using Storage = typename tile_type::tensor_type::storage_type; Storage result_storage; @@ -81,7 +81,7 @@ void process_task(madness::World* world, tile_type (*scale_fn)(const tile_type&, double, const cudaStream_t*, std::size_t) = &::scale; - madness::Future scale_future = madness::add_cuda_task( + madness::Future scale_future = madness::add_device_task( *world, ::scale, tensor, scale_factor, &stream, ntask * iter + i); /// this should start until scale_taskfn is finished @@ -98,7 +98,7 @@ int try_main(int argc, char** argv) { std::vector streams(n_stream); for (auto& stream : streams) { // create the streams - CudaSafeCall(cudaStreamCreate(&stream)); + DeviceSafeCall(cudaStreamCreate(&stream)); // std::cout << "stream: " << stream << "\n"; } diff --git a/examples/cuda/ta_cc_abcd_cuda.cpp b/examples/cuda/ta_cc_abcd_cuda.cpp index 0887c90562..b531dee495 100644 --- a/examples/cuda/ta_cc_abcd_cuda.cpp +++ b/examples/cuda/ta_cc_abcd_cuda.cpp @@ -17,7 +17,7 @@ * */ -#include +#include #include #include #include @@ -186,7 +186,7 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, std::pow(n_uocc, 4) / std::pow(1024., 3); using CUDATile = - btas::Tensor>; + btas::Tensor>; using CUDAMatrix = TA::DistArray>; // Construct tensors diff --git a/examples/cuda/ta_dense_cuda.cpp b/examples/cuda/ta_dense_cuda.cpp index ab8c118622..864938302c 100644 --- a/examples/cuda/ta_dense_cuda.cpp +++ b/examples/cuda/ta_dense_cuda.cpp @@ -24,8 +24,8 @@ // clang-format off #include -#include -#include "TiledArray/cuda/cpu_cuda_vector.h" +#include +#include "TiledArray/device/cpu_cuda_vector.h" #include // clang-format on @@ -98,7 +98,7 @@ void to_host( // do norm on GPU auto tile_norm = norm(tile.tensor()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.tensor().storage(), stream); return tile_norm; @@ -120,7 +120,7 @@ void to_device( btas::Tensor>> &tile) { auto &stream = detail::get_stream_based_on_range(tile.range()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.tensor().storage(), stream); return norm(tile.tensor()); @@ -218,7 +218,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, using PinnedTile = btas::Tensor>>; + TiledArray::device_pinned_allocator>>; using PinnedMatrix = TA::DistArray>; // using TAMatrix = TA::DistArray>; @@ -339,7 +339,7 @@ int try_main(int argc, char **argv) { << std::endl << "Usage: " << argv[0] << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [scalar = double] " - "[storage type = cuda_um_btas_varray]\n"; + "[storage type = device_um_btas_varray]\n"; return 0; } const long Nm = atol(argv[1]); @@ -376,15 +376,15 @@ int try_main(int argc, char **argv) { return 1; } - const auto storage_type = - (argc >= 10) ? std::string(argv[9]) : std::string{"cuda_um_btas_varray"}; + const auto storage_type = (argc >= 10) ? std::string(argv[9]) + : std::string{"device_um_btas_varray"}; - if (storage_type != "cuda_um_btas_varray" && + if (storage_type != "device_um_btas_varray" && storage_type != "cuda_um_thrust_vector" && storage_type != "cpu_cuda_vector") { std::cerr << "Error: invalid storage type: " << storage_type << "\n Valid option includes: cuda_um_vector or " - "cuda_um_btas_varray or cuda_um_thrust_vector " + "device_um_btas_varray or cuda_um_thrust_vector " "or cpu_cuda_vector. \n"; } std::cout << "Storage type: " << storage_type << "<" << scalar_type_str << ">" @@ -407,13 +407,13 @@ int try_main(int argc, char **argv) { << runtimeVersion << std::endl; { // print device properties - int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices(); + int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices(); if (num_cuda_devices <= 0) { throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); } - int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id(); + int cuda_device_id = TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); @@ -440,9 +440,9 @@ int try_main(int argc, char **argv) { error = cudaDeviceGetAttribute( &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id); std::cout << " attrConcurrentManagedAccess = " << result << std::endl; - error = cudaSetDevice(cuda_device_id); + error = device::setDevice(cuda_device_id); if (error != cudaSuccess) { - std::cout << "error(cudaSetDevice) = " << error << std::endl; + std::cout << "error(device::setDevice) = " << error << std::endl; } size_t free_mem, total_mem; error = cudaMemGetInfo(&free_mem, &total_mem); @@ -462,19 +462,19 @@ int try_main(int argc, char **argv) { // do_main_body>(world, Nm, Bm, Nn, // Bn, // Nk, Bk, nrepeat); - // } else if (storage_type == "cuda_um_btas_varray") { - if (storage_type == "cuda_um_btas_varray") { + // } else if (storage_type == "device_um_btas_varray") { + if (storage_type == "device_um_btas_varray") { if (scalar_type_str == "double") - do_main_body>( + do_main_body>( world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); else if (scalar_type_str == "float") - do_main_body>(world, Nm, Bm, Nn, - Bn, Nk, Bk, nrepeat); + do_main_body>( + world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); else if (scalar_type_str == "zdouble") - do_main_body>>( + do_main_body>>( world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); else if (scalar_type_str == "zfloat") - do_main_body>>( + do_main_body>>( world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); else { abort(); // unreachable diff --git a/examples/cuda/ta_reduce_cuda.cpp b/examples/cuda/ta_reduce_cuda.cpp index e453069892..b475ff78ef 100644 --- a/examples/cuda/ta_reduce_cuda.cpp +++ b/examples/cuda/ta_reduce_cuda.cpp @@ -24,7 +24,7 @@ // clang-format off #include -#include +#include // clang-format on template @@ -298,13 +298,13 @@ int try_main(int argc, char **argv) { << runtimeVersion << std::endl; { // print device properties - int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices(); + int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices(); if (num_cuda_devices <= 0) { throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); } - int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id(); + int cuda_device_id = TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); @@ -331,9 +331,9 @@ int try_main(int argc, char **argv) { error = cudaDeviceGetAttribute( &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id); std::cout << " attrConcurrentManagedAccess = " << result << std::endl; - error = cudaSetDevice(cuda_device_id); + error = device::setDevice(cuda_device_id); if (error != cudaSuccess) { - std::cout << "error(cudaSetDevice) = " << error << std::endl; + std::cout << "error(device::setDevice) = " << error << std::endl; } size_t free_mem, total_mem; error = cudaMemGetInfo(&free_mem, &total_mem); diff --git a/examples/cuda/ta_vector_cuda.cpp b/examples/cuda/ta_vector_cuda.cpp index 1593a68e8b..a82a057807 100644 --- a/examples/cuda/ta_vector_cuda.cpp +++ b/examples/cuda/ta_vector_cuda.cpp @@ -24,8 +24,8 @@ // clang-format off #include -#include -#include "TiledArray/cuda/cpu_cuda_vector.h" +#include +#include "TiledArray/device/cpu_cuda_vector.h" #include // clang-format on @@ -316,13 +316,13 @@ int try_main(int argc, char **argv) { << runtimeVersion << std::endl; { // print device properties - int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices(); + int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices(); if (num_cuda_devices <= 0) { throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); } - int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id(); + int cuda_device_id = TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); @@ -349,9 +349,9 @@ int try_main(int argc, char **argv) { error = cudaDeviceGetAttribute( &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id); std::cout << " attrConcurrentManagedAccess = " << result << std::endl; - error = cudaSetDevice(cuda_device_id); + error = device::setDevice(cuda_device_id); if (error != cudaSuccess) { - std::cout << "error(cudaSetDevice) = " << error << std::endl; + std::cout << "error(device::setDevice) = " << error << std::endl; } size_t free_mem, total_mem; error = cudaMemGetInfo(&free_mem, &total_mem); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e4a3b0211e..5dd9c234fd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -207,34 +207,30 @@ TiledArray/util/time.h TiledArray/util/vector.h ) -if(CUDA_FOUND) - list(APPEND TILEDARRAY_HEADER_FILES - TiledArray/external/cuda.h - TiledArray/cuda/cublas.h - TiledArray/cuda/btas_cublas.h - TiledArray/cuda/btas_um_tensor.h - TiledArray/cuda/cpu_cuda_vector.h - TiledArray/cuda/cuda_task_fn.h - TiledArray/cuda/kernel/mult_kernel.h - TiledArray/cuda/kernel/mult_kernel_impl.h - TiledArray/cuda/kernel/reduce_kernel.h - TiledArray/cuda/kernel/reduce_kernel_impl.h - TiledArray/cuda/platform.h - TiledArray/cuda/thrust.h - TiledArray/cuda/allocators.h - TiledArray/cuda/um_storage.h) -endif(CUDA_FOUND) - -if(HIP_FOUND) - list(APPEND TILEDARRAY_HEADER_FILES - TiledArray/external/hip.h) -endif(CUDA_FOUND) - if(HIP_FOUND OR CUDA_FOUND) list(APPEND TILEDARRAY_HEADER_FILES + TiledArray/external/device.h TiledArray/external/librett.h) endif() +if(CUDA_FOUND) + list(APPEND TILEDARRAY_HEADER_FILES + TiledArray/external/cuda.h + TiledArray/device/cublas.h + TiledArray/device/btas_cublas.h + TiledArray/device/btas_um_tensor.h + TiledArray/device/cpu_cuda_vector.h + TiledArray/device/device_task_fn.h + TiledArray/device/kernel/mult_kernel.h + TiledArray/device/kernel/mult_kernel_impl.h + TiledArray/device/kernel/reduce_kernel.h + TiledArray/device/kernel/reduce_kernel_impl.h + TiledArray/device/platform.h + TiledArray/device/thrust.h + TiledArray/device/allocators.h + TiledArray/device/um_storage.h) +endif(CUDA_FOUND) + set(TILEDARRAY_SOURCE_FILES TiledArray/tiledarray.cpp TiledArray/tensor/tensor.cpp @@ -263,11 +259,11 @@ set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers if(CUDA_FOUND) set(TILEDARRAY_CUDA_SOURCE_FILES - TiledArray/cuda/btas_um_tensor.cpp - TiledArray/cuda/cpu_cuda_vector.cu - TiledArray/cuda/kernel/mult_kernel.cu - TiledArray/cuda/kernel/reduce_kernel.cu - TiledArray/cuda/um_storage.cu) + TiledArray/device/btas_um_tensor.cpp + TiledArray/device/cpu_cuda_vector.cu + TiledArray/device/kernel/mult_kernel.cu + TiledArray/device/kernel/reduce_kernel.cu + TiledArray/device/um_storage.cu) list(APPEND TILEDARRAY_SOURCE_FILES "${TILEDARRAY_CUDA_SOURCE_FILES}") @@ -277,7 +273,7 @@ if(CUDA_FOUND) INCLUDE_DIRECTORIES "${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}") endforeach() - set_source_files_properties(TiledArray/cuda/btas_um_tensor.cpp + set_source_files_properties(TiledArray/device/btas_um_tensor.cpp PROPERTIES LANGUAGE CUDA) diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in index 4cba5ee840..1c38298623 100644 --- a/src/TiledArray/config.h.in +++ b/src/TiledArray/config.h.in @@ -80,6 +80,17 @@ /* Define if TiledArray configured with HIP support */ #cmakedefine TILEDARRAY_HAS_HIP @TILEDARRAY_HAS_HIP@ +// Umpire and LibreTT limited to 1 device runtime at a time, so is TA +#if defined(TILEDARRAY_HAS_HIP) +# define TILEDARRAY_HAS_DEVICE 1 +# define TILEDARRAY_DEVICE_RUNTIME HIP +# define TILEDARRAY_DEVICE_RUNTIME_STR "HIP" +#elif defined(TILEDARRAY_HAS_CUDA) +# define TILEDARRAY_HAS_DEVICE 1 +# define TILEDARRAY_DEVICE_RUNTIME CUDA +# define TILEDARRAY_DEVICE_RUNTIME_STR "CUDA" +#endif + /* Is TA::Tensor memory profiling enabled? */ #cmakedefine TA_TENSOR_MEM_PROFILE 1 diff --git a/src/TiledArray/cuda/btas_um_tensor.cpp b/src/TiledArray/cuda/btas_um_tensor.cpp deleted file mode 100644 index 9423e7563d..0000000000 --- a/src/TiledArray/cuda/btas_um_tensor.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// -// Created by Chong Peng on 7/24/18. -// - -// clang-format off -#include // provides c++17 features (stds::data, std::size) when compiling CUDA (i.e. c++14) -#include -// clang-format on - -#ifdef TILEDARRAY_HAS_CUDA - -template class btas::varray>; -template class btas::varray>; -template class btas::varray< - std::complex, TiledArray::cuda_um_allocator>>; -template class btas::varray, - TiledArray::cuda_um_allocator>>; -template class btas::varray>; -template class btas::varray>; - -template class btas::Tensor>; -template class btas::Tensor>; -template class btas::Tensor< - std::complex, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>; -template class btas::Tensor< - std::complex, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>; -template class btas::Tensor>; -template class btas::Tensor>; - -template class TiledArray::Tile>>; -template class TiledArray::Tile>>; -template class TiledArray::Tile< - btas::Tensor, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>>; -template class TiledArray::Tile< - btas::Tensor, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>>; -template class TiledArray::Tile< - btas::Tensor>>; -template class TiledArray::Tile>>; - -#endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/allocators.h b/src/TiledArray/device/allocators.h similarity index 63% rename from src/TiledArray/cuda/allocators.h rename to src/TiledArray/device/allocators.h index 72c5ae3b0e..ff3ed6a3ac 100644 --- a/src/TiledArray/cuda/allocators.h +++ b/src/TiledArray/device/allocators.h @@ -26,9 +26,9 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE -#include +#include #include #include @@ -39,37 +39,40 @@ namespace TiledArray { template -class cuda_allocator_impl : public umpire_allocator_impl { +class umpire_based_allocator + : public umpire_based_allocator_impl { public: - using base_type = umpire_allocator_impl; + using base_type = umpire_based_allocator_impl; using typename base_type::const_pointer; using typename base_type::const_reference; using typename base_type::pointer; using typename base_type::reference; using typename base_type::value_type; - cuda_allocator_impl() noexcept : base_type(&UmpireAllocatorAccessor{}()) {} + umpire_based_allocator() noexcept : base_type(&UmpireAllocatorAccessor{}()) {} template - cuda_allocator_impl( - const cuda_allocator_impl& + umpire_based_allocator( + const umpire_based_allocator& rhs) noexcept : base_type( - static_cast&>(rhs)) {} + static_cast&>( + rhs)) {} template friend bool operator==( - const cuda_allocator_impl& lhs, - const cuda_allocator_impl& + const umpire_based_allocator& + lhs, + const umpire_based_allocator& rhs) noexcept; -}; // class cuda_allocator_impl +}; // class umpire_based_allocator template bool operator==( - const cuda_allocator_impl& lhs, - const cuda_allocator_impl& + const umpire_based_allocator& lhs, + const umpire_based_allocator& rhs) noexcept { return lhs.umpire_allocator() == rhs.umpire_allocator(); } @@ -77,8 +80,8 @@ bool operator==( template bool operator!=( - const cuda_allocator_impl& lhs, - const cuda_allocator_impl& + const umpire_based_allocator& lhs, + const umpire_based_allocator& rhs) noexcept { return !(lhs == rhs); } @@ -87,13 +90,13 @@ namespace detail { struct get_um_allocator { umpire::Allocator& operator()() { - return cudaEnv::instance()->um_allocator(); + return deviceEnv::instance()->um_allocator(); } }; struct get_pinned_allocator { umpire::Allocator& operator()() { - return cudaEnv::instance()->pinned_allocator(); + return deviceEnv::instance()->pinned_allocator(); } }; @@ -106,30 +109,30 @@ namespace archive { template -struct ArchiveLoadImpl> { static inline void load( const Archive& ar, - TiledArray::cuda_allocator_impl& - allocator) { - allocator = TiledArray::cuda_allocator_impl{}; + TiledArray::umpire_based_allocator& allocator) { + allocator = TiledArray::umpire_based_allocator{}; } }; template -struct ArchiveStoreImpl> { static inline void store( const Archive& ar, - const TiledArray::cuda_allocator_impl< + const TiledArray::umpire_based_allocator< T, StaticLock, UmpireAllocatorAccessor>& allocator) {} }; } // namespace archive } // namespace madness -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE #endif // TILEDARRAY_CUDA_ALLOCATORS_H___INCLUDED diff --git a/src/TiledArray/cuda/btas_cublas.h b/src/TiledArray/device/btas_cublas.h similarity index 66% rename from src/TiledArray/cuda/btas_cublas.h rename to src/TiledArray/device/btas_cublas.h index ea073d0a78..9ac97ce649 100644 --- a/src/TiledArray/cuda/btas_cublas.h +++ b/src/TiledArray/device/btas_cublas.h @@ -24,7 +24,7 @@ #ifndef TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED #define TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED -#include +#include #include #ifdef TILEDARRAY_HAS_CUDA @@ -32,16 +32,16 @@ #include #include -#include -#include -#include -#include +#include +#include +#include +#include #include namespace TiledArray { template >> + typename = std::enable_if_t>> btas::Tensor btas_tensor_gemm_cuda_impl( const btas::Tensor &left, const btas::Tensor &right, Scalar factor, @@ -78,44 +78,44 @@ btas::Tensor btas_tensor_gemm_cuda_impl( T factor_t = T(factor); T zero(0); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); // typedef typename Tensor::storage_type storage_type; auto result_range = gemm_helper.make_result_range(left.range(), right.range()); - auto &cuda_stream = detail::get_stream_based_on_range(result_range); + auto &stream = detail::get_stream_based_on_range(result_range); // the result Tensor type typedef btas::Tensor Tensor; Tensor result; // check if stream is busy - // auto stream_status = cudaStreamQuery(cuda_stream); + // auto stream_status = cudaStreamQuery(stream); // if stream is completed, use GPU // if (stream_status == cudaSuccess) { if (true) { Storage result_storage; - make_device_storage(result_storage, result_range.area(), cuda_stream); + make_device_storage(result_storage, result_range.area(), stream); result = Tensor(std::move(result_range), std::move(result_storage)); // left and right are readonly!! // cudaMemAdvise(device_data(left), left.size() * sizeof(T), // cudaMemAdviseSetReadMostly, - // cudaEnv::instance()->current_cuda_device_id()); + // deviceEnv::instance()->current_device_id()); // cudaMemAdvise(device_data(right), right.size() * sizeof(T), // cudaMemAdviseSetReadMostly, - // cudaEnv::instance()->current_cuda_device_id()); + // deviceEnv::instance()->current_device_id()); // prefetch data - TiledArray::to_execution_space( - left.storage(), cuda_stream); - TiledArray::to_execution_space( - right.storage(), cuda_stream); + TiledArray::to_execution_space( + left.storage(), stream); + TiledArray::to_execution_space( + right.storage(), stream); const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.right_op()), to_cublas_op(gemm_helper.left_op()), n, m, k, @@ -124,40 +124,41 @@ btas::Tensor btas_tensor_gemm_cuda_impl( device_data(result.storage()), n)); // wait for cuda calls to finish - // detail::thread_wait_cuda_stream(cuda_stream); - synchronize_stream(&cuda_stream); + // detail::thread_wait_stream(stream); + device::synchronize_stream(&stream); } // otherwise, use CPU else { Storage result_storage(result_range.area()); result = Tensor(std::move(result_range), std::move(result_storage)); - TiledArray::to_execution_space( - result.storage(), cuda_stream); + TiledArray::to_execution_space( + result.storage(), stream); // left and right are readonly!! cudaMemAdvise(device_data(left), left.size() * sizeof(T), cudaMemAdviseSetReadMostly, - cudaEnv::instance()->current_cuda_device_id()); + deviceEnv::instance()->current_device_id()); cudaMemAdvise(device_data(right), right.size() * sizeof(T), cudaMemAdviseSetReadMostly, - cudaEnv::instance()->current_cuda_device_id()); + deviceEnv::instance()->current_device_id()); // prefetch data - TiledArray::to_execution_space( - left.storage(), cuda_stream); - TiledArray::to_execution_space( - right.storage(), cuda_stream); - - TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, - k, factor_t, left.data(), lda, right.data(), ldb, - zero, result.data(), n); + TiledArray::to_execution_space( + left.storage(), stream); + TiledArray::to_execution_space( + right.storage(), stream); + + TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), + m, n, k, factor_t, left.data(), lda, + right.data(), ldb, zero, result.data(), n); } return result; } -template >> +template >> void btas_tensor_gemm_cuda_impl( btas::Tensor &result, const btas::Tensor &left, @@ -224,13 +225,13 @@ void btas_tensor_gemm_cuda_impl( const integer ldb = (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(result.range()); T factor_t = T(factor); T one(1); // check if stream is busy - // auto stream_status = cudaStreamQuery(cuda_stream); + // auto stream_status = cudaStreamQuery(stream); // if stream is completed, use GPU // if (stream_status == cudaSuccess) { @@ -238,50 +239,50 @@ void btas_tensor_gemm_cuda_impl( // left and right are readonly!! // cudaMemAdvise(device_data(left), left.size() * sizeof(T), // cudaMemAdviseSetReadMostly, - // cudaEnv::instance()->current_cuda_device_id()); + // deviceEnv::instance()->current_device_id()); // cudaMemAdvise(device_data(right), right.size() * sizeof(T), // cudaMemAdviseSetReadMostly, - // cudaEnv::instance()->current_cuda_device_id()); + // deviceEnv::instance()->current_device_id()); // prefetch all data - TiledArray::to_execution_space( - left.storage(), cuda_stream); - TiledArray::to_execution_space( - right.storage(), cuda_stream); - TiledArray::to_execution_space( - result.storage(), cuda_stream); + TiledArray::to_execution_space( + left.storage(), stream); + TiledArray::to_execution_space( + right.storage(), stream); + TiledArray::to_execution_space( + result.storage(), stream); const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.right_op()), to_cublas_op(gemm_helper.left_op()), n, m, k, &factor_t, device_data(right.storage()), ldb, device_data(left.storage()), lda, &one, device_data(result.storage()), n)); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); - // detail::thread_wait_cuda_stream(cuda_stream); + // detail::thread_wait_stream(stream); } else { // left and right are readonly!! cudaMemAdvise(device_data(left), left.size() * sizeof(T), cudaMemAdviseSetReadMostly, - cudaEnv::instance()->current_cuda_device_id()); + deviceEnv::instance()->current_device_id()); cudaMemAdvise(device_data(right), right.size() * sizeof(T), cudaMemAdviseSetReadMostly, - cudaEnv::instance()->current_cuda_device_id()); + deviceEnv::instance()->current_device_id()); // prefetch data - TiledArray::to_execution_space( - left.storage(), cuda_stream); - TiledArray::to_execution_space( - right.storage(), cuda_stream); - TiledArray::to_execution_space( - result.storage(), cuda_stream); - - TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, - k, factor_t, left.data(), lda, right.data(), ldb, - one, result.data(), n); + TiledArray::to_execution_space( + left.storage(), stream); + TiledArray::to_execution_space( + right.storage(), stream); + TiledArray::to_execution_space( + result.storage(), stream); + + TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), + m, n, k, factor_t, left.data(), lda, + right.data(), ldb, one, result.data(), n); } } @@ -289,69 +290,69 @@ void btas_tensor_gemm_cuda_impl( template btas::Tensor btas_tensor_clone_cuda_impl( const btas::Tensor &arg) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); Storage result_storage; auto result_range = arg.range(); - auto &cuda_stream = detail::get_stream_based_on_range(result_range); + auto &stream = detail::get_stream_based_on_range(result_range); - make_device_storage(result_storage, arg.size(), cuda_stream); + make_device_storage(result_storage, arg.size(), stream); btas::Tensor result(std::move(result_range), std::move(result_storage)); // call cublasCopy const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasCopy(handle, result.size(), device_data(arg.storage()), 1, device_data(result.storage()), 1)); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } /// result[i] = a * arg[i] template >> + typename = std::enable_if_t>> btas::Tensor btas_tensor_scale_cuda_impl( const btas::Tensor &arg, const Scalar a) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(arg.range()); // std::cout << "scale, tile offset: " << arg.range().offset() << " stream: " - // << arg.range().offset() % cudaEnv::instance()->num_cuda_streams() << "\n"; + // << arg.range().offset() % deviceEnv::instance()->num_streams() << "\n"; auto result = btas_tensor_clone_cuda_impl(arg); // call cublasScale const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall( cublasScal(handle, result.size(), &a, device_data(result.storage()), 1)); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } /// result[i] *= a template >> + typename = std::enable_if_t>> void btas_tensor_scale_to_cuda_impl(btas::Tensor &result, const Scalar a) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(result.range()); // call cublasScale const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall( cublasScal(handle, result.size(), &a, device_data(result.storage()), 1)); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); } /// result[i] = arg1[i] - a * arg2[i] template >> + typename = std::enable_if_t>> btas::Tensor btas_tensor_subt_cuda_impl( const btas::Tensor &arg1, const btas::Tensor &arg2, const Scalar a) { @@ -360,12 +361,12 @@ btas::Tensor btas_tensor_subt_cuda_impl( // revert the sign of a auto b = -a; - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(result.range()); - if (in_memory_space(result.storage())) { + if (in_memory_space(result.storage())) { const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasAxpy(handle, result.size(), &b, device_data(arg2.storage()), 1, device_data(result.storage()), 1)); @@ -374,82 +375,85 @@ btas::Tensor btas_tensor_subt_cuda_impl( // btas::axpy(1.0, arg, result); } - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } /// result[i] -= a * arg1[i] -template >> +template >> void btas_tensor_subt_to_cuda_impl(btas::Tensor &result, const btas::Tensor &arg1, const Scalar a) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(result.range()); // revert the sign of a auto b = -a; const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasAxpy(handle, result.size(), &b, device_data(arg1.storage()), 1, device_data(result.storage()), 1)); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); } /// result[i] = arg1[i] + a * arg2[i] -template >> +template >> btas::Tensor btas_tensor_add_cuda_impl( const btas::Tensor &arg1, const btas::Tensor &arg2, const Scalar a) { auto result = btas_tensor_clone_cuda_impl(arg1); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(result.range()); const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasAxpy(handle, result.size(), &a, device_data(arg2.storage()), 1, device_data(result.storage()), 1)); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } /// result[i] += a * arg[i] -template >> +template >> void btas_tensor_add_to_cuda_impl(btas::Tensor &result, const btas::Tensor &arg, const Scalar a) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(result.range()); - // TiledArray::to_execution_space(result.storage(),cuda_stream); - // TiledArray::to_execution_space(arg.storage(),cuda_stream); + // TiledArray::to_execution_space(result.storage(),stream); + // TiledArray::to_execution_space(arg.storage(),stream); const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasAxpy(handle, result.size(), &a, device_data(arg.storage()), 1, device_data(result.storage()), 1)); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); } /// result[i] = result[i] * arg[i] template void btas_tensor_mult_to_cuda_impl(btas::Tensor &result, const btas::Tensor &arg) { - auto device_id = cudaEnv::instance()->current_cuda_device_id(); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + auto device_id = deviceEnv::instance()->current_device_id(); + auto &stream = detail::get_stream_based_on_range(result.range()); std::size_t n = result.size(); TA_ASSERT(n == arg.size()); - mult_to_cuda_kernel(result.data(), arg.data(), n, cuda_stream, device_id); - synchronize_stream(&cuda_stream); + mult_to_cuda_kernel(result.data(), arg.data(), n, stream, device_id); + device::synchronize_stream(&stream); } /// result[i] = arg1[i] * arg2[i] @@ -461,19 +465,19 @@ btas::Tensor btas_tensor_mult_cuda_impl( TA_ASSERT(arg2.size() == n); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); - CudaSafeCall(cudaSetDevice(device_id)); - auto &cuda_stream = detail::get_stream_based_on_range(arg1.range()); + auto device_id = deviceEnv::instance()->current_device_id(); + DeviceSafeCall(device::setDevice(device_id)); + auto &stream = detail::get_stream_based_on_range(arg1.range()); Storage result_storage; - make_device_storage(result_storage, n, cuda_stream); + make_device_storage(result_storage, n, stream); btas::Tensor result(arg1.range(), std::move(result_storage)); - mult_cuda_kernel(result.data(), arg1.data(), arg2.data(), n, cuda_stream, + mult_cuda_kernel(result.data(), arg1.data(), arg2.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } @@ -481,24 +485,24 @@ btas::Tensor btas_tensor_mult_cuda_impl( template typename btas::Tensor::value_type btas_tensor_squared_norm_cuda_impl(const btas::Tensor &arg) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); + auto &stream = detail::get_stream_based_on_range(arg.range()); auto &storage = arg.storage(); using TiledArray::math::blas::integer; integer size = storage.size(); T result = 0; - if (in_memory_space(storage)) { + if (in_memory_space(storage)) { const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasDot(handle, size, device_data(storage), 1, device_data(storage), 1, &result)); } else { TA_ASSERT(false); // result = TiledArray::math::dot(size, storage.data(), storage.data()); } - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } @@ -507,9 +511,9 @@ template typename btas::Tensor::value_type btas_tensor_dot_cuda_impl( const btas::Tensor &arg1, const btas::Tensor &arg2) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(arg1.range()); + auto &stream = detail::get_stream_based_on_range(arg1.range()); using TiledArray::math::blas::integer; integer size = arg1.storage().size(); @@ -517,101 +521,101 @@ typename btas::Tensor::value_type btas_tensor_dot_cuda_impl( TA_ASSERT(size == arg2.storage().size()); T result = 0; - if (in_memory_space(arg1.storage()) && - in_memory_space(arg2.storage())) { + if (in_memory_space(arg1.storage()) && + in_memory_space(arg2.storage())) { const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasDot(handle, size, device_data(arg1.storage()), 1, device_data(arg2.storage()), 1, &result)); } else { TA_ASSERT(false); // result = TiledArray::math::dot(size, storage.data(), storage.data()); } - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } template T btas_tensor_sum_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); + auto &stream = detail::get_stream_based_on_range(arg.range()); + auto device_id = deviceEnv::instance()->current_device_id(); auto &storage = arg.storage(); auto n = storage.size(); - auto result = sum_cuda_kernel(arg.data(), n, cuda_stream, device_id); + auto result = sum_cuda_kernel(arg.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } template T btas_tensor_product_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); + auto &stream = detail::get_stream_based_on_range(arg.range()); + auto device_id = deviceEnv::instance()->current_device_id(); auto &storage = arg.storage(); auto n = storage.size(); - auto result = product_cuda_kernel(arg.data(), n, cuda_stream, device_id); + auto result = product_cuda_kernel(arg.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } template T btas_tensor_min_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); + auto &stream = detail::get_stream_based_on_range(arg.range()); + auto device_id = deviceEnv::instance()->current_device_id(); auto &storage = arg.storage(); auto n = storage.size(); - auto result = min_cuda_kernel(arg.data(), n, cuda_stream, device_id); + auto result = min_cuda_kernel(arg.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } template T btas_tensor_max_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); + auto &stream = detail::get_stream_based_on_range(arg.range()); + auto device_id = deviceEnv::instance()->current_device_id(); auto &storage = arg.storage(); auto n = storage.size(); - auto result = max_cuda_kernel(arg.data(), n, cuda_stream, device_id); + auto result = max_cuda_kernel(arg.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } template T btas_tensor_absmin_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); + auto &stream = detail::get_stream_based_on_range(arg.range()); + auto device_id = deviceEnv::instance()->current_device_id(); auto &storage = arg.storage(); auto n = storage.size(); - auto result = absmin_cuda_kernel(arg.data(), n, cuda_stream, device_id); + auto result = absmin_cuda_kernel(arg.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } template T btas_tensor_absmax_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); + auto &stream = detail::get_stream_based_on_range(arg.range()); + auto device_id = deviceEnv::instance()->current_device_id(); auto &storage = arg.storage(); auto n = storage.size(); - auto result = absmax_cuda_kernel(arg.data(), n, cuda_stream, device_id); + auto result = absmax_cuda_kernel(arg.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } diff --git a/src/TiledArray/device/btas_um_tensor.cpp b/src/TiledArray/device/btas_um_tensor.cpp new file mode 100644 index 0000000000..270f30aad4 --- /dev/null +++ b/src/TiledArray/device/btas_um_tensor.cpp @@ -0,0 +1,52 @@ +// +// Created by Chong Peng on 7/24/18. +// + +// clang-format off +#include // provides c++17 features (stds::data, std::size) when compiling CUDA (i.e. c++14) +#include +// clang-format on + +#ifdef TILEDARRAY_HAS_CUDA + +template class btas::varray>; +template class btas::varray>; +template class btas::varray< + std::complex, + TiledArray::device_um_allocator>>; +template class btas::varray< + std::complex, TiledArray::device_um_allocator>>; +template class btas::varray>; +template class btas::varray>; + +template class btas::Tensor>; +template class btas::Tensor>; +template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::device_um_btas_varray>>; +template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::device_um_btas_varray>>; +template class btas::Tensor>; +template class btas::Tensor>; + +template class TiledArray::Tile>>; +template class TiledArray::Tile>>; +template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::device_um_btas_varray>>>; +template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::device_um_btas_varray>>>; +template class TiledArray::Tile>>; +template class TiledArray::Tile>>; + +#endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/btas_um_tensor.h b/src/TiledArray/device/btas_um_tensor.h similarity index 81% rename from src/TiledArray/cuda/btas_um_tensor.h rename to src/TiledArray/device/btas_um_tensor.h index 6342c54771..0c448a24b2 100644 --- a/src/TiledArray/cuda/btas_um_tensor.h +++ b/src/TiledArray/device/btas_um_tensor.h @@ -27,11 +27,12 @@ #include #include +#include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE -#include -#include +#include +#include #include #include @@ -39,15 +40,15 @@ namespace TiledArray { namespace detail { template -struct is_cuda_tile< - ::btas::Tensor>> +struct is_device_tile< + ::btas::Tensor>> : public std::true_type {}; template -void to_cuda(const TiledArray::btasUMTensorVarray &tile) { - cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id()); +void to_device(const TiledArray::btasUMTensorVarray &tile) { + device::setDevice(TiledArray::deviceEnv::instance()->current_device_id()); auto &stream = TiledArray::detail::get_stream_based_on_range(tile.range()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.storage(), stream); } @@ -64,12 +65,12 @@ struct ArchiveLoadImpl> { static inline void load(const Archive &ar, TiledArray::btasUMTensorVarray &t) { TiledArray::Range range{}; - TiledArray::cuda_um_btas_varray store{}; + TiledArray::device_um_btas_varray store{}; ar &range &store; t = TiledArray::btasUMTensorVarray(std::move(range), std::move(store)); - // cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id()); + // device::setDevice(TiledArray::deviceEnv::instance()->current_device_id()); // auto &stream = TiledArray::detail::get_stream_based_on_range(range); - // TiledArray::to_execution_space(t.storage(), + // TiledArray::to_execution_space(t.storage(), // stream); } }; @@ -78,11 +79,11 @@ template struct ArchiveStoreImpl> { static inline void store(const Archive &ar, const TiledArray::btasUMTensorVarray &t) { - CudaSafeCall(cudaSetDevice( - TiledArray::cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(TiledArray::device::setDevice( + TiledArray::deviceEnv::instance()->current_device_id())); auto &stream = TiledArray::detail::get_stream_based_on_range(t.range()); - TiledArray::to_execution_space(t.storage(), - stream); + TiledArray::to_execution_space( + t.storage(), stream); ar &t.range() & t.storage(); } }; @@ -135,25 +136,25 @@ btasUMTensorVarray shift(const btasUMTensorVarray &arg, // shift the range result_range.inplace_shift(range_shift); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); // @important select the stream using the shifted range - auto &cuda_stream = detail::get_stream_based_on_range(result_range); + auto &stream = detail::get_stream_based_on_range(result_range); typename btasUMTensorVarray::storage_type result_storage; - make_device_storage(result_storage, result_range.volume(), cuda_stream); + make_device_storage(result_storage, result_range.volume(), stream); btasUMTensorVarray result(std::move(result_range), std::move(result_storage)); // call cublasCopy const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasCopy(handle, result.size(), device_data(arg.storage()), 1, device_data(result.storage()), 1)); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } @@ -176,7 +177,7 @@ btasUMTensorVarray permute(const btasUMTensorVarray &arg, const TiledArray::Permutation &perm) { // compute result range auto result_range = perm * arg.range(); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); // compute the stream to use auto &stream = detail::get_stream_based_on_range(result_range); @@ -192,7 +193,7 @@ btasUMTensorVarray permute(const btasUMTensorVarray &arg, librett_permute(const_cast(device_data(arg.storage())), device_data(result.storage()), arg.range(), perm, stream); - synchronize_stream(&stream); + device::synchronize_stream(&stream); return result; } @@ -205,7 +206,7 @@ template >> btasUMTensorVarray scale(const btasUMTensorVarray &arg, const Scalar factor) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_scale_cuda_impl(arg, factor); } @@ -213,7 +214,7 @@ template >> btasUMTensorVarray &scale_to(btasUMTensorVarray &arg, const Scalar factor) { - detail::to_cuda(arg); + detail::to_device(arg); btas_tensor_scale_to_cuda_impl(arg, factor); return arg; } @@ -227,8 +228,8 @@ btasUMTensorVarray scale(const btasUMTensorVarray &arg, auto result = scale(arg, factor); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } @@ -239,7 +240,7 @@ btasUMTensorVarray scale(const btasUMTensorVarray &arg, template btasUMTensorVarray neg(const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_scale_cuda_impl(arg, T(-1.0)); } @@ -251,15 +252,15 @@ btasUMTensorVarray neg(const btasUMTensorVarray &arg, auto result = neg(arg); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } template btasUMTensorVarray &neg_to(btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); btas_tensor_scale_to_cuda_impl(arg, T(-1.0)); return arg; } @@ -271,8 +272,8 @@ btasUMTensorVarray &neg_to(btasUMTensorVarray &arg) { template btasUMTensorVarray subt(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { - detail::to_cuda(arg1); - detail::to_cuda(arg2); + detail::to_device(arg1); + detail::to_device(arg2); return btas_tensor_subt_cuda_impl(arg1, arg2, T(1.0)); } @@ -295,8 +296,8 @@ btasUMTensorVarray subt(const btasUMTensorVarray &arg1, auto result = subt(arg1, arg2); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } @@ -311,8 +312,8 @@ btasUMTensorVarray subt(const btasUMTensorVarray &arg1, auto result = subt(arg1, arg2, factor); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } @@ -325,8 +326,8 @@ template btasUMTensorVarray &subt_to( btasUMTensorVarray &result, const btasUMTensorVarray &arg1) { - detail::to_cuda(result); - detail::to_cuda(arg1); + detail::to_device(result); + detail::to_device(arg1); btas_tensor_subt_to_cuda_impl(result, arg1, T(1.0)); return result; } @@ -348,8 +349,8 @@ btasUMTensorVarray &subt_to(btasUMTensorVarray &result, template btasUMTensorVarray add(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { - detail::to_cuda(arg1); - detail::to_cuda(arg2); + detail::to_device(arg1); + detail::to_device(arg2); return btas_tensor_add_cuda_impl(arg1, arg2, T(1.0)); } @@ -373,8 +374,8 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, auto result = add(arg1, arg2, factor); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } @@ -388,8 +389,8 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, auto result = add(arg1, arg2); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } @@ -401,8 +402,8 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, template btasUMTensorVarray &add_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg) { - detail::to_cuda(result); - detail::to_cuda(arg); + detail::to_device(result); + detail::to_device(arg); btas_tensor_add_to_cuda_impl(result, arg, T(1.0)); return result; } @@ -424,8 +425,8 @@ template typename btasUMTensorVarray::value_type dot( const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { - detail::to_cuda(arg1); - detail::to_cuda(arg2); + detail::to_device(arg1); + detail::to_device(arg2); return btas_tensor_dot_cuda_impl(arg1, arg2); } @@ -435,8 +436,8 @@ typename btasUMTensorVarray::value_type dot( template btasUMTensorVarray mult(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { - detail::to_cuda(arg1); - detail::to_cuda(arg2); + detail::to_device(arg1); + detail::to_device(arg2); return btas_tensor_mult_cuda_impl(arg1, arg2); } @@ -459,8 +460,8 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, auto result = mult(arg1, arg2); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } @@ -475,8 +476,8 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, auto result = mult(arg1, arg2, factor); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } @@ -487,8 +488,8 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, template btasUMTensorVarray &mult_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg) { - detail::to_cuda(result); - detail::to_cuda(arg); + detail::to_device(result); + detail::to_device(arg); btas_tensor_mult_to_cuda_impl(result, arg); return result; } @@ -514,7 +515,7 @@ btasUMTensorVarray &mult_to(btasUMTensorVarray &result, template typename btasUMTensorVarray::value_type squared_norm( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_squared_norm_cuda_impl(arg); } @@ -525,7 +526,7 @@ typename btasUMTensorVarray::value_type squared_norm( template typename btasUMTensorVarray::value_type norm( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return std::sqrt(btas_tensor_squared_norm_cuda_impl(arg)); } @@ -544,7 +545,7 @@ typename btasUMTensorVarray::value_type trace( template typename btasUMTensorVarray::value_type sum( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_sum_cuda_impl(arg); } @@ -554,7 +555,7 @@ typename btasUMTensorVarray::value_type sum( template typename btasUMTensorVarray::value_type product( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_product_cuda_impl(arg); } @@ -564,7 +565,7 @@ typename btasUMTensorVarray::value_type product( template typename btasUMTensorVarray::value_type max( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_max_cuda_impl(arg); } @@ -574,7 +575,7 @@ typename btasUMTensorVarray::value_type max( template typename btasUMTensorVarray::value_type abs_max( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_absmax_cuda_impl(arg); } @@ -584,7 +585,7 @@ typename btasUMTensorVarray::value_type abs_max( template typename btasUMTensorVarray::value_type min( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_min_cuda_impl(arg); } @@ -594,7 +595,7 @@ typename btasUMTensorVarray::value_type min( template typename btasUMTensorVarray::value_type abs_min( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_absmin_cuda_impl(arg); } @@ -603,10 +604,11 @@ template void to_host( TiledArray::DistArray, Policy> &um_array) { auto to_host = [](TiledArray::Tile &tile) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); auto &stream = detail::get_stream_based_on_range(tile.range()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.tensor().storage(), stream); }; @@ -622,7 +624,7 @@ void to_host( } world.gop.fence(); - CudaSafeCall(cudaDeviceSynchronize()); + DeviceSafeCall(cudaDeviceSynchronize()); }; /// to device for UM Array @@ -630,10 +632,11 @@ template void to_device( TiledArray::DistArray, Policy> &um_array) { auto to_device = [](TiledArray::Tile &tile) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); auto &stream = detail::get_stream_based_on_range(tile.range()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.tensor().storage(), stream); }; @@ -649,7 +652,7 @@ void to_device( } world.gop.fence(); - CudaSafeCall(cudaDeviceSynchronize()); + DeviceSafeCall(device::deviceSynchronize()); }; /// convert array from UMTensor to TiledArray::Tensor @@ -661,12 +664,12 @@ um_tensor_to_ta_tensor( const auto convert_tile_memcpy = [](const UMTensor &tile) { TATensor result(tile.tensor().range()); - auto &stream = cudaEnv::instance()->cuda_stream_d2h(); - CudaSafeCall( + auto &stream = deviceEnv::instance()->stream_d2h(); + DeviceSafeCall( cudaMemcpyAsync(result.data(), tile.data(), tile.size() * sizeof(typename TATensor::value_type), cudaMemcpyDefault, stream)); - synchronize_stream(&stream); + device::synchronize_stream(&stream); return result; }; @@ -676,10 +679,11 @@ um_tensor_to_ta_tensor( using std::begin; const auto n = tile.tensor().size(); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); auto &stream = detail::get_stream_based_on_range(tile.range()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.tensor().storage(), stream); std::copy_n(tile.data(), n, result.data()); @@ -714,28 +718,30 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { auto convert_tile_memcpy = [](const TATensor &tile) { /// UMTensor must be wrapped into TA::Tile - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); using Tensor = typename UMTensor::tensor_type; - auto &stream = cudaEnv::instance()->cuda_stream_h2d(); + auto &stream = deviceEnv::instance()->stream_h2d(); typename Tensor::storage_type storage; make_device_storage(storage, tile.range().area(), stream); Tensor result(tile.range(), std::move(storage)); - CudaSafeCall( + DeviceSafeCall( cudaMemcpyAsync(result.data(), tile.data(), tile.size() * sizeof(typename Tensor::value_type), cudaMemcpyDefault, stream)); - synchronize_stream(&stream); + device::synchronize_stream(&stream); return TiledArray::Tile(std::move(result)); }; auto convert_tile_um = [](const TATensor &tile) { /// UMTensor must be wrapped into TA::Tile - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); using Tensor = typename UMTensor::tensor_type; typename Tensor::storage_type storage(tile.range().area()); @@ -749,7 +755,7 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { auto &stream = detail::get_stream_based_on_range(result.range()); // prefetch data to GPU - TiledArray::to_execution_space( + TiledArray::to_execution_space( result.storage(), stream); return TiledArray::Tile(std::move(result)); @@ -778,47 +784,49 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { #ifndef TILEDARRAY_HEADER_ONLY extern template class btas::varray>; -extern template class btas::varray>; + TiledArray::device_um_allocator>; +extern template class btas::varray>; extern template class btas::varray< - std::complex, TiledArray::cuda_um_allocator>>; + std::complex, + TiledArray::device_um_allocator>>; extern template class btas::varray< - std::complex, TiledArray::cuda_um_allocator>>; -extern template class btas::varray>; -extern template class btas::varray>; + std::complex, TiledArray::device_um_allocator>>; +extern template class btas::varray>; +extern template class btas::varray>; extern template class btas::Tensor>; + TiledArray::device_um_btas_varray>; extern template class btas::Tensor>; + TiledArray::device_um_btas_varray>; extern template class btas::Tensor< std::complex, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>; + TiledArray::device_um_btas_varray>>; extern template class btas::Tensor< std::complex, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>; + TiledArray::device_um_btas_varray>>; extern template class btas::Tensor>; + TiledArray::device_um_btas_varray>; extern template class btas::Tensor>; + TiledArray::device_um_btas_varray>; extern template class TiledArray::Tile>>; + double, TiledArray::Range, TiledArray::device_um_btas_varray>>; extern template class TiledArray::Tile>>; + float, TiledArray::Range, TiledArray::device_um_btas_varray>>; extern template class TiledArray::Tile< btas::Tensor, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>>; + TiledArray::device_um_btas_varray>>>; extern template class TiledArray::Tile< btas::Tensor, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>>; -extern template class TiledArray::Tile< - btas::Tensor>>; + TiledArray::device_um_btas_varray>>>; +extern template class TiledArray::Tile>>; extern template class TiledArray::Tile>>; + long, TiledArray::Range, TiledArray::device_um_btas_varray>>; #endif // TILEDARRAY_HEADER_ONLY -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE #endif // TILEDARRAY_CUDA_CUDA_UM_TENSOR_H diff --git a/src/TiledArray/cuda/cpu_cuda_vector.cu b/src/TiledArray/device/cpu_cuda_vector.cu similarity index 98% rename from src/TiledArray/cuda/cpu_cuda_vector.cu rename to src/TiledArray/device/cpu_cuda_vector.cu index 6c58fdd123..639cc56acc 100644 --- a/src/TiledArray/cuda/cpu_cuda_vector.cu +++ b/src/TiledArray/device/cpu_cuda_vector.cu @@ -1,5 +1,5 @@ -#include +#include namespace thrust { diff --git a/src/TiledArray/cuda/cpu_cuda_vector.h b/src/TiledArray/device/cpu_cuda_vector.h similarity index 96% rename from src/TiledArray/cuda/cpu_cuda_vector.h rename to src/TiledArray/device/cpu_cuda_vector.h index 5a6e52beb5..8c7b32900a 100644 --- a/src/TiledArray/cuda/cpu_cuda_vector.h +++ b/src/TiledArray/device/cpu_cuda_vector.h @@ -4,8 +4,8 @@ #include -#include -#include +#include +#include #include @@ -165,8 +165,8 @@ template bool in_memory_space( const cpu_cuda_vector& vec) noexcept { - return (vec.on_host() && overlap(MemorySpace::CPU, Space)) || - (vec.on_device() && overlap(MemorySpace::CUDA, Space)); + return (vec.on_host() && overlap(MemorySpace::Host, Space)) || + (vec.on_device() && overlap(MemorySpace::Device, Space)); } template & vec, cudaStream_t stream = 0) { switch (Space) { - case ExecutionSpace::CPU: { + case ExecutionSpace::Host: { vec.to_host(); break; } - case ExecutionSpace::CUDA: { + case ExecutionSpace::Device: { vec.to_device(); break; } diff --git a/src/TiledArray/cuda/cublas.h b/src/TiledArray/device/cublas.h similarity index 100% rename from src/TiledArray/cuda/cublas.h rename to src/TiledArray/device/cublas.h diff --git a/src/TiledArray/cuda/cuda_task_fn.h b/src/TiledArray/device/device_task_fn.h similarity index 72% rename from src/TiledArray/cuda/cuda_task_fn.h rename to src/TiledArray/device/device_task_fn.h index 8de133b3bd..a4b9db92e4 100644 --- a/src/TiledArray/cuda/cuda_task_fn.h +++ b/src/TiledArray/device/device_task_fn.h @@ -7,23 +7,22 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE -#include +#include #include -#include #include namespace TiledArray { namespace detail { template -std::atomic& cuda_callback_duration_ns() { +std::atomic& device_callback_duration_ns() { static std::atomic value{0}; return value; } -inline std::atomic& cuda_taskfn_callback_duration_ns() { +inline std::atomic& device_taskfn_callback_duration_ns() { static std::atomic value{0}; return value; } @@ -34,8 +33,8 @@ inline std::atomic& cuda_taskfn_callback_duration_ns() { namespace madness { /// -/// cudaTaskFn class -/// represent a task that calls an async cuda kernel +/// deviceTaskFn class +/// represent a task that calls an async device kernel /// the task must call synchronize_stream function to tell which stream it /// used /// @@ -44,55 +43,55 @@ template -struct cudaTaskFn : public TaskInterface { - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg1T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg2T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg3T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg4T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg5T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg6T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg7T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg8T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg9T cannot be a const " - "or reference type"); +struct deviceTaskFn : public TaskInterface { + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg1T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg2T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg3T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg4T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg5T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg6T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg7T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg8T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg9T cannot be a const " + "or reference type"); private: /// This class type - typedef cudaTaskFn - cudaTaskFn_; + typedef deviceTaskFn + deviceTaskFn_; friend class AsyncTaskInterface; - /// internal Task structure that wraps the Async cuda function + /// internal Task structure that wraps the Async device function struct AsyncTaskInterface : public madness::TaskInterface { - AsyncTaskInterface(cudaTaskFn_* task, int ndepend = 0, + AsyncTaskInterface(deviceTaskFn_* task, int ndepend = 0, const TaskAttributes attr = TaskAttributes()) : TaskInterface(ndepend, attr), task_(task) {} @@ -105,7 +104,7 @@ struct cudaTaskFn : public TaskInterface { task_->run_async(); // get the stream used by async function - auto stream = TiledArray::tls_cudastream_accessor(); + auto stream = TiledArray::device::tls_stream_accessor(); // TA_ASSERT(stream != nullptr); @@ -113,32 +112,33 @@ struct cudaTaskFn : public TaskInterface { if (stream == nullptr) { task_->notify(); } else { - // TODO should we use cuda callback or cuda events?? - // insert cuda callback - cudaLaunchHostFunc(*stream, cuda_callback, task_); + // TODO should we use device callback or device events?? + // insert device callback + TiledArray::device::launchHostFunc(*stream, device_callback, task_); // reset stream to nullptr - TiledArray::synchronize_stream(nullptr); + TiledArray::device::synchronize_stream(nullptr); } } private: - static void CUDART_CB cuda_callback(void* userData) { + static void DEVICERT_CB device_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); // convert void * to AsyncTaskInterface* - auto* callback = static_cast(userData); + auto* callback = static_cast(userData); // std::stringstream address; // address << (void*) callback; - // std::string message = "callback on cudaTaskFn: " + address.str() + + // std::string message = "callback on deviceTaskFn: " + address.str() + // + // '\n'; std::cout << message; callback->notify(); const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_taskfn_callback_duration_ns() += + TiledArray::detail::device_taskfn_callback_duration_ns() += TiledArray::duration_in_ns(t0, t1); } - cudaTaskFn_* task_; + deviceTaskFn_* task_; }; public: @@ -160,7 +160,7 @@ struct cudaTaskFn : public TaskInterface { futureT result_; ///< The task Future result const functionT func_; ///< The task function TaskInterface* async_task_; ///< The internal AsyncTaskInterface that wraps - ///< the async cuda function + ///< the async device function futureT async_result_; ///< the future returned from the async task // If the value of the argument is known at the time the @@ -258,7 +258,7 @@ struct cudaTaskFn : public TaskInterface { /// Check dependencies and register callbacks where necessary void check_dependencies() { - this->inc(); // the current cudaTaskFn depends on the internal + this->inc(); // the current deviceTaskFn depends on the internal // AsyncTaskInterface, dependency = 1 check_dependency(arg1_); check_dependency(arg2_); @@ -272,13 +272,14 @@ struct cudaTaskFn : public TaskInterface { } // Copies are not allowed. - cudaTaskFn(const cudaTaskFn_&); - cudaTaskFn_ operator=(cudaTaskFn_&); + deviceTaskFn(const deviceTaskFn_&); + deviceTaskFn_ operator=(deviceTaskFn_&); public: #if MADNESS_TASKQ_VARIADICS - cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -298,8 +299,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -319,8 +320,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -340,8 +341,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -361,8 +362,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -383,8 +384,8 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -405,8 +406,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -427,9 +429,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -450,9 +452,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -474,9 +476,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8, - a9T&& a9, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8, + a9T&& a9, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -495,8 +497,9 @@ struct cudaTaskFn : public TaskInterface { check_dependencies(); } - cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr, - archive::BufferInputArchive& input_arch) + deviceTaskFn(const futureT& result, functionT func, + const TaskAttributes& attr, + archive::BufferInputArchive& input_arch) : TaskInterface(attr), result_(result), func_(func), @@ -514,7 +517,8 @@ struct cudaTaskFn : public TaskInterface { check_dependencies(); } #else // MADNESS_TASKQ_VARIADICS - cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -534,8 +538,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -555,8 +559,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const TaskAttributes& attr = TaskAttributes()) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const TaskAttributes& attr = TaskAttributes()) : TaskInterface(attr), result_(result), func_(func), @@ -576,8 +580,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -597,9 +601,9 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -620,9 +624,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -643,9 +647,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const a6T& a6, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const a6T& a6, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -666,9 +670,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const a6T& a6, const a7T& a7, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const a6T& a6, const a7T& a7, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -689,10 +693,10 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const a6T& a6, const a7T& a7, const a8T& a8, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const a6T& a6, const a7T& a7, const a8T& a8, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -714,10 +718,10 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const a6T& a6, const a7T& a7, const a8T& a8, const a9T& a9, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const a6T& a6, const a7T& a7, const a8T& a8, const a9T& a9, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -736,8 +740,9 @@ struct cudaTaskFn : public TaskInterface { check_dependencies(); } - cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr, - archive::BufferInputArchive& input_arch) + deviceTaskFn(const futureT& result, functionT func, + const TaskAttributes& attr, + archive::BufferInputArchive& input_arch) : TaskInterface(attr), result_(result), func_(func), @@ -757,7 +762,7 @@ struct cudaTaskFn : public TaskInterface { #endif // MADNESS_TASKQ_VARIADICS // no need to delete async_task_, as it will be deleted by the TaskQueue - virtual ~cudaTaskFn() = default; + virtual ~deviceTaskFn() = default; const futureT& result() const { return result_; } @@ -770,16 +775,16 @@ struct cudaTaskFn : public TaskInterface { } #else protected: - /// when this cudaTaskFn gets run, it means the AsyncTaskInterface is done + /// when this deviceTaskFn gets run, it means the AsyncTaskInterface is done /// set the result with async_result_, which is finished void run(const TaskThreadEnv& env) override { result_.set(std::move(async_result_)); } #endif // HAVE_INTEL_TBB -}; // class cudaTaskFn +}; // class deviceTaskFn -/// add a cudaTaskFn object to World +/// add a deviceTaskFn object to World /// \tparam fnT A function pointer or functor /// \tparam a1T Type of argument 1. /// \tparam a2T Type of argument 2. @@ -794,15 +799,15 @@ struct cudaTaskFn : public TaskInterface { /// \return Description needed. template -typename cudaTaskFn::futureT -add_cuda_taskfn( +typename deviceTaskFn::futureT +add_device_taskfn( madness::World& world, - cudaTaskFn* t) { - typename cudaTaskFn::futureT - res(t->result()); - // add the internal async task in cuda task as well + deviceTaskFn* t) { + typename deviceTaskFn::futureT res(t->result()); + // add the internal async task in device task as well world.taskq.add(t->async_task()); - // add the cuda task + // add the device task world.taskq.add(static_cast(t)); return res; } @@ -815,13 +820,13 @@ template < typename fnT, typename... argsT, typename = std::enable_if_t::value>> typename detail::function_enabler...)>::type -add_cuda_task(madness::World& world, fnT&& fn, argsT&&... args) { - /// type of cudaTaskFn object +add_device_task(madness::World& world, fnT&& fn, argsT&&... args) { + /// type of deviceTaskFn object using taskT = - cudaTaskFn, - std::remove_const_t>...>; + deviceTaskFn, + std::remove_const_t>...>; - return add_cuda_taskfn( + return add_device_taskfn( world, new taskT(typename taskT::futureT(), std::forward(fn), std::forward(args)..., TaskAttributes())); } @@ -835,13 +840,13 @@ template < typename = std::enable_if_t::value>> typename meta::drop_last_arg_and_apply_callable< detail::function_enabler, fnT, future_to_ref_t...>::type::type -add_cuda_task(madness::World& world, fnT&& fn, argsT&&... args) { - /// type of cudaTaskFn object +add_device_task(madness::World& world, fnT&& fn, argsT&&... args) { + /// type of deviceTaskFn object using taskT = typename meta::drop_last_arg_and_apply< - cudaTaskFn, std::decay_t, + deviceTaskFn, std::decay_t, std::remove_const_t>...>::type; - return add_cuda_taskfn( + return add_device_taskfn( world, new taskT(typename taskT::futureT(), std::forward(fn), std::forward(args)...)); } @@ -852,14 +857,14 @@ add_cuda_task(madness::World& world, fnT&& fn, argsT&&... args) { /// \tparam argsT variadic template for arguments /// \return A future to the result template -typename detail::memfunc_enabler::type add_cuda_task( +typename detail::memfunc_enabler::type add_device_task( madness::World& world, objT&& obj, memfnT memfn, argsT&&... args) { - return add_cuda_task(world, - detail::wrap_mem_fn(std::forward(obj), memfn), - std::forward(args)...); + return add_device_task(world, + detail::wrap_mem_fn(std::forward(obj), memfn), + std::forward(args)...); } } // namespace madness -#endif // TILDARRAY_HAS_CUDA +#endif // TILDARRAY_HAS_DEVICE #endif // TILEDARRAY_CUDA_CUDA_TASK_FN_H__INCLUDED diff --git a/src/TiledArray/cuda/kernel/mult_kernel.cu b/src/TiledArray/device/kernel/mult_kernel.cu similarity index 96% rename from src/TiledArray/cuda/kernel/mult_kernel.cu rename to src/TiledArray/device/kernel/mult_kernel.cu index aa3cadbc72..ca2d86d4b9 100644 --- a/src/TiledArray/cuda/kernel/mult_kernel.cu +++ b/src/TiledArray/device/kernel/mult_kernel.cu @@ -21,8 +21,8 @@ * */ -#include -#include +#include +#include #ifdef TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/kernel/mult_kernel.h b/src/TiledArray/device/kernel/mult_kernel.h similarity index 100% rename from src/TiledArray/cuda/kernel/mult_kernel.h rename to src/TiledArray/device/kernel/mult_kernel.h diff --git a/src/TiledArray/cuda/kernel/mult_kernel_impl.h b/src/TiledArray/device/kernel/mult_kernel_impl.h similarity index 95% rename from src/TiledArray/cuda/kernel/mult_kernel_impl.h rename to src/TiledArray/device/kernel/mult_kernel_impl.h index b237dfab1e..a1c217ce3d 100644 --- a/src/TiledArray/cuda/kernel/mult_kernel_impl.h +++ b/src/TiledArray/device/kernel/mult_kernel_impl.h @@ -34,7 +34,7 @@ namespace TiledArray { template void mult_to_cuda_kernel_impl(T *result, const T *arg, std::size_t n, cudaStream_t stream, int device_id) { - CudaSafeCall(cudaSetDevice(device_id)); + DeviceSafeCall(device::setDevice(device_id)); thrust::multiplies mul_op; thrust::transform( @@ -47,7 +47,7 @@ void mult_to_cuda_kernel_impl(T *result, const T *arg, std::size_t n, template void mult_cuda_kernel_impl(T *result, const T *arg1, const T *arg2, std::size_t n, cudaStream_t stream, int device_id) { - CudaSafeCall(cudaSetDevice(device_id)); + DeviceSafeCall(device::setDevice(device_id)); thrust::multiplies mul_op; thrust::transform( diff --git a/src/TiledArray/cuda/kernel/reduce_kernel.cu b/src/TiledArray/device/kernel/reduce_kernel.cu similarity index 98% rename from src/TiledArray/cuda/kernel/reduce_kernel.cu rename to src/TiledArray/device/kernel/reduce_kernel.cu index d24669b920..a09b3f7a41 100644 --- a/src/TiledArray/cuda/kernel/reduce_kernel.cu +++ b/src/TiledArray/device/kernel/reduce_kernel.cu @@ -21,8 +21,8 @@ * */ -#include -#include +#include +#include #ifdef TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/kernel/reduce_kernel.h b/src/TiledArray/device/kernel/reduce_kernel.h similarity index 100% rename from src/TiledArray/cuda/kernel/reduce_kernel.h rename to src/TiledArray/device/kernel/reduce_kernel.h diff --git a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h b/src/TiledArray/device/kernel/reduce_kernel_impl.h similarity index 95% rename from src/TiledArray/cuda/kernel/reduce_kernel_impl.h rename to src/TiledArray/device/kernel/reduce_kernel_impl.h index 9dc6507cca..f03e333dbb 100644 --- a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h +++ b/src/TiledArray/device/kernel/reduce_kernel_impl.h @@ -26,7 +26,7 @@ #include -#include +#include #include #include #include @@ -57,7 +57,7 @@ struct absolute_value template T reduce_cuda_kernel_impl(ReduceOp &&op, const T *arg, std::size_t n, T init, cudaStream_t stream, int device_id) { - CudaSafeCall(cudaSetDevice(device_id)); + DeviceSafeCall(device::setDevice(device_id)); auto arg_p = thrust::device_pointer_cast(arg); @@ -107,7 +107,7 @@ TiledArray::detail::scalar_t absmax_reduce_cuda_kernel_impl( thrust::maximum max_op; detail::absolute_value abs_op; - CudaSafeCall(cudaSetDevice(device_id)); + DeviceSafeCall(device::setDevice(device_id)); auto arg_p = thrust::device_pointer_cast(arg); @@ -125,7 +125,7 @@ TiledArray::detail::scalar_t absmin_reduce_cuda_kernel_impl( thrust::minimum min_op; detail::absolute_value abs_op; - CudaSafeCall(cudaSetDevice(device_id)); + DeviceSafeCall(device::setDevice(device_id)); auto arg_p = thrust::device_pointer_cast(arg); diff --git a/src/TiledArray/cuda/platform.h b/src/TiledArray/device/platform.h similarity index 93% rename from src/TiledArray/cuda/platform.h rename to src/TiledArray/device/platform.h index f94226b39e..9d0cac3cdd 100644 --- a/src/TiledArray/cuda/platform.h +++ b/src/TiledArray/device/platform.h @@ -31,9 +31,9 @@ enum class MemorySpace { // MemorySpace is represented as a bitfield to compute unions and // intersections easier Null = 0b00, - CPU = 0b01, - CUDA = 0b10, - CUDA_UM = CPU | CUDA // union of CPU and CUDA spaces + Host = 0b01, + Device = 0b10, + Device_UM = Host | Device // union of host and device spaces }; // customization point: in_memory_space(O) -> bool @@ -55,7 +55,7 @@ constexpr bool overlap(MemorySpace space1, MemorySpace space2) { } /// enumerates the execution spaces -enum class ExecutionSpace { CPU, CUDA }; +enum class ExecutionSpace { Host, Device }; // customization point: to_execution_space(O) -> void // "moves" O to execution space S diff --git a/src/TiledArray/cuda/thrust.h b/src/TiledArray/device/thrust.h similarity index 100% rename from src/TiledArray/cuda/thrust.h rename to src/TiledArray/device/thrust.h diff --git a/src/TiledArray/cuda/um_storage.cu b/src/TiledArray/device/um_storage.cu similarity index 66% rename from src/TiledArray/cuda/um_storage.cu rename to src/TiledArray/device/um_storage.cu index a16811e91b..cc3a1aae55 100644 --- a/src/TiledArray/cuda/um_storage.cu +++ b/src/TiledArray/device/um_storage.cu @@ -22,29 +22,29 @@ */ -#include -#include +#include +#include #ifdef TILEDARRAY_HAS_CUDA namespace thrust { template<> -void resize>( - thrust::device_vector>& dev_vec, +void resize>( + thrust::device_vector>& dev_vec, size_t size) { dev_vec.resize(size); } template<> -void resize>( - thrust::device_vector>& dev_vec, +void resize>( + thrust::device_vector>& dev_vec, size_t size) { dev_vec.resize(size); } } namespace thrust { -template class device_vector>; -template class device_vector>; +template class device_vector>; +template class device_vector>; } #endif //TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/um_storage.h b/src/TiledArray/device/um_storage.h similarity index 78% rename from src/TiledArray/cuda/um_storage.h rename to src/TiledArray/device/um_storage.h index bea591cbb2..e4318e5666 100644 --- a/src/TiledArray/cuda/um_storage.h +++ b/src/TiledArray/device/um_storage.h @@ -24,15 +24,15 @@ #ifndef TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED #define TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED -#include -#include +#include +#include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE #include #include -#include +#include #include #include @@ -41,40 +41,41 @@ namespace TiledArray { template using cuda_um_thrust_vector = - thrust::device_vector>; + thrust::device_vector>; /// @return true if @c dev_vec is present in space @space template bool in_memory_space(const Storage& vec) noexcept { - return overlap(MemorySpace::CUDA_UM, Space); + return overlap(MemorySpace::Device_UM, Space); } /** * @tparam Space - * @tparam Storage the Storage type of the vector, such as cuda_um_btas_varray + * @tparam Storage the Storage type of the vector, such as + * device_um_btas_varray */ template void to_execution_space(Storage& vec, cudaStream_t stream = 0) { switch (Space) { - case ExecutionSpace::CPU: { + case ExecutionSpace::Host: { using std::data; using std::size; using value_type = typename Storage::value_type; - if (cudaEnv::instance()->concurrent_managed_access()) { - CudaSafeCall(cudaMemPrefetchAsync(data(vec), - size(vec) * sizeof(value_type), - cudaCpuDeviceId, stream)); + if (deviceEnv::instance()->concurrent_managed_access()) { + DeviceSafeCall(cudaMemPrefetchAsync(data(vec), + size(vec) * sizeof(value_type), + cudaCpuDeviceId, stream)); } break; } - case ExecutionSpace::CUDA: { + case ExecutionSpace::Device: { using std::data; using std::size; using value_type = typename Storage::value_type; int device = -1; - if (cudaEnv::instance()->concurrent_managed_access()) { - CudaSafeCall(cudaGetDevice(&device)); - CudaSafeCall(cudaMemPrefetchAsync( + if (deviceEnv::instance()->concurrent_managed_access()) { + DeviceSafeCall(cudaGetDevice(&device)); + DeviceSafeCall(cudaMemPrefetchAsync( data(vec), size(vec) * sizeof(value_type), device, stream)); } break; @@ -95,8 +96,8 @@ template void make_device_storage(Storage& storage, std::size_t n, const cudaStream_t& stream = 0) { storage = Storage(n); - TiledArray::to_execution_space(storage, - stream); + TiledArray::to_execution_space(storage, + stream); } /** @@ -131,7 +132,7 @@ struct ArchiveLoadImpl> { static inline void load(const Archive& ar, TiledArray::cuda_um_thrust_vector& x) { typename thrust::device_vector< - T, TiledArray::cuda_um_allocator>::size_type n(0); + T, TiledArray::device_um_allocator>::size_type n(0); ar& n; x.resize(n); for (auto& xi : x) ar& xi; diff --git a/src/TiledArray/dist_eval/binary_eval.h b/src/TiledArray/dist_eval/binary_eval.h index a4c203d3dd..fa33d74d9c 100644 --- a/src/TiledArray/dist_eval/binary_eval.h +++ b/src/TiledArray/dist_eval/binary_eval.h @@ -123,12 +123,12 @@ class BinaryEvalImpl : public DistEvalImpl, private: /// Task function for evaluating tiles -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// \param i The tile index /// \param left The left-hand tile /// \param right The right-hand tile template - std::enable_if_t, void> eval_tile( + std::enable_if_t, void> eval_tile( const ordinal_type i, L left, R right) { DistEvalImpl_::set_tile(i, op_(left, right)); } @@ -137,11 +137,11 @@ class BinaryEvalImpl : public DistEvalImpl, /// \param left The left-hand tile /// \param right The right-hand tile template - std::enable_if_t, void> eval_tile( + std::enable_if_t, void> eval_tile( const ordinal_type i, L left, R right) { // TODO avoid copy the Op object auto result_tile = - madness::add_cuda_task(DistEvalImpl_::world(), op_, left, right); + madness::add_device_task(DistEvalImpl_::world(), op_, left, right); DistEvalImpl_::set_tile(i, result_tile); } #else diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h index ae3b456bc0..0d07821a4e 100644 --- a/src/TiledArray/dist_eval/contraction_eval.h +++ b/src/TiledArray/dist_eval/contraction_eval.h @@ -31,11 +31,11 @@ #include -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL 1 -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE 1 -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_STEP 1 -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST 1 -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_STEP 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE 1 namespace TiledArray { namespace detail { @@ -479,7 +479,7 @@ class Summa static typename std::enable_if< is_lazy_tile::value #ifdef TILEDARRAY_HAS_CUDA - && !detail::is_cuda_tile_v + && !detail::is_device_tile_v #endif , Future>::type @@ -502,13 +502,14 @@ class Summa template static typename std::enable_if< is_lazy_tile::value && - detail::is_cuda_tile_v, + detail::is_device_tile_v, Future>::type get_tile(Arg& arg, const typename Arg::ordinal_type index) { auto convert_tile_fn = &Summa_::template convert_tile; - return madness::add_cuda_task(arg.world(), convert_tile_fn, arg.get(index), - madness::TaskAttributes::hipri()); + return madness::add_device_task(arg.world(), convert_tile_fn, + arg.get(index), + madness::TaskAttributes::hipri()); } #endif diff --git a/src/TiledArray/dist_eval/dist_eval.h b/src/TiledArray/dist_eval/dist_eval.h index b1056e0ac1..b45a85bda0 100644 --- a/src/TiledArray/dist_eval/dist_eval.h +++ b/src/TiledArray/dist_eval/dist_eval.h @@ -26,7 +26,7 @@ #include #include #ifdef TILEDARRAY_HAS_CUDA -#include +#include #include #endif diff --git a/src/TiledArray/dist_eval/unary_eval.h b/src/TiledArray/dist_eval/unary_eval.h index b3707b92c2..32052d2700 100644 --- a/src/TiledArray/dist_eval/unary_eval.h +++ b/src/TiledArray/dist_eval/unary_eval.h @@ -115,18 +115,18 @@ class UnaryEvalImpl /// \param i The tile index /// \param tile The tile to be evaluated template - std::enable_if_t, void> eval_tile( + std::enable_if_t, void> eval_tile( const ordinal_type i, tile_argument_type tile) { // TODO avoid copy Op object auto result_tile = - madness::add_cuda_task(DistEvalImpl_::world(), op_, tile); + madness::add_device_task(DistEvalImpl_::world(), op_, tile); DistEvalImpl_::set_tile(i, result_tile); } /// \param i The tile index /// \param tile The tile to be evaluated template - std::enable_if_t, void> eval_tile( + std::enable_if_t, void> eval_tile( const ordinal_type i, tile_argument_type tile) { DistEvalImpl_::set_tile(i, op_(tile)); } diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index 1a7bc2ff05..d2bbe7c673 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -41,7 +41,7 @@ #include "TiledArray/tile_interface/trace.h" #include "expr_engine.h" #ifdef TILEDARRAY_HAS_CUDA -#include +#include #include #endif @@ -187,7 +187,7 @@ class Expr { typename std::enable_if::value && is_lazy_tile::value #ifdef TILEDARRAY_HAS_CUDA - && !::TiledArray::detail::is_cuda_tile_v + && !::TiledArray::detail::is_device_tile_v #endif >::type* = nullptr> void set_tile(A& array, const I& index, const Future& tile) const { @@ -210,9 +210,9 @@ class Expr { typename std::enable_if< !std::is_same::value && is_lazy_tile::value && - ::TiledArray::detail::is_cuda_tile_v>::type* = nullptr> + ::TiledArray::detail::is_device_tile_v>::type* = nullptr> void set_tile(A& array, const I& index, const Future& tile) const { - array.set(index, madness::add_cuda_task( + array.set(index, madness::add_device_task( array.world(), TiledArray::Cast(), tile)); } @@ -247,7 +247,7 @@ class Expr { typename A, typename I, typename T, typename Op, typename std::enable_if::value #ifdef TILEDARRAY_HAS_CUDA - && !::TiledArray::detail::is_cuda_tile_v + && !::TiledArray::detail::is_device_tile_v #endif >::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, @@ -275,14 +275,14 @@ class Expr { template ::value && - ::TiledArray::detail::is_cuda_tile_v>::type* = nullptr> + ::TiledArray::detail::is_device_tile_v>::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, const std::shared_ptr& op) const { auto eval_tile_fn = &Expr_::template eval_tile< typename A::value_type, const T&, TiledArray::Cast, Op>; array.set(index, - madness::add_cuda_task( + madness::add_device_task( array.world(), eval_tile_fn, tile, TiledArray::Cast(), op)); } @@ -304,7 +304,7 @@ class Expr { typename A, typename I, typename T, typename Op, typename std::enable_if::value #ifdef TILEDARRAY_HAS_CUDA - && !::TiledArray::detail::is_cuda_tile_v + && !::TiledArray::detail::is_device_tile_v #endif >::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, @@ -332,7 +332,7 @@ class Expr { template ::value&& ::TiledArray:: - detail::is_cuda_tile_v>::type* = nullptr> + detail::is_device_tile_v>::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, const std::shared_ptr& op) const { auto eval_tile_fn_ptr = &Expr_::template eval_tile; @@ -340,8 +340,8 @@ class Expr { static_assert(madness::detail::function_traits&)>::value, "ouch"); - array.set(index, madness::add_cuda_task(array.world(), eval_tile_fn_ptr, - tile, op)); + array.set(index, madness::add_device_task(array.world(), eval_tile_fn_ptr, + tile, op)); } #endif diff --git a/src/TiledArray/external/cuda.h b/src/TiledArray/external/cuda.h index 1d169b1098..7a6e4d50e1 100644 --- a/src/TiledArray/external/cuda.h +++ b/src/TiledArray/external/cuda.h @@ -1,509 +1,3 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Chong Peng - * Department of Chemistry, Virginia Tech - * July 23, 2018 - * - */ - -#ifndef TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED -#define TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED - -#include -#include -#include - -#include - -#ifdef TILEDARRAY_HAS_CUDA - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -#include - -#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__) -#define CudaSafeCallNoThrow(err) __cudaSafeCallNoThrow(err, __FILE__, __LINE__) -#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__) - -inline void __cudaSafeCall(cudaError err, const char* file, const int line) { - if (cudaSuccess != err) { - std::stringstream ss; - ss << "cudaSafeCall() failed at: " << file << ":" << line; - std::string what = ss.str(); - throw thrust::system_error(err, thrust::cuda_category(), what); - } -} - -inline void __cudaSafeCallNoThrow(cudaError err, const char* file, - const int line) { - if (cudaSuccess != err) { - madness::print_error("cudaSafeCallNoThrow() failed at: ", file, ":", line); - } -} - -inline void __cudaCheckError(const char* file, const int line) { - cudaError err = cudaGetLastError(); - if (cudaSuccess != err) { - std::stringstream ss; - ss << "cudaCheckError() failed at: " << file << ":" << line; - std::string what = ss.str(); - throw thrust::system_error(err, thrust::cuda_category(), what); - } -} - -namespace TiledArray { - -namespace detail { - -inline int num_cuda_streams() { - int num_streams = -1; - char* num_stream_char = std::getenv("TA_CUDA_NUM_STREAMS"); - /// default num of streams is 3 - if (num_stream_char) { - num_streams = std::atoi(num_stream_char); - } else { - num_streams = 3; - } - return num_streams; -} - -inline int num_cuda_devices() { - int num_devices = -1; - CudaSafeCall(cudaGetDeviceCount(&num_devices)); - return num_devices; -} - -inline int current_cuda_device_id(World& world) { - int mpi_local_size = -1; - int mpi_local_rank = -1; - std::tie(mpi_local_rank, mpi_local_size) = mpi_local_rank_size(world); - - int num_devices = detail::num_cuda_devices(); - - int cuda_device_id = -1; - // devices may already be pre-mapped - // if mpi_local_size <= num_devices : all ranks are in same resource set, map - // round robin - if (mpi_local_size <= num_devices) { - cuda_device_id = mpi_local_rank % num_devices; - } else { // mpi_local_size > num_devices - char* cvd_cstr = std::getenv("CUDA_VISIBLE_DEVICES"); - if (cvd_cstr) { // CUDA_VISIBLE_DEVICES is set, assume that pre-mapped - // make sure that there is only 1 device available here - if (num_devices != 1) { - throw std::runtime_error( - std::string( - "CUDA_VISIBLE_DEVICES environment variable is set, hence using " - "the provided device-to-rank mapping; BUT TiledArray found ") + - std::to_string(num_devices) + - " CUDA devices; only 1 CUDA device / MPI process is supported"); - } - cuda_device_id = 0; - } else { // not enough devices + devices are not pre-mapped - throw std::runtime_error( - std::string("TiledArray found ") + std::to_string(mpi_local_size) + - " MPI ranks on a node with " + std::to_string(num_devices) + - " CUDA devices; only 1 MPI process / CUDA device model is currently " - "supported"); - } - } - - return cuda_device_id; -} - -inline void CUDART_CB cuda_readyflag_callback(void* userData) { - // convert void * to std::atomic - std::atomic* flag = static_cast*>(userData); - // set the flag to be true - flag->store(true); -} - -struct ProbeFlag { - ProbeFlag(std::atomic* f) : flag(f) {} - - bool operator()() const { return flag->load(); } - - std::atomic* flag; -}; - -inline void thread_wait_cuda_stream(const cudaStream_t& stream) { - std::atomic* flag = new std::atomic(false); - - CudaSafeCall( - cudaLaunchHostFunc(stream, detail::cuda_readyflag_callback, flag)); - - detail::ProbeFlag probe(flag); - - // wait with sleep and do not do work - madness::ThreadPool::await(probe, false, true); - // madness::ThreadPool::await(probe, true, true); - - delete flag; -} - -} // namespace detail - -inline const cudaStream_t*& tls_cudastream_accessor() { - static thread_local const cudaStream_t* thread_local_stream_ptr{nullptr}; - return thread_local_stream_ptr; -} - -inline void synchronize_stream(const cudaStream_t* stream) { - tls_cudastream_accessor() = stream; -} - -/** - * cudaEnv maintains the CUDA-related part of the runtime environment, - * such as CUDA-specific memory allocators - * - * \note this is a Singleton - */ -class cudaEnv { - public: - ~cudaEnv() { - // destroy cuda streams on current device - for (auto& stream : cuda_streams_) { - CudaSafeCallNoThrow(cudaStreamDestroy(stream)); - } - } - - cudaEnv(const cudaEnv&) = delete; - cudaEnv(cudaEnv&&) = delete; - cudaEnv& operator=(const cudaEnv&) = delete; - cudaEnv& operator=(cudaEnv&&) = delete; - - /// access the singleton instance; if not initialized will be - /// initialized via cudaEnv::initialize() with the default params - static std::unique_ptr& instance() { - if (!instance_accessor()) { - initialize(); - } - return instance_accessor(); - } - - // clang-format off - /// initialize the instance using explicit params - /// \param world the world to use for initialization - /// \param page_size memory added to the pools supporting `this->um_allocator()`, `this->device_allocator()`, and `this->pinned_allocator()` in chunks of at least - /// this size (bytes) [default=2^25] - /// \param pinned_alloc_limit the maximum total amount of memory (in bytes) that - /// allocator returned by `this->pinned_allocator()` can allocate; - /// this allocator is not used by default [default=0] - // clang-format on - static void initialize(World& world = TiledArray::get_default_world(), - const std::uint64_t page_size = (1ul << 25), - const std::uint64_t pinned_alloc_limit = (1ul << 40)) { - static std::mutex mtx; // to make initialize() reentrant - std::scoped_lock lock{mtx}; - // only the winner of the lock race gets to initialize - if (instance_accessor() == nullptr) { - int num_streams = detail::num_cuda_streams(); - int num_devices = detail::num_cuda_devices(); - int device_id = detail::current_cuda_device_id(world); - // set device for current MPI process .. will be set in the ctor as well - CudaSafeCall(cudaSetDevice(device_id)); - CudaSafeCall(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); - - // uncomment to debug umpire ops - // - // umpire::util::Logger::getActiveLogger()->setLoggingMsgLevel( - // umpire::util::message::Debug); - - // make Thread Safe UM Dynamic POOL - - auto& rm = umpire::ResourceManager::getInstance(); - - auto mem_total_free = cudaEnv::memory_total_and_free_device(); - - // turn off Umpire introspection for non-Debug builds -#ifndef NDEBUG - constexpr auto introspect = true; -#else - constexpr auto introspect = false; -#endif - - // allocate all currently-free memory for UM pool - auto um_dynamic_pool = - rm.makeAllocator( - "UMDynamicPool", rm.getAllocator("UM"), mem_total_free.second, - pinned_alloc_limit); - - // allocate zero memory for device pool - auto dev_size_limited_alloc = - rm.makeAllocator( - "size_limited_alloc", rm.getAllocator("DEVICE"), - mem_total_free.first); - auto dev_dynamic_pool = - rm.makeAllocator( - "CUDADynamicPool", dev_size_limited_alloc, 0, pinned_alloc_limit); - - // allocate pinned_alloc_limit in pinned memory - auto pinned_size_limited_alloc = - rm.makeAllocator( - "SizeLimited_PINNED", rm.getAllocator("PINNED"), - pinned_alloc_limit); - auto pinned_dynamic_pool = - rm.makeAllocator( - "QuickPool_SizeLimited_PINNED", pinned_size_limited_alloc, - page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE); - - auto cuda_env = std::unique_ptr( - new cudaEnv(world, num_devices, device_id, num_streams, - um_dynamic_pool, dev_dynamic_pool, pinned_dynamic_pool)); - instance_accessor() = std::move(cuda_env); - } - } - - World& world() const { return *world_; } - - int num_cuda_devices() const { return num_cuda_devices_; } - - int current_cuda_device_id() const { return current_cuda_device_id_; } - - int num_cuda_streams() const { return num_cuda_streams_; } - - bool concurrent_managed_access() const { - return cuda_device_concurrent_managed_access_; - } - - size_t stream_id(const cudaStream_t& stream) const { - auto it = std::find(cuda_streams_.begin(), cuda_streams_.end(), stream); - if (it == cuda_streams_.end()) abort(); - return it - cuda_streams_.begin(); - } - - /// @return the total size of all and free device memory on the current device - static std::pair memory_total_and_free_device() { - std::pair result; - // N.B. cudaMemGetInfo returns {free,total} - CudaSafeCall(cudaMemGetInfo(&result.second, &result.first)); - return result; - } - - /// Collective call to probe CUDA {total,free} memory - - /// @return the total size of all and free device memory on every rank's - /// device - std::vector> memory_total_and_free() const { - auto world_size = world_->size(); - std::vector total_memory(world_size, 0), free_memory(world_size, 0); - auto rank = world_->rank(); - std::tie(total_memory.at(rank), free_memory.at(rank)) = - cudaEnv::memory_total_and_free_device(); - world_->gop.sum(total_memory.data(), total_memory.size()); - world_->gop.sum(free_memory.data(), free_memory.size()); - std::vector> result(world_size); - for (int r = 0; r != world_size; ++r) { - result.at(r) = {total_memory.at(r), free_memory.at(r)}; - } - return result; - } - - const cudaStream_t& cuda_stream(std::size_t i) const { - return cuda_streams_.at(i); - } - - const cudaStream_t& cuda_stream_h2d() const { - return cuda_streams_[num_cuda_streams_]; - } - - const cudaStream_t& cuda_stream_d2h() const { - return cuda_streams_[num_cuda_streams_ + 1]; - } - - /// @return a (non-thread-safe) Umpire allocator for CUDA UM - umpire::Allocator& um_allocator() { return um_allocator_; } - - // clang-format off - /// @return the max actual amount of memory held by um_allocator() - /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` - /// @note if there is only 1 Umpire allocator using UM memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("UM").getHighWatermark()` - // clang-format on - std::size_t um_allocator_getActualHighWatermark() { - TA_ASSERT(dynamic_cast( - um_allocator_.getAllocationStrategy()) != nullptr); - return dynamic_cast( - um_allocator_.getAllocationStrategy()) - ->getActualHighwaterMark(); - } - - /// @return a (non-thread-safe) Umpire allocator for CUDA device memory - umpire::Allocator& device_allocator() { return device_allocator_; } - - // clang-format off - /// @return the max actual amount of memory held by um_allocator() - /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` - /// @note if there is only 1 Umpire allocator using DEVICE memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("DEVICE").getHighWatermark()` - // clang-format on - std::size_t device_allocator_getActualHighWatermark() { - TA_ASSERT(dynamic_cast( - device_allocator_.getAllocationStrategy()) != nullptr); - return dynamic_cast( - device_allocator_.getAllocationStrategy()) - ->getActualHighwaterMark(); - } - - /// @return an Umpire allocator that allocates from a - /// pinned memory pool - /// @warning this is not a thread-safe allocator, should be only used when - /// wrapped into umpire_allocator_impl - umpire::Allocator& pinned_allocator() { return pinned_allocator_; } - - // clang-format off - /// @return the max actual amount of memory held by pinned_allocator() - /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` - /// @note if there is only 1 Umpire allocator using PINNED memory this should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("PINNED").getHighWatermark()` - // clang-format on - std::size_t pinned_allocator_getActualHighWatermark() { - TA_ASSERT(dynamic_cast( - pinned_allocator_.getAllocationStrategy()) != nullptr); - return dynamic_cast( - pinned_allocator_.getAllocationStrategy()) - ->getActualHighwaterMark(); - } - - protected: - cudaEnv(World& world, int num_devices, int device_id, int num_streams, - umpire::Allocator um_alloc, umpire::Allocator device_alloc, - umpire::Allocator pinned_alloc) - : world_(&world), - um_allocator_(um_alloc), - device_allocator_(device_alloc), - pinned_allocator_(pinned_alloc), - num_cuda_devices_(num_devices), - current_cuda_device_id_(device_id), - num_cuda_streams_(num_streams) { - if (num_devices <= 0) { - throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); - } - - // set device for current MPI process - CudaSafeCall(cudaSetDevice(current_cuda_device_id_)); - - /// check the capability of CUDA device - cudaDeviceProp prop; - CudaSafeCall(cudaGetDeviceProperties(&prop, device_id)); - if (!prop.managedMemory) { - throw std::runtime_error("CUDA Device doesn't support managedMemory\n"); - } - int concurrent_managed_access; - CudaSafeCall(cudaDeviceGetAttribute(&concurrent_managed_access, - cudaDevAttrConcurrentManagedAccess, - device_id)); - cuda_device_concurrent_managed_access_ = concurrent_managed_access; - if (!cuda_device_concurrent_managed_access_) { - std::cout << "\nWarning: CUDA Device doesn't support " - "ConcurrentManagedAccess!\n\n"; - } - - // creates cuda streams on current device - cuda_streams_.resize(num_cuda_streams_ + 2); - for (auto& stream : cuda_streams_) { - CudaSafeCall(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - } - std::cout << "created " << num_cuda_streams_ - << " CUDA streams + 2 I/O streams" << std::endl; - } - - private: - // the world used to initialize this - World* world_; - - /// allocator backed by a (non-thread-safe) dynamically-sized pool for CUDA UM - umpire::Allocator um_allocator_; - /// allocator backed by a (non-thread-safe) dynamically-sized pool for device - /// memory - umpire::Allocator device_allocator_; - // allocates from a dynamic, size-limited pinned memory pool - // N.B. not thread safe, so must be wrapped into umpire_allocator_impl - umpire::Allocator pinned_allocator_; - - int num_cuda_devices_; - int current_cuda_device_id_; - bool cuda_device_concurrent_managed_access_; - - int num_cuda_streams_; - std::vector cuda_streams_; - - inline static std::unique_ptr& instance_accessor() { - static std::unique_ptr instance_{nullptr}; - return instance_; - } -}; - -namespace detail { - -template -const cudaStream_t& get_stream_based_on_range(const Range& range) { - // TODO better way to get stream based on the id of tensor - auto stream_id = range.offset() % cudaEnv::instance()->num_cuda_streams(); - auto& stream = cudaEnv::instance()->cuda_stream(stream_id); - return stream; -} - -} // namespace detail - -namespace nvidia { - -// Color definitions for nvtxcalls -enum class argbColor : uint32_t { - red = 0xFFFF0000, - blue = 0xFF0000FF, - green = 0xFF008000, - yellow = 0xFFFFFF00, - cyan = 0xFF00FFFF, - magenta = 0xFFFF00FF, - gray = 0xFF808080, - purple = 0xFF800080 -}; - -/// enter a profiling range by calling nvtxRangePushEx -/// \param[in] range_title a char string containing the range title -/// \param[in] range_color the range color -inline void range_push(const char* range_title, argbColor range_color) { - nvtxEventAttributes_t eventAttrib = {0}; - eventAttrib.version = NVTX_VERSION; - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; - eventAttrib.colorType = NVTX_COLOR_ARGB; - eventAttrib.color = static_cast(range_color); - eventAttrib.message.ascii = range_title; - nvtxRangePushEx(&eventAttrib); -} - -/// exits the current profiling range by calling nvtxRangePopEx -inline void range_pop() { nvtxRangePop(); } - -} // namespace nvidia - -} // namespace TiledArray - -#endif // TILEDARRAY_HAS_CUDA - -#endif // TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED +#warning \ + "This header is deprecated. Please use TiledArray/external/device.h instead." +#include diff --git a/src/TiledArray/external/hip.h b/src/TiledArray/external/device.h similarity index 53% rename from src/TiledArray/external/hip.h rename to src/TiledArray/external/device.h index 75dbfc6955..5016d55b23 100644 --- a/src/TiledArray/external/hip.h +++ b/src/TiledArray/external/device.h @@ -21,8 +21,8 @@ * */ -#ifndef TILEDARRAY_EXTERNAL_HIP_H__INCLUDED -#define TILEDARRAY_EXTERNAL_HIP_H__INCLUDED +#ifndef TILEDARRAY_EXTERNAL_DEVICE_H__INCLUDED +#define TILEDARRAY_EXTERNAL_DEVICE_H__INCLUDED #include #include @@ -30,9 +30,15 @@ #include -#ifdef TILEDARRAY_HAS_HIP - +#if defined(TILEDARRAY_HAS_HIP) #include +#elif defined(TILEDARRAY_HAS_CUDA) +#include +#include +#include +#include +#include +#endif #include @@ -43,9 +49,40 @@ #include -#define HipSafeCall(err) __hipSafeCall(err, __FILE__, __LINE__) -#define HipSafeCallNoThrow(err) __hipSafeCallNoThrow(err, __FILE__, __LINE__) -#define HipCheckError() __hipCheckError(__FILE__, __LINE__) +#if defined(TILEDARRAY_HAS_CUDA) + +inline void __DeviceSafeCall(cudaError err, const char* file, const int line) { + if (cudaSuccess != err) { + std::stringstream ss; + ss << "DeviceSafeCall() failed at: " << file << ":" << line; + std::string what = ss.str(); + throw thrust::system_error(err, thrust::cuda_category(), what); + } +} + +inline void __cudaSafeCallNoThrow(cudaError err, const char* file, + const int line) { + if (cudaSuccess != err) { + madness::print_error("cudaSafeCallNoThrow() failed at: ", file, ":", line); + } +} + +inline void __cudaCheckError(const char* file, const int line) { + cudaError err = cudaGetLastError(); + if (cudaSuccess != err) { + std::stringstream ss; + ss << "cudaCheckError() failed at: " << file << ":" << line; + std::string what = ss.str(); + throw thrust::system_error(err, thrust::cuda_category(), what); + } +} + +#define DeviceSafeCall(err) __DeviceSafeCall(err, __FILE__, __LINE__) +#define DeviceSafeCallNoThrow(err) \ + __cudaSafeCallNoThrow(err, __FILE__, __LINE__) +#define DeviceCheckError() __cudaCheckError(__FILE__, __LINE__) + +#elif defined(TILEDARRAY_HAS_HIP) inline void __hipSafeCall(hipError_t err, const char* file, const int line) { if (hipSuccess != err) { @@ -74,34 +111,158 @@ inline void __hipCheckError(const char* file, const int line) { } } +#define DeviceSafeCall(err) __hipSafeCall(err, __FILE__, __LINE__) +#define DeviceSafeCallNoThrow(err) __hipSafeCallNoThrow(err, __FILE__, __LINE__) +#define DeviceCheckError() __hipCheckError(__FILE__, __LINE__) + +#endif + namespace TiledArray { +namespace device { + +#if defined(TILEDARRAY_HAS_CUDA) +inline namespace cuda { +using stream_t = cudaStream_t; +using error_t = cudaError_t; +using hostFn_t = cudaHostFn_t; +using deviceProp_t = cudaDeviceProp; +using deviceAttr_t = cudaDeviceAttr; +#define DeviceAttributeConcurrentManagedAccess \ + cudaDevAttrConcurrentManagedAccess +#define DEVICERT_CB CUDART_CB + +enum FuncCache { + FuncCachePreferNone = cudaFuncCachePreferNone, + FuncCachePreferShared = cudaFuncCachePreferShared, + FuncCachePreferL1 = cudaFuncCachePreferL1, + FuncCachePreferEqual = cudaFuncCachePreferEqual +}; -namespace detail { +enum StreamCreateFlags { + StreamDefault = cudaStreamDefault, + StreamNonBlocking = cudaStreamNonBlocking +}; + +inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) { + return cudaLaunchHostFunc(stream, fn, userData); +} +inline error_t streamDestroy(stream_t stream) { + return cudaStreamDestroy(stream); +} +inline error_t setDevice(int device) { return device::setDevice(device); } +inline error_t deviceSetCacheConfig(FuncCache cache_config) { + return cudaDeviceSetCacheConfig(static_cast(cache_config)); +} +inline error_t memGetInfo(size_t* free, size_t* total) { + return cudaMemGetInfo(free, total); +} +inline error_t getDeviceProperties(deviceProp_t* prop, int device) { + return cudaGetDeviceProperties(prop, device); +} +inline error_t deviceGetAttribute(int* value, deviceAttr_t attr, int device) { + return cudaDeviceGetAttribute(value, attr, device); +} +inline error_t streamCreateWithFlags(stream_t* pStream, + StreamCreateFlags flags) { + return cudaStreamCreateWithFlags(pStream, flags); +} +inline error_t deviceSynchronize() { return cudaDeviceSynchronize(); } +inline error_t streamSynchronize(stream_t stream) { + return cudaStreamSynchronize(stream); +} +} // namespace cuda +#elif defined(TILEDARRAY_HAS_HIP) +inline namespace hip { +using stream_t = hipStream_t; +using error_t = hipError_t; +using hostFn_t = hipHostFn_t; +using deviceProp_t = hipDeviceProp; +using deviceAttr_t = hipDeviceAttr; +#define DeviceAttributeConcurrentManagedAccess \ + hipDeviceAttributeConcurrentManagedAccess +#define DEVICERT_CD HIPRT_CB + +enum FuncCache { + FuncCachePreferNone = hipFuncCachePreferNone, + FuncCachePreferShared = hipFuncCachePreferShared, + FuncCachePreferL1 = hipFuncCachePreferL1, + FuncCachePreferEqual = hipFuncCachePreferEqual +}; + +enum StreamCreateFlags { + StreamDefault = hipStreamDefault, + StreamNonBlocking = hipStreamNonBlocking +}; + +inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) { + return hipLaunchHostFunc(stream, fn, userData); +} +inline error_t streamDestroy(stream_t stream) { + return hipStreamDestroy(stream); +} +inline error_t setDevice(int device) { return hipSetDevice(device); } +inline error_t deviceSetCacheConfig(FuncCache cache_config) { + return hipDeviceSetCacheConfig(static_cast(cache_config)); +} +inline error_t memGetInfo(size_t* free, size_t* total) { + return hipMemGetInfo(free, total); +} +inline error_t getDeviceProperties(deviceProp* prop, int device) { + return hipGetDeviceProperties(prop, device); +} +inline error_t deviceGetAttribute(int* value, deviceAttr_t attr, int device) { + return hipDeviceGetAttribute(value, attr, device); +} +inline error_t streamCreateWithFlags(stream_t* pStream, + StreamCreateFlags flags) { + return hipStreamCreateWithFlags(pStream, flags); +} +inline error_t deviceSynchronize() { return hipDeviceSynchronize(); } +inline error_t streamSynchronize(stream_t stream) { + return hipStreamSynchronize(stream); +} +} // namespace hip +#endif + +#ifdef TILEDARRAY_HAS_DEVICE inline int num_streams() { int num_streams = -1; - char* num_stream_char = std::getenv("TA_HIP_NUM_STREAMS"); - /// default num of streams is 3 + char* num_stream_char = std::getenv("TA_DEVICE_NUM_STREAMS"); if (num_stream_char) { num_streams = std::atoi(num_stream_char); } else { - num_streams = 3; +#if defined(TILEDARRAY_HAS_CUDA) + char* num_stream_char = std::getenv("TA_CUDA_NUM_STREAMS"); +#elif defined(TILEDARRAY_HAS_HIP) + char* num_stream_char = std::getenv("TA_HIP_NUM_STREAMS"); +#endif + if (num_stream_char) { + num_streams = std::atoi(num_stream_char); + } else { + /// default num of streams is 3 + num_streams = 3; + } } return num_streams; } inline int num_devices() { int num_devices = -1; - HipSafeCall(hipGetDeviceCount(&num_devices)); +#if defined(TILEDARRAY_HAS_CUDA) + DeviceSafeCall(cudaGetDeviceCount(&num_devices)); +#elif defined(TILEDARRAY_HAS_HIP) + DeviceSafeCall(hipGetDeviceCount(&num_devices)); +#endif return num_devices; } inline int current_device_id(World& world) { int mpi_local_size = -1; int mpi_local_rank = -1; - std::tie(mpi_local_rank, mpi_local_size) = mpi_local_rank_size(world); + std::tie(mpi_local_rank, mpi_local_size) = detail::mpi_local_rank_size(world); - int num_devices = detail::num_devices(); + int num_devices = device::num_devices(); int device_id = -1; // devices may already be pre-mapped @@ -110,23 +271,25 @@ inline int current_device_id(World& world) { if (mpi_local_size <= num_devices) { device_id = mpi_local_rank % num_devices; } else { // mpi_local_size > num_devices - char* cvd_cstr = std::getenv("HIP_VISIBLE_DEVICES"); - if (cvd_cstr) { // HIP_VISIBLE_DEVICES is set, assume that pre-mapped + const char* vd_cstr = + std::getenv(TILEDARRAY_DEVICE_RUNTIME_STR "_VISIBLE_DEVICES"); + if (vd_cstr) { // *_VISIBLE_DEVICES is set, assume that pre-mapped // make sure that there is only 1 device available here if (num_devices != 1) { throw std::runtime_error( std::string( - "HIP_VISIBLE_DEVICES environment variable is set, hence using " + TILEDARRAY_DEVICE_RUNTIME_STR + "_VISIBLE_DEVICES environment variable is set, hence using " "the provided device-to-rank mapping; BUT TiledArray found ") + std::to_string(num_devices) + - " HIP devices; only 1 HIP device / MPI process is supported"); + " devices; only 1 device / MPI process is supported"); } device_id = 0; } else { // not enough devices + devices are not pre-mapped throw std::runtime_error( std::string("TiledArray found ") + std::to_string(mpi_local_size) + " MPI ranks on a node with " + std::to_string(num_devices) + - " HIP devices; only 1 MPI process / HIP device model is currently " + " devices; only 1 MPI process / device model is currently " "supported"); } } @@ -134,7 +297,7 @@ inline int current_device_id(World& world) { return device_id; } -inline void HIPRT_CB hip_readyflag_callback(void* userData) { +inline void DEVICERT_CB readyflag_callback(void* userData) { // convert void * to std::atomic std::atomic* flag = static_cast*>(userData); // set the flag to be true @@ -149,12 +312,12 @@ struct ProbeFlag { std::atomic* flag; }; -inline void thread_wait_stream(const hipStream_t& stream) { +inline void thread_wait_stream(const stream_t& stream) { std::atomic* flag = new std::atomic(false); - HipSafeCall(hipLaunchHostFunc(stream, detail::hip_readyflag_callback, flag)); + DeviceSafeCall(launchHostFunc(stream, readyflag_callback, flag)); - detail::ProbeFlag probe(flag); + ProbeFlag probe(flag); // wait with sleep and do not do work madness::ThreadPool::await(probe, false, true); @@ -163,40 +326,38 @@ inline void thread_wait_stream(const hipStream_t& stream) { delete flag; } -} // namespace detail - -inline const hipStream_t*& tls_stream_accessor() { - static thread_local const hipStream_t* thread_local_stream_ptr{nullptr}; +inline const stream_t*& tls_stream_accessor() { + static thread_local const stream_t* thread_local_stream_ptr{nullptr}; return thread_local_stream_ptr; } -inline void synchronize_stream(const hipStream_t* stream) { +inline void synchronize_stream(const stream_t* stream) { tls_stream_accessor() = stream; } /** - * hipEnv maintains the HIP-related part of the runtime environment, - * such as HIP-specific memory allocators + * Env maintains the device-related part of the runtime environment, + * such as specialized data structures like device streams and memory allocators * * \note this is a Singleton */ -class hipEnv { +class Env { public: - ~hipEnv() { + ~Env() { // destroy streams on current device for (auto& stream : streams_) { - HipSafeCallNoThrow(hipStreamDestroy(stream)); + DeviceSafeCallNoThrow(streamDestroy(stream)); } } - hipEnv(const hipEnv&) = delete; - hipEnv(hipEnv&&) = delete; - hipEnv& operator=(const hipEnv&) = delete; - hipEnv& operator=(hipEnv&&) = delete; + Env(const Env&) = delete; + Env(Env&&) = delete; + Env& operator=(const Env&) = delete; + Env& operator=(Env&&) = delete; /// access the singleton instance; if not initialized will be - /// initialized via hipEnv::initialize() with the default params - static std::unique_ptr& instance() { + /// initialized via Env::initialize() with the default params + static std::unique_ptr& instance() { if (!instance_accessor()) { initialize(); } @@ -219,12 +380,12 @@ class hipEnv { std::scoped_lock lock{mtx}; // only the winner of the lock race gets to initialize if (instance_accessor() == nullptr) { - int num_streams = detail::num_streams(); - int num_devices = detail::num_devices(); - int device_id = detail::current_device_id(world); + int num_streams = device::num_streams(); + int num_devices = device::num_devices(); + int device_id = device::current_device_id(world); // set device for current MPI process .. will be set in the ctor as well - HipSafeCall(hipSetDevice(device_id)); - HipSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared)); + DeviceSafeCall(setDevice(device_id)); + DeviceSafeCall(deviceSetCacheConfig(FuncCachePreferShared)); // uncomment to debug umpire ops // @@ -235,7 +396,7 @@ class hipEnv { auto& rm = umpire::ResourceManager::getInstance(); - auto mem_total_free = hipEnv::memory_total_and_free_device(); + auto mem_total_free = Env::memory_total_and_free_device(); // turn off Umpire introspection for non-Debug builds #ifndef NDEBUG @@ -257,7 +418,8 @@ class hipEnv { mem_total_free.first); auto dev_dynamic_pool = rm.makeAllocator( - "HIPDynamicPool", dev_size_limited_alloc, 0, pinned_alloc_limit); + "DEVICEDynamicPool", dev_size_limited_alloc, 0, + pinned_alloc_limit); // allocate pinned_alloc_limit in pinned memory auto pinned_size_limited_alloc = @@ -269,10 +431,10 @@ class hipEnv { "QuickPool_SizeLimited_PINNED", pinned_size_limited_alloc, page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE); - auto hip_env = std::unique_ptr( - new hipEnv(world, num_devices, device_id, num_streams, - um_dynamic_pool, dev_dynamic_pool, pinned_dynamic_pool)); - instance_accessor() = std::move(hip_env); + auto env = std::unique_ptr( + new Env(world, num_devices, device_id, num_streams, um_dynamic_pool, + dev_dynamic_pool, pinned_dynamic_pool)); + instance_accessor() = std::move(env); } } @@ -288,7 +450,7 @@ class hipEnv { return device_concurrent_managed_access_; } - size_t stream_id(const hipStream_t& stream) const { + size_t stream_id(const stream_t& stream) const { auto it = std::find(streams_.begin(), streams_.end(), stream); if (it == streams_.end()) abort(); return it - streams_.begin(); @@ -297,12 +459,12 @@ class hipEnv { /// @return the total size of all and free device memory on the current device static std::pair memory_total_and_free_device() { std::pair result; - // N.B. hipMemGetInfo returns {free,total} - HipSafeCall(hipMemGetInfo(&result.second, &result.first)); + // N.B. *MemGetInfo returns {free,total} + DeviceSafeCall(memGetInfo(&result.second, &result.first)); return result; } - /// Collective call to probe HIP {total,free} memory + /// Collective call to probe device {total,free} memory /// @return the total size of all and free device memory on every rank's /// device @@ -311,7 +473,7 @@ class hipEnv { std::vector total_memory(world_size, 0), free_memory(world_size, 0); auto rank = world_->rank(); std::tie(total_memory.at(rank), free_memory.at(rank)) = - hipEnv::memory_total_and_free_device(); + Env::memory_total_and_free_device(); world_->gop.sum(total_memory.data(), total_memory.size()); world_->gop.sum(free_memory.data(), free_memory.size()); std::vector> result(world_size); @@ -321,11 +483,11 @@ class hipEnv { return result; } - const hipStream_t& stream(std::size_t i) const { return streams_.at(i); } + const stream_t& stream(std::size_t i) const { return streams_.at(i); } - const hipStream_t& stream_h2d() const { return streams_[num_streams_]; } + const stream_t& stream_h2d() const { return streams_[num_streams_]; } - const hipStream_t& stream_d2h() const { return streams_[num_streams_ + 1]; } + const stream_t& stream_d2h() const { return streams_[num_streams_ + 1]; } /// @return a (non-thread-safe) Umpire allocator for device UM umpire::Allocator& um_allocator() { return um_allocator_; } @@ -362,7 +524,7 @@ class hipEnv { /// @return an Umpire allocator that allocates from a /// pinned memory pool /// @warning this is not a thread-safe allocator, should be only used when - /// wrapped into umpire_allocator_impl + /// wrapped into umpire_based_allocator_impl umpire::Allocator& pinned_allocator() { return pinned_allocator_; } // clang-format off @@ -379,9 +541,9 @@ class hipEnv { } protected: - hipEnv(World& world, int num_devices, int device_id, int num_streams, - umpire::Allocator um_alloc, umpire::Allocator device_alloc, - umpire::Allocator pinned_alloc) + Env(World& world, int num_devices, int device_id, int num_streams, + umpire::Allocator um_alloc, umpire::Allocator device_alloc, + umpire::Allocator pinned_alloc) : world_(&world), um_allocator_(um_alloc), device_allocator_(device_alloc), @@ -390,34 +552,38 @@ class hipEnv { current_device_id_(device_id), num_streams_(num_streams) { if (num_devices <= 0) { - throw std::runtime_error("No HIP-Enabled GPUs Found!\n"); + throw std::runtime_error("No " TILEDARRAY_DEVICE_RUNTIME_STR + " compute devices found!\n"); } // set device for current MPI process - HipSafeCall(hipSetDevice(current_device_id_)); + DeviceSafeCall(setDevice(current_device_id_)); - /// check the capability of HIP device - hipDeviceProp prop; - HipSafeCall(hipGetDeviceProperties(&prop, device_id)); + /// check the capability of device + deviceProp_t prop; + DeviceSafeCall(getDeviceProperties(&prop, device_id)); if (!prop.managedMemory) { - throw std::runtime_error("HIP Device doesn't support managedMemory\n"); + throw std::runtime_error(TILEDARRAY_DEVICE_RUNTIME_STR + "device doesn't support managedMemory\n"); } int concurrent_managed_access; - HipSafeCall(hipDeviceGetAttribute(&concurrent_managed_access, - hipDeviceAttributeConcurrentManagedAccess, + DeviceSafeCall(deviceGetAttribute(&concurrent_managed_access, + DeviceAttributeConcurrentManagedAccess, device_id)); device_concurrent_managed_access_ = concurrent_managed_access; if (!device_concurrent_managed_access_) { - std::cout << "\nWarning: HIP Device doesn't support " + std::cout << "\nWarning: " TILEDARRAY_DEVICE_RUNTIME_STR + " device doesn't support " "ConcurrentManagedAccess!\n\n"; } // creates streams on current device streams_.resize(num_streams_ + 2); for (auto& stream : streams_) { - HipSafeCall(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + DeviceSafeCall(streamCreateWithFlags(&stream, StreamNonBlocking)); } - std::cout << "created " << num_streams_ << " HIP streams + 2 I/O streams" + std::cout << "created " << num_streams_ + << " " TILEDARRAY_DEVICE_RUNTIME_STR " streams + 2 I/O streams" << std::endl; } @@ -431,7 +597,7 @@ class hipEnv { /// memory umpire::Allocator device_allocator_; // allocates from a dynamic, size-limited pinned memory pool - // N.B. not thread safe, so must be wrapped into umpire_allocator_impl + // N.B. not thread safe, so must be wrapped into umpire_based_allocator_impl umpire::Allocator pinned_allocator_; int num_devices_; @@ -439,28 +605,63 @@ class hipEnv { bool device_concurrent_managed_access_; int num_streams_; - std::vector streams_; + std::vector streams_; - inline static std::unique_ptr& instance_accessor() { - static std::unique_ptr instance_{nullptr}; + inline static std::unique_ptr& instance_accessor() { + static std::unique_ptr instance_{nullptr}; return instance_; } }; -namespace detail { +} // namespace device +namespace detail { template -const hipStream_t& get_stream_based_on_range(const Range& range) { +const device::stream_t& get_stream_based_on_range(const Range& range) { // TODO better way to get stream based on the id of tensor - auto stream_id = range.offset() % hipEnv::instance()->num_streams(); - auto& stream = hipEnv::instance()->stream(stream_id); + auto stream_id = range.offset() % device::Env::instance()->num_streams(); + auto& stream = device::Env::instance()->stream(stream_id); return stream; } - } // namespace detail -} // namespace TiledArray +#endif // TILEDARRAY_HAS_DEVICE + +#ifdef TILEDARRAY_HAS_CUDA +namespace nvidia { + +// Color definitions for nvtxcalls +enum class argbColor : uint32_t { + red = 0xFFFF0000, + blue = 0xFF0000FF, + green = 0xFF008000, + yellow = 0xFFFFFF00, + cyan = 0xFF00FFFF, + magenta = 0xFFFF00FF, + gray = 0xFF808080, + purple = 0xFF800080 +}; + +/// enter a profiling range by calling nvtxRangePushEx +/// \param[in] range_title a char string containing the range title +/// \param[in] range_color the range color +inline void range_push(const char* range_title, argbColor range_color) { + nvtxEventAttributes_t eventAttrib = {0}; + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = static_cast(range_color); + eventAttrib.message.ascii = range_title; + nvtxRangePushEx(&eventAttrib); +} -#endif // TILEDARRAY_HAS_HIP +/// exits the current profiling range by calling nvtxRangePopEx +inline void range_pop() { nvtxRangePop(); } + +} // namespace nvidia +#endif // #ifdef TILEDARRAY_HAS_DEVICE + +} // namespace TiledArray -#endif // TILEDARRAY_EXTERNAL_HIP_H__INCLUDED +#endif // TILEDARRAY_EXTERNAL_DEVICE_H__INCLUDED diff --git a/src/TiledArray/external/umpire.h b/src/TiledArray/external/umpire.h index 71508226a4..e8d0d48632 100644 --- a/src/TiledArray/external/umpire.h +++ b/src/TiledArray/external/umpire.h @@ -71,7 +71,7 @@ std::mutex MutexLock::mtx_; /// \tparam StaticLock a type providing static `lock()` and `unlock()` methods ; /// defaults to NullLock which does not lock template -class umpire_allocator_impl { +class umpire_based_allocator_impl { public: using value_type = T; using pointer = value_type*; @@ -89,12 +89,12 @@ class umpire_allocator_impl { typename std::pointer_traits::difference_type; using size_type = std::make_unsigned_t; - umpire_allocator_impl(umpire::Allocator* umpalloc) noexcept + umpire_based_allocator_impl(umpire::Allocator* umpalloc) noexcept : umpalloc_(umpalloc) {} template - umpire_allocator_impl( - const umpire_allocator_impl& rhs) noexcept + umpire_based_allocator_impl( + const umpire_based_allocator_impl& rhs) noexcept : umpalloc_(rhs.umpalloc_) {} /// allocates memory using umpire dynamic pool @@ -140,17 +140,19 @@ class umpire_allocator_impl { private: umpire::Allocator* umpalloc_; -}; // class umpire_allocator_impl +}; // class umpire_based_allocator_impl template -bool operator==(const umpire_allocator_impl& lhs, - const umpire_allocator_impl& rhs) noexcept { +bool operator==( + const umpire_based_allocator_impl& lhs, + const umpire_based_allocator_impl& rhs) noexcept { return lhs.umpire_allocator() == rhs.umpire_allocator(); } template -bool operator!=(const umpire_allocator_impl& lhs, - const umpire_allocator_impl& rhs) noexcept { +bool operator!=( + const umpire_based_allocator_impl& lhs, + const umpire_based_allocator_impl& rhs) noexcept { return !(lhs == rhs); } @@ -195,23 +197,23 @@ namespace archive { template struct ArchiveLoadImpl> { + TiledArray::umpire_based_allocator_impl> { static inline void load( const Archive& ar, - TiledArray::umpire_allocator_impl& allocator) { + TiledArray::umpire_based_allocator_impl& allocator) { std::string allocator_name; ar& allocator_name; - allocator = TiledArray::umpire_allocator_impl( + allocator = TiledArray::umpire_based_allocator_impl( umpire::ResourceManager::getInstance().getAllocator(allocator_name)); } }; template -struct ArchiveStoreImpl> { +struct ArchiveStoreImpl< + Archive, TiledArray::umpire_based_allocator_impl> { static inline void store( const Archive& ar, - const TiledArray::umpire_allocator_impl& allocator) { + const TiledArray::umpire_based_allocator_impl& allocator) { ar& allocator.umpire_allocator()->getName(); } }; diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index 6c364be113..7f411eaeba 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -81,13 +81,14 @@ typedef Tensor TensorL; typedef Tensor> TensorZ; typedef Tensor> TensorC; -// CUDA tensor -#ifdef TILEDARRAY_HAS_CUDA - -class cudaEnv; +#ifdef TILEDARRAY_HAS_DEVICE +namespace device { +class Env; +} +using deviceEnv = device::Env; template -class cuda_allocator_impl; +class umpire_based_allocator; template > class default_init_allocator; @@ -100,32 +101,32 @@ template class MutexLock; } // namespace detail -/// pooled thread-safe CUDA UM allocator +/// pooled thread-safe unified memory (UM) allocator for device computing template -using cuda_um_allocator = - default_init_allocator, - detail::get_um_allocator>>; +using device_um_allocator = default_init_allocator< + T, umpire_based_allocator, + detail::get_um_allocator>>; -/// pooled thread-safe CUDA-based pinned host memory allocator +/// pooled thread-safe pinned host memory allocator for device computing template -using cuda_pinned_allocator = - default_init_allocator, - detail::get_pinned_allocator>>; +using device_pinned_allocator = default_init_allocator< + T, umpire_based_allocator, + detail::get_pinned_allocator>>; -/// \brief a vector that lives in CUDA Unified Memory, with most operations +/// \brief a vector that lives in UM, with most operations /// implemented on the CPU template -using cuda_um_btas_varray = ::btas::varray>; +using device_um_btas_varray = + ::btas::varray>; /** - * btas::Tensor with UM storage cuda_um_btas_varray + * btas::Tensor with UM storage device_um_btas_varray */ template using btasUMTensorVarray = - ::btas::Tensor>; + ::btas::Tensor>; -#endif +#endif // TILEDARRAY_HAS_DEVICE template class Tile; diff --git a/src/TiledArray/host/allocator.h b/src/TiledArray/host/allocator.h index dbb8f53b55..a22613fb38 100644 --- a/src/TiledArray/host/allocator.h +++ b/src/TiledArray/host/allocator.h @@ -39,9 +39,9 @@ namespace TiledArray { /// pooled, thread-safe allocator for host memory template class host_allocator_impl - : public umpire_allocator_impl> { + : public umpire_based_allocator_impl> { public: - using base_type = umpire_allocator_impl>; + using base_type = umpire_based_allocator_impl>; using typename base_type::const_pointer; using typename base_type::const_reference; using typename base_type::pointer; @@ -53,9 +53,8 @@ class host_allocator_impl template host_allocator_impl(const host_allocator_impl& rhs) noexcept - : base_type(static_cast< - const umpire_allocator_impl>&>( - rhs)) {} + : base_type(static_cast>&>(rhs)) {} template friend bool operator==(const host_allocator_impl& lhs, diff --git a/src/TiledArray/host/env.h b/src/TiledArray/host/env.h index 5852cf6a20..1b3c4f277f 100644 --- a/src/TiledArray/host/env.h +++ b/src/TiledArray/host/env.h @@ -114,7 +114,7 @@ class hostEnv { /// @return an Umpire allocator that allocates from a /// host memory pool /// @warning this is not a thread-safe allocator, should be only used when - /// wrapped into umpire_allocator_impl + /// wrapped into umpire_based_allocator_impl umpire::Allocator& host_allocator() { return host_allocator_; } // clang-format off @@ -139,7 +139,7 @@ class hostEnv { World* world_; // allocates from a dynamic, size-limited host memory pool - // N.B. not thread safe, so must be wrapped into umpire_allocator_impl + // N.B. not thread safe, so must be wrapped into umpire_based_allocator_impl umpire::Allocator host_allocator_; inline static std::unique_ptr& instance_accessor() { diff --git a/src/TiledArray/reduce_task.h b/src/TiledArray/reduce_task.h index 753ac5df58..36cd7cf93f 100644 --- a/src/TiledArray/reduce_task.h +++ b/src/TiledArray/reduce_task.h @@ -24,8 +24,8 @@ #include #include -#ifdef TILEDARRAY_HAS_CUDA -#include +#ifdef TILEDARRAY_HAS_DEVICE +#include #include #include #include @@ -304,9 +304,10 @@ class ReduceTask { }; // class ReduceObject -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE - static void CUDART_CB cuda_reduceobject_delete_callback(void* userData) { + static void DEVICERT_CB + device_reduceobject_delete_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); @@ -338,11 +339,11 @@ class ReduceTask { world->taskq.add(destroy_vector, objects, TaskAttributes::hipri()); const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_callback_duration_ns<0>() += + TiledArray::detail::device_callback_duration_ns<0>() += TiledArray::duration_in_ns(t0, t1); } - static void CUDART_CB cuda_dependency_dec_callback(void* userData) { + static void DEVICERT_CB device_dependency_dec_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); @@ -361,12 +362,12 @@ class ReduceTask { // " call 2\n"; const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_callback_duration_ns<1>() += + TiledArray::detail::device_callback_duration_ns<1>() += TiledArray::duration_in_ns(t0, t1); } - static void CUDART_CB - cuda_dependency_dec_reduceobject_delete_callback(void* userData) { + static void DEVICERT_CB + device_dependency_dec_reduceobject_delete_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); @@ -399,11 +400,11 @@ class ReduceTask { delete objects; const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_callback_duration_ns<2>() += + TiledArray::detail::device_callback_duration_ns<2>() += TiledArray::duration_in_ns(t0, t1); } - static void CUDART_CB cuda_readyresult_reset_callback(void* userData) { + static void DEVICERT_CB device_readyresult_reset_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); @@ -429,7 +430,7 @@ class ReduceTask { world->taskq.add(reset, objects, TaskAttributes::hipri()); const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_callback_duration_ns<3>() += + TiledArray::detail::device_callback_duration_ns<3>() += TiledArray::duration_in_ns(t0, t1); } @@ -459,8 +460,8 @@ class ReduceTask { op_(*result, ready_object->arg()); // cleanup the argument -#ifdef TILEDARRAY_HAS_CUDA - auto stream_ptr = tls_cudastream_accessor(); +#ifdef TILEDARRAY_HAS_DEVICE + auto stream_ptr = device::tls_stream_accessor(); /// non-CUDA op if (stream_ptr == nullptr) { @@ -471,12 +472,12 @@ class ReduceTask { (*callback_object)[0] = &world_; (*callback_object)[1] = this; (*callback_object)[2] = ready_object; - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_dependency_dec_reduceobject_delete_callback, + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(cudaLaunchHostFunc( + *stream_ptr, device_dependency_dec_reduceobject_delete_callback, callback_object)); - synchronize_stream(nullptr); + device::synchronize_stream(nullptr); // std::cout << std::to_string(world().rank()) + " // add 3\n"; } @@ -494,8 +495,8 @@ class ReduceTask { op_(*result, *ready_result); // cleanup the result -#ifdef TILEDARRAY_HAS_CUDA - auto stream_ptr = tls_cudastream_accessor(); +#ifdef TILEDARRAY_HAS_DEVICE + auto stream_ptr = device::tls_stream_accessor(); if (stream_ptr == nullptr) { ready_result.reset(); } else { @@ -504,11 +505,12 @@ class ReduceTask { auto callback_object = new std::vector(2); (*callback_object)[0] = &world_; (*callback_object)[1] = ready_result_heap; - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_readyresult_reset_callback, callback_object)); - synchronize_stream(nullptr); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(cudaLaunchHostFunc(*stream_ptr, + device_readyresult_reset_callback, + callback_object)); + device::synchronize_stream(nullptr); // std::cout << std::to_string(world().rank()) + " // add 4\n"; } @@ -534,19 +536,19 @@ class ReduceTask { op_(*result, object->arg()); // Cleanup the argument -#ifdef TILEDARRAY_HAS_CUDA - auto stream_ptr = tls_cudastream_accessor(); +#ifdef TILEDARRAY_HAS_DEVICE + auto stream_ptr = device::tls_stream_accessor(); if (stream_ptr == nullptr) { ReduceObject::destroy(object); } else { auto callback_object = new std::vector(2); (*callback_object)[0] = &world_; (*callback_object)[1] = const_cast(object); - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_reduceobject_delete_callback, callback_object)); - synchronize_stream(nullptr); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(cudaLaunchHostFunc( + *stream_ptr, device_reduceobject_delete_callback, callback_object)); + device::synchronize_stream(nullptr); // std::cout << std::to_string(world().rank()) + " add 1\n"; } #else @@ -557,16 +559,16 @@ class ReduceTask { // Decrement the dependency counter for the argument. This must // be done after the reduce call to avoid a race condition. -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE if (stream_ptr == nullptr) { this->dec(); } else { auto callback_object2 = new std::vector(1); (*callback_object2)[0] = this; - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_dependency_dec_callback, callback_object2)); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(device::launchHostFunc( + *stream_ptr, device_dependency_dec_callback, callback_object2)); // std::cout << std::to_string(world().rank()) + " add 2\n"; } #else @@ -585,8 +587,8 @@ class ReduceTask { op_(*result, object2->arg()); // Cleanup arguments -#ifdef TILEDARRAY_HAS_CUDA - auto stream_ptr = tls_cudastream_accessor(); +#ifdef TILEDARRAY_HAS_DEVICE + auto stream_ptr = device::tls_stream_accessor(); if (stream_ptr == nullptr) { ReduceObject::destroy(object1); ReduceObject::destroy(object2); @@ -595,11 +597,12 @@ class ReduceTask { (*callback_object1)[0] = &world_; (*callback_object1)[1] = const_cast(object1); (*callback_object1)[2] = const_cast(object2); - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_reduceobject_delete_callback, callback_object1)); - synchronize_stream(nullptr); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(cudaLaunchHostFunc(*stream_ptr, + device_reduceobject_delete_callback, + callback_object1)); + device::synchronize_stream(nullptr); // std::cout << std::to_string(world().rank()) + " add 1\n"; } #else @@ -612,7 +615,7 @@ class ReduceTask { // Decrement the dependency counter for the two arguments. This // must be done after the reduce call to avoid a race condition. -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE if (stream_ptr == nullptr) { this->dec(); this->dec(); @@ -620,10 +623,10 @@ class ReduceTask { auto callback_object2 = new std::vector(2); (*callback_object2)[0] = this; (*callback_object2)[1] = this; - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_dependency_dec_callback, callback_object2)); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(cudaLaunchHostFunc( + *stream_ptr, device_dependency_dec_callback, callback_object2)); // std::cout << std::to_string(world().rank()) + " add 2\n"; } @@ -633,13 +636,13 @@ class ReduceTask { #endif } -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE template - std::enable_if_t, void> internal_run( + std::enable_if_t, void> internal_run( const madness::TaskThreadEnv&) { TA_ASSERT(ready_result_); - auto post_result = madness::add_cuda_task(world_, op_, *ready_result_); + auto post_result = madness::add_device_task(world_, op_, *ready_result_); result_.set(post_result); if (callback_) { @@ -648,7 +651,7 @@ class ReduceTask { } template - std::enable_if_t, void> + std::enable_if_t, void> #else void #endif diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index 2903e5e7f7..413b784d22 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -317,17 +317,17 @@ constexpr const bool is_reduce_op_v = /// detect cuda tile #ifdef TILEDARRAY_HAS_CUDA template -struct is_cuda_tile : public std::false_type {}; +struct is_device_tile : public std::false_type {}; template -struct is_cuda_tile> : public is_cuda_tile {}; +struct is_device_tile> : public is_device_tile {}; template -struct is_cuda_tile> - : public is_cuda_tile::eval_type> {}; +struct is_device_tile> + : public is_device_tile::eval_type> {}; template -static constexpr const auto is_cuda_tile_v = is_cuda_tile::value; +static constexpr const auto is_device_tile_v = is_device_tile::value; #endif diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp index 47ccc00d8e..51cfc02825 100644 --- a/src/TiledArray/tiledarray.cpp +++ b/src/TiledArray/tiledarray.cpp @@ -6,9 +6,11 @@ #include +#ifdef TILEDARRAY_HAS_DEVICE #ifdef TILEDARRAY_HAS_CUDA -#include -#include +#include +#endif +#include #include #endif @@ -22,28 +24,32 @@ namespace TiledArray { namespace { -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// initialize cuda environment -inline void cuda_initialize() { - /// initialize cudaGlobal - cudaEnv::instance(); - // +inline void device_initialize() { + /// initialize deviceEnv + deviceEnv::instance(); +#if defined(TILEDARRAY_HAS_CUDA) cuBLASHandlePool::handle(); +#endif // initialize LibreTT librettInitialize(); } /// finalize cuda environment -inline void cuda_finalize() { - CudaSafeCall(cudaDeviceSynchronize()); +inline void device_finalize() { + DeviceSafeCall(device::deviceSynchronize()); librettFinalize(); +#if defined(TILEDARRAY_HAS_CUDA) cublasDestroy(cuBLASHandlePool::handle()); delete &cuBLASHandlePool::handle(); - // although TA::cudaEnv is a singleton, must explicitly delete it so - // that CUDA runtime is not finalized before the cudaEnv dtor is called - cudaEnv::instance().reset(nullptr); -} #endif + // although TA::deviceEnv is a singleton, must explicitly delete it so + // that the device runtime is not finalized before the deviceEnv dtor is + // called + deviceEnv::instance().reset(nullptr); +} +#endif // TILEDARRAY_HAS_DEVICE inline bool& initialized_madworld_accessor() { static bool flag = false; @@ -102,8 +108,8 @@ TiledArray::World& TiledArray::initialize(int& argc, char**& argv, ? madness::initialize(argc, argv, comm, quiet) : *madness::World::find_instance(comm); TiledArray::set_default_world(default_world); -#ifdef TILEDARRAY_HAS_CUDA - TiledArray::cuda_initialize(); +#ifdef TILEDARRAY_HAS_DEVICE + TiledArray::device_initialize(); #endif TiledArray::max_threads = TiledArray::get_num_threads(); TiledArray::set_num_threads(1); @@ -164,8 +170,8 @@ void TiledArray::finalize() { TiledArray::set_num_threads(TiledArray::max_threads); TiledArray::get_default_world().gop.fence(); // this should ensure no pending // tasks using cuda allocators -#ifdef TILEDARRAY_HAS_CUDA - TiledArray::cuda_finalize(); +#ifdef TILEDARRAY_HAS_DEVICE + TiledArray::device_finalize(); #endif if (initialized_madworld()) { madness::finalize(); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 217c522018..d17dd8345b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -102,8 +102,12 @@ set(ta_test_src_files ta_test.cpp cp.cpp ) +if(CUDA_FOUND OR HIP_FOUND) + list(APPEND ta_test_src_files librett.cpp) +endif() + if(CUDA_FOUND) - list(APPEND ta_test_src_files librett.cpp expressions_cuda_um.cpp tensor_um.cpp) + list(APPEND ta_test_src_files expressions_cuda_um.cpp tensor_um.cpp) endif() # if using C++20 must use Boost 1.74 or later: diff --git a/tests/expressions_cuda_um.cpp b/tests/expressions_cuda_um.cpp index a17b749789..29408c27c2 100644 --- a/tests/expressions_cuda_um.cpp +++ b/tests/expressions_cuda_um.cpp @@ -27,7 +27,7 @@ #ifdef TILEDARRAY_HAS_CUDA -#include +#include #include #include #include "unit_test_config.h" @@ -76,8 +76,8 @@ struct UMExpressionsFixture : public TiledRangeFixture { static UMTensor permute_fn(const madness::Future& tensor_f, const Permutation& perm) { - return madness::add_cuda_task(*GlobalFixture::world, permute_task, tensor_f, - perm) + return madness::add_device_task(*GlobalFixture::world, permute_task, + tensor_f, perm) .get(); } diff --git a/tests/librett.cpp b/tests/librett.cpp index 3785071071..ced23239fd 100644 --- a/tests/librett.cpp +++ b/tests/librett.cpp @@ -22,9 +22,9 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE -#include +#include #include "unit_test_config.h" struct LibreTTFixture { @@ -69,7 +69,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { TiledArray::permutation_to_col_major(perm); librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; status = @@ -118,7 +118,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { cudaMemcpy(a_device, a_host, A * B * sizeof(int), cudaMemcpyHostToDevice); librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; std::vector extent({B, A}); @@ -177,7 +177,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { // b(j,i,k) = a(i,j,k) librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; std::vector extent3{int(A), int(B), int(C)}; @@ -240,7 +240,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { // b(j,i,k) = a(i,j,k) librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; std::vector extent({A, B, C}); @@ -297,7 +297,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { } librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; std::vector extent({A, A}); @@ -347,7 +347,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { } librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; std::vector extent({B, A}); @@ -397,7 +397,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { } librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; // b(k,i,j) = a(i,j,k) diff --git a/tests/tensor_um.cpp b/tests/tensor_um.cpp index 33efbfd7d4..d860b7c813 100644 --- a/tests/tensor_um.cpp +++ b/tests/tensor_um.cpp @@ -18,7 +18,7 @@ * Chong Peng on 9/19/18. */ -#include +#include #include "global_fixture.h" #include "unit_test_config.h"