Skip to content

Commit

Permalink
phase 1 of cuda -> device conversion
Browse files Browse the repository at this point in the history
- cuda.h -> device.h
- {cuda,hip} -> device for many API elements
  • Loading branch information
evaleev committed Sep 18, 2023
1 parent e03b17d commit 4ae7959
Show file tree
Hide file tree
Showing 45 changed files with 1,084 additions and 1,344 deletions.
2 changes: 1 addition & 1 deletion examples/cuda/cuda_librett.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

#ifdef TILEDARRAY_HAS_CUDA

#include <TiledArray/cuda/btas_um_tensor.h>
#include <TiledArray/device/btas_um_tensor.h>
#include <tiledarray.h>

#include <iostream>
Expand Down
12 changes: 6 additions & 6 deletions examples/cuda/cuda_task.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
// Created by Chong Peng on 11/14/18.
//

#include <TiledArray/cuda/btas_um_tensor.h>
#include <TiledArray/cuda/cuda_task_fn.h>
#include <TiledArray/device/btas_um_tensor.h>
#include <TiledArray/device/device_task_fn.h>
#include <tiledarray.h>

using value_type = double;
Expand All @@ -28,8 +28,8 @@ void verify(const tile_type& tile, value_type value, std::size_t index) {

tile_type scale(const tile_type& arg, value_type a, const cudaStream_t* stream,
std::size_t index) {
CudaSafeCall(
cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id()));
DeviceSafeCall(device::setDevice(
TiledArray::deviceEnv::instance()->current_device_id()));
/// make result Tensor
using Storage = typename tile_type::tensor_type::storage_type;
Storage result_storage;
Expand Down Expand Up @@ -81,7 +81,7 @@ void process_task(madness::World* world,
tile_type (*scale_fn)(const tile_type&, double, const cudaStream_t*,
std::size_t) = &::scale;

madness::Future<tile_type> scale_future = madness::add_cuda_task(
madness::Future<tile_type> scale_future = madness::add_device_task(
*world, ::scale, tensor, scale_factor, &stream, ntask * iter + i);

/// this should start until scale_taskfn is finished
Expand All @@ -98,7 +98,7 @@ int try_main(int argc, char** argv) {
std::vector<cudaStream_t> streams(n_stream);
for (auto& stream : streams) {
// create the streams
CudaSafeCall(cudaStreamCreate(&stream));
DeviceSafeCall(cudaStreamCreate(&stream));
// std::cout << "stream: " << stream << "\n";
}

Expand Down
4 changes: 2 additions & 2 deletions examples/cuda/ta_cc_abcd_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
*
*/

#include <TiledArray/cuda/btas_um_tensor.h>
#include <TiledArray/device/btas_um_tensor.h>
#include <TiledArray/version.h>
#include <tiledarray.h>
#include <iostream>
Expand Down Expand Up @@ -186,7 +186,7 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
std::pow(n_uocc, 4) / std::pow(1024., 3);

using CUDATile =
btas::Tensor<T, TA::Range, TiledArray::cuda_um_btas_varray<T>>;
btas::Tensor<T, TA::Range, TiledArray::device_um_btas_varray<T>>;
using CUDAMatrix = TA::DistArray<TA::Tile<CUDATile>>;

// Construct tensors
Expand Down
42 changes: 21 additions & 21 deletions examples/cuda/ta_dense_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
// clang-format off

#include <tiledarray.h>
#include <TiledArray/cuda/btas_um_tensor.h>
#include "TiledArray/cuda/cpu_cuda_vector.h"
#include <TiledArray/device/btas_um_tensor.h>
#include "TiledArray/device/cpu_cuda_vector.h"
#include <TiledArray/external/btas.h>
// clang-format on

Expand Down Expand Up @@ -98,7 +98,7 @@ void to_host(
// do norm on GPU
auto tile_norm = norm(tile.tensor());

TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
tile.tensor().storage(), stream);

return tile_norm;
Expand All @@ -120,7 +120,7 @@ void to_device(
btas::Tensor<T, Range, TiledArray::cpu_cuda_vector<T>>> &tile) {
auto &stream = detail::get_stream_based_on_range(tile.range());

TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(
tile.tensor().storage(), stream);

return norm(tile.tensor());
Expand Down Expand Up @@ -218,7 +218,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
using PinnedTile =
btas::Tensor<T, TA::Range,
::btas::varray<typename Storage::value_type,
TiledArray::cuda_pinned_allocator<T>>>;
TiledArray::device_pinned_allocator<T>>>;
using PinnedMatrix = TA::DistArray<TA::Tile<PinnedTile>>;
// using TAMatrix = TA::DistArray<TA::Tensor<T>>;

Expand Down Expand Up @@ -339,7 +339,7 @@ int try_main(int argc, char **argv) {
<< std::endl
<< "Usage: " << argv[0]
<< " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [scalar = double] "
"[storage type = cuda_um_btas_varray]\n";
"[storage type = device_um_btas_varray]\n";
return 0;
}
const long Nm = atol(argv[1]);
Expand Down Expand Up @@ -376,15 +376,15 @@ int try_main(int argc, char **argv) {
return 1;
}

const auto storage_type =
(argc >= 10) ? std::string(argv[9]) : std::string{"cuda_um_btas_varray"};
const auto storage_type = (argc >= 10) ? std::string(argv[9])
: std::string{"device_um_btas_varray"};

if (storage_type != "cuda_um_btas_varray" &&
if (storage_type != "device_um_btas_varray" &&
storage_type != "cuda_um_thrust_vector" &&
storage_type != "cpu_cuda_vector") {
std::cerr << "Error: invalid storage type: " << storage_type
<< "\n Valid option includes: cuda_um_vector or "
"cuda_um_btas_varray or cuda_um_thrust_vector "
"device_um_btas_varray or cuda_um_thrust_vector "
"or cpu_cuda_vector. \n";
}
std::cout << "Storage type: " << storage_type << "<" << scalar_type_str << ">"
Expand All @@ -407,13 +407,13 @@ int try_main(int argc, char **argv) {
<< runtimeVersion << std::endl;

{ // print device properties
int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices();
int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices();

if (num_cuda_devices <= 0) {
throw std::runtime_error("No CUDA-Enabled GPUs Found!\n");
}

int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id();
int cuda_device_id = TA::deviceEnv::instance()->current_device_id();

int mpi_size = world.size();
int mpi_rank = world.rank();
Expand All @@ -440,9 +440,9 @@ int try_main(int argc, char **argv) {
error = cudaDeviceGetAttribute(
&result, cudaDevAttrConcurrentManagedAccess, cuda_device_id);
std::cout << " attrConcurrentManagedAccess = " << result << std::endl;
error = cudaSetDevice(cuda_device_id);
error = device::setDevice(cuda_device_id);
if (error != cudaSuccess) {
std::cout << "error(cudaSetDevice) = " << error << std::endl;
std::cout << "error(device::setDevice) = " << error << std::endl;
}
size_t free_mem, total_mem;
error = cudaMemGetInfo(&free_mem, &total_mem);
Expand All @@ -462,19 +462,19 @@ int try_main(int argc, char **argv) {
// do_main_body<TiledArray::cpu_cuda_vector<float>>(world, Nm, Bm, Nn,
// Bn,
// Nk, Bk, nrepeat);
// } else if (storage_type == "cuda_um_btas_varray") {
if (storage_type == "cuda_um_btas_varray") {
// } else if (storage_type == "device_um_btas_varray") {
if (storage_type == "device_um_btas_varray") {
if (scalar_type_str == "double")
do_main_body<TiledArray::cuda_um_btas_varray<double>>(
do_main_body<TiledArray::device_um_btas_varray<double>>(
world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
else if (scalar_type_str == "float")
do_main_body<TiledArray::cuda_um_btas_varray<float>>(world, Nm, Bm, Nn,
Bn, Nk, Bk, nrepeat);
do_main_body<TiledArray::device_um_btas_varray<float>>(
world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
else if (scalar_type_str == "zdouble")
do_main_body<TiledArray::cuda_um_btas_varray<std::complex<double>>>(
do_main_body<TiledArray::device_um_btas_varray<std::complex<double>>>(
world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
else if (scalar_type_str == "zfloat")
do_main_body<TiledArray::cuda_um_btas_varray<std::complex<float>>>(
do_main_body<TiledArray::device_um_btas_varray<std::complex<float>>>(
world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
else {
abort(); // unreachable
Expand Down
10 changes: 5 additions & 5 deletions examples/cuda/ta_reduce_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
// clang-format off

#include <tiledarray.h>
#include <TiledArray/cuda/btas_um_tensor.h>
#include <TiledArray/device/btas_um_tensor.h>
// clang-format on

template <typename Tile>
Expand Down Expand Up @@ -298,13 +298,13 @@ int try_main(int argc, char **argv) {
<< runtimeVersion << std::endl;

{ // print device properties
int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices();
int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices();

if (num_cuda_devices <= 0) {
throw std::runtime_error("No CUDA-Enabled GPUs Found!\n");
}

int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id();
int cuda_device_id = TA::deviceEnv::instance()->current_device_id();

int mpi_size = world.size();
int mpi_rank = world.rank();
Expand All @@ -331,9 +331,9 @@ int try_main(int argc, char **argv) {
error = cudaDeviceGetAttribute(
&result, cudaDevAttrConcurrentManagedAccess, cuda_device_id);
std::cout << " attrConcurrentManagedAccess = " << result << std::endl;
error = cudaSetDevice(cuda_device_id);
error = device::setDevice(cuda_device_id);
if (error != cudaSuccess) {
std::cout << "error(cudaSetDevice) = " << error << std::endl;
std::cout << "error(device::setDevice) = " << error << std::endl;
}
size_t free_mem, total_mem;
error = cudaMemGetInfo(&free_mem, &total_mem);
Expand Down
12 changes: 6 additions & 6 deletions examples/cuda/ta_vector_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
// clang-format off

#include <tiledarray.h>
#include <TiledArray/cuda/btas_um_tensor.h>
#include "TiledArray/cuda/cpu_cuda_vector.h"
#include <TiledArray/device/btas_um_tensor.h>
#include "TiledArray/device/cpu_cuda_vector.h"
#include <TiledArray/external/btas.h>
// clang-format on

Expand Down Expand Up @@ -316,13 +316,13 @@ int try_main(int argc, char **argv) {
<< runtimeVersion << std::endl;

{ // print device properties
int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices();
int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices();

if (num_cuda_devices <= 0) {
throw std::runtime_error("No CUDA-Enabled GPUs Found!\n");
}

int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id();
int cuda_device_id = TA::deviceEnv::instance()->current_device_id();

int mpi_size = world.size();
int mpi_rank = world.rank();
Expand All @@ -349,9 +349,9 @@ int try_main(int argc, char **argv) {
error = cudaDeviceGetAttribute(
&result, cudaDevAttrConcurrentManagedAccess, cuda_device_id);
std::cout << " attrConcurrentManagedAccess = " << result << std::endl;
error = cudaSetDevice(cuda_device_id);
error = device::setDevice(cuda_device_id);
if (error != cudaSuccess) {
std::cout << "error(cudaSetDevice) = " << error << std::endl;
std::cout << "error(device::setDevice) = " << error << std::endl;
}
size_t free_mem, total_mem;
error = cudaMemGetInfo(&free_mem, &total_mem);
Expand Down
54 changes: 25 additions & 29 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -207,34 +207,30 @@ TiledArray/util/time.h
TiledArray/util/vector.h
)

if(CUDA_FOUND)
list(APPEND TILEDARRAY_HEADER_FILES
TiledArray/external/cuda.h
TiledArray/cuda/cublas.h
TiledArray/cuda/btas_cublas.h
TiledArray/cuda/btas_um_tensor.h
TiledArray/cuda/cpu_cuda_vector.h
TiledArray/cuda/cuda_task_fn.h
TiledArray/cuda/kernel/mult_kernel.h
TiledArray/cuda/kernel/mult_kernel_impl.h
TiledArray/cuda/kernel/reduce_kernel.h
TiledArray/cuda/kernel/reduce_kernel_impl.h
TiledArray/cuda/platform.h
TiledArray/cuda/thrust.h
TiledArray/cuda/allocators.h
TiledArray/cuda/um_storage.h)
endif(CUDA_FOUND)

if(HIP_FOUND)
list(APPEND TILEDARRAY_HEADER_FILES
TiledArray/external/hip.h)
endif(CUDA_FOUND)

if(HIP_FOUND OR CUDA_FOUND)
list(APPEND TILEDARRAY_HEADER_FILES
TiledArray/external/device.h
TiledArray/external/librett.h)
endif()

if(CUDA_FOUND)
list(APPEND TILEDARRAY_HEADER_FILES
TiledArray/external/cuda.h
TiledArray/device/cublas.h
TiledArray/device/btas_cublas.h
TiledArray/device/btas_um_tensor.h
TiledArray/device/cpu_cuda_vector.h
TiledArray/device/device_task_fn.h
TiledArray/device/kernel/mult_kernel.h
TiledArray/device/kernel/mult_kernel_impl.h
TiledArray/device/kernel/reduce_kernel.h
TiledArray/device/kernel/reduce_kernel_impl.h
TiledArray/device/platform.h
TiledArray/device/thrust.h
TiledArray/device/allocators.h
TiledArray/device/um_storage.h)
endif(CUDA_FOUND)

set(TILEDARRAY_SOURCE_FILES
TiledArray/tiledarray.cpp
TiledArray/tensor/tensor.cpp
Expand Down Expand Up @@ -263,11 +259,11 @@ set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers
if(CUDA_FOUND)

set(TILEDARRAY_CUDA_SOURCE_FILES
TiledArray/cuda/btas_um_tensor.cpp
TiledArray/cuda/cpu_cuda_vector.cu
TiledArray/cuda/kernel/mult_kernel.cu
TiledArray/cuda/kernel/reduce_kernel.cu
TiledArray/cuda/um_storage.cu)
TiledArray/device/btas_um_tensor.cpp
TiledArray/device/cpu_cuda_vector.cu
TiledArray/device/kernel/mult_kernel.cu
TiledArray/device/kernel/reduce_kernel.cu
TiledArray/device/um_storage.cu)

list(APPEND TILEDARRAY_SOURCE_FILES "${TILEDARRAY_CUDA_SOURCE_FILES}")

Expand All @@ -277,7 +273,7 @@ if(CUDA_FOUND)
INCLUDE_DIRECTORIES "${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}")
endforeach()

set_source_files_properties(TiledArray/cuda/btas_um_tensor.cpp
set_source_files_properties(TiledArray/device/btas_um_tensor.cpp
PROPERTIES
LANGUAGE CUDA)

Expand Down
11 changes: 11 additions & 0 deletions src/TiledArray/config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,17 @@
/* Define if TiledArray configured with HIP support */
#cmakedefine TILEDARRAY_HAS_HIP @TILEDARRAY_HAS_HIP@

// Umpire and LibreTT limited to 1 device runtime at a time, so is TA
#if defined(TILEDARRAY_HAS_HIP)
# define TILEDARRAY_HAS_DEVICE 1
# define TILEDARRAY_DEVICE_RUNTIME HIP
# define TILEDARRAY_DEVICE_RUNTIME_STR "HIP"
#elif defined(TILEDARRAY_HAS_CUDA)
# define TILEDARRAY_HAS_DEVICE 1
# define TILEDARRAY_DEVICE_RUNTIME CUDA
# define TILEDARRAY_DEVICE_RUNTIME_STR "CUDA"
#endif

/* Is TA::Tensor memory profiling enabled? */
#cmakedefine TA_TENSOR_MEM_PROFILE 1

Expand Down
Loading

0 comments on commit 4ae7959

Please sign in to comment.