From 8beeb6e546da36483a2d3feb3a1fa3071cb5938c Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 18 Sep 2023 17:05:56 -0400
Subject: [PATCH] phase 1 of cuda -> device conversion

- cuda.h -> device.h
- {cuda,hip} -> device for many API elements
---
 examples/cuda/cuda_librett.cpp                |   2 +-
 examples/cuda/cuda_task.cpp                   |  12 +-
 examples/cuda/ta_cc_abcd_cuda.cpp             |   4 +-
 examples/cuda/ta_dense_cuda.cpp               |  42 +-
 examples/cuda/ta_reduce_cuda.cpp              |  10 +-
 examples/cuda/ta_vector_cuda.cpp              |  12 +-
 src/CMakeLists.txt                            |  54 +-
 src/TiledArray/config.h.in                    |  11 +
 src/TiledArray/cuda/btas_um_tensor.cpp        |  51 --
 src/TiledArray/{cuda => device}/allocators.h  |  53 +-
 src/TiledArray/{cuda => device}/btas_cublas.h | 288 +++++-----
 src/TiledArray/device/btas_um_tensor.cpp      |  52 ++
 .../{cuda => device}/btas_um_tensor.h         | 210 +++----
 .../{cuda => device}/cpu_cuda_vector.cu       |   2 +-
 .../{cuda => device}/cpu_cuda_vector.h        |  12 +-
 src/TiledArray/{cuda => device}/cublas.h      |   0
 .../device_task_fn.h}                         | 287 +++++-----
 .../{cuda => device}/kernel/mult_kernel.cu    |   4 +-
 .../{cuda => device}/kernel/mult_kernel.h     |   0
 .../kernel/mult_kernel_impl.h                 |   4 +-
 .../{cuda => device}/kernel/reduce_kernel.cu  |   4 +-
 .../{cuda => device}/kernel/reduce_kernel.h   |   0
 .../kernel/reduce_kernel_impl.h               |   8 +-
 src/TiledArray/{cuda => device}/platform.h    |   8 +-
 src/TiledArray/{cuda => device}/thrust.h      |   0
 src/TiledArray/{cuda => device}/um_storage.cu |  16 +-
 src/TiledArray/{cuda => device}/um_storage.h  |  39 +-
 src/TiledArray/dist_eval/binary_eval.h        |   8 +-
 src/TiledArray/dist_eval/contraction_eval.h   |  19 +-
 src/TiledArray/dist_eval/dist_eval.h          |   2 +-
 src/TiledArray/dist_eval/unary_eval.h         |   6 +-
 src/TiledArray/expressions/expr.h             |  22 +-
 src/TiledArray/external/cuda.h                | 512 +-----------------
 src/TiledArray/external/{hip.h => device.h}   | 371 ++++++++++---
 src/TiledArray/external/umpire.h              |  32 +-
 src/TiledArray/fwd.h                          |  39 +-
 src/TiledArray/host/allocator.h               |   9 +-
 src/TiledArray/host/env.h                     |   4 +-
 src/TiledArray/reduce_task.h                  | 111 ++--
 src/TiledArray/tensor/type_traits.h           |  10 +-
 src/TiledArray/tiledarray.cpp                 |  40 +-
 tests/CMakeLists.txt                          |   6 +-
 tests/expressions_cuda_um.cpp                 |   6 +-
 tests/librett.cpp                             |  18 +-
 tests/tensor_um.cpp                           |   2 +-
 45 files changed, 1071 insertions(+), 1331 deletions(-)
 delete mode 100644 src/TiledArray/cuda/btas_um_tensor.cpp
 rename src/TiledArray/{cuda => device}/allocators.h (63%)
 rename src/TiledArray/{cuda => device}/btas_cublas.h (66%)
 create mode 100644 src/TiledArray/device/btas_um_tensor.cpp
 rename src/TiledArray/{cuda => device}/btas_um_tensor.h (81%)
 rename src/TiledArray/{cuda => device}/cpu_cuda_vector.cu (98%)
 rename src/TiledArray/{cuda => device}/cpu_cuda_vector.h (96%)
 rename src/TiledArray/{cuda => device}/cublas.h (100%)
 rename src/TiledArray/{cuda/cuda_task_fn.h => device/device_task_fn.h} (72%)
 rename src/TiledArray/{cuda => device}/kernel/mult_kernel.cu (96%)
 rename src/TiledArray/{cuda => device}/kernel/mult_kernel.h (100%)
 rename src/TiledArray/{cuda => device}/kernel/mult_kernel_impl.h (95%)
 rename src/TiledArray/{cuda => device}/kernel/reduce_kernel.cu (98%)
 rename src/TiledArray/{cuda => device}/kernel/reduce_kernel.h (100%)
 rename src/TiledArray/{cuda => device}/kernel/reduce_kernel_impl.h (95%)
 rename src/TiledArray/{cuda => device}/platform.h (93%)
 rename src/TiledArray/{cuda => device}/thrust.h (100%)
 rename src/TiledArray/{cuda => device}/um_storage.cu (66%)
 rename src/TiledArray/{cuda => device}/um_storage.h (78%)
 rename src/TiledArray/external/{hip.h => device.h} (53%)

diff --git a/examples/cuda/cuda_librett.cpp b/examples/cuda/cuda_librett.cpp
index c513f41af1..1460f54117 100644
--- a/examples/cuda/cuda_librett.cpp
+++ b/examples/cuda/cuda_librett.cpp
@@ -23,7 +23,7 @@
 
 #ifdef TILEDARRAY_HAS_CUDA
 
-#include <TiledArray/cuda/btas_um_tensor.h>
+#include <TiledArray/device/btas_um_tensor.h>
 #include <tiledarray.h>
 
 #include <iostream>
diff --git a/examples/cuda/cuda_task.cpp b/examples/cuda/cuda_task.cpp
index a019523b6e..f2b0b2ab1b 100644
--- a/examples/cuda/cuda_task.cpp
+++ b/examples/cuda/cuda_task.cpp
@@ -2,8 +2,8 @@
 // Created by Chong Peng on 11/14/18.
 //
 
-#include <TiledArray/cuda/btas_um_tensor.h>
-#include <TiledArray/cuda/cuda_task_fn.h>
+#include <TiledArray/device/btas_um_tensor.h>
+#include <TiledArray/device/device_task_fn.h>
 #include <tiledarray.h>
 
 using value_type = double;
@@ -28,8 +28,8 @@ void verify(const tile_type& tile, value_type value, std::size_t index) {
 
 tile_type scale(const tile_type& arg, value_type a, const cudaStream_t* stream,
                 std::size_t index) {
-  CudaSafeCall(
-      cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id()));
+  DeviceSafeCall(device::setDevice(
+      TiledArray::deviceEnv::instance()->current_device_id()));
   /// make result Tensor
   using Storage = typename tile_type::tensor_type::storage_type;
   Storage result_storage;
@@ -81,7 +81,7 @@ void process_task(madness::World* world,
     tile_type (*scale_fn)(const tile_type&, double, const cudaStream_t*,
                           std::size_t) = &::scale;
 
-    madness::Future<tile_type> scale_future = madness::add_cuda_task(
+    madness::Future<tile_type> scale_future = madness::add_device_task(
         *world, ::scale, tensor, scale_factor, &stream, ntask * iter + i);
 
     /// this should start until scale_taskfn is finished
@@ -98,7 +98,7 @@ int try_main(int argc, char** argv) {
   std::vector<cudaStream_t> streams(n_stream);
   for (auto& stream : streams) {
     // create the streams
-    CudaSafeCall(cudaStreamCreate(&stream));
+    DeviceSafeCall(cudaStreamCreate(&stream));
     //    std::cout << "stream: " << stream << "\n";
   }
 
diff --git a/examples/cuda/ta_cc_abcd_cuda.cpp b/examples/cuda/ta_cc_abcd_cuda.cpp
index 0887c90562..b531dee495 100644
--- a/examples/cuda/ta_cc_abcd_cuda.cpp
+++ b/examples/cuda/ta_cc_abcd_cuda.cpp
@@ -17,7 +17,7 @@
  *
  */
 
-#include <TiledArray/cuda/btas_um_tensor.h>
+#include <TiledArray/device/btas_um_tensor.h>
 #include <TiledArray/version.h>
 #include <tiledarray.h>
 #include <iostream>
@@ -186,7 +186,7 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
                          std::pow(n_uocc, 4) / std::pow(1024., 3);
 
   using CUDATile =
-      btas::Tensor<T, TA::Range, TiledArray::cuda_um_btas_varray<T>>;
+      btas::Tensor<T, TA::Range, TiledArray::device_um_btas_varray<T>>;
   using CUDAMatrix = TA::DistArray<TA::Tile<CUDATile>>;
 
   // Construct tensors
diff --git a/examples/cuda/ta_dense_cuda.cpp b/examples/cuda/ta_dense_cuda.cpp
index ab8c118622..864938302c 100644
--- a/examples/cuda/ta_dense_cuda.cpp
+++ b/examples/cuda/ta_dense_cuda.cpp
@@ -24,8 +24,8 @@
 // clang-format off
 
 #include <tiledarray.h>
-#include <TiledArray/cuda/btas_um_tensor.h>
-#include "TiledArray/cuda/cpu_cuda_vector.h"
+#include <TiledArray/device/btas_um_tensor.h>
+#include "TiledArray/device/cpu_cuda_vector.h"
 #include <TiledArray/external/btas.h>
 // clang-format on
 
@@ -98,7 +98,7 @@ void to_host(
         // do norm on GPU
         auto tile_norm = norm(tile.tensor());
 
-        TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
+        TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
             tile.tensor().storage(), stream);
 
         return tile_norm;
@@ -120,7 +120,7 @@ void to_device(
           btas::Tensor<T, Range, TiledArray::cpu_cuda_vector<T>>> &tile) {
         auto &stream = detail::get_stream_based_on_range(tile.range());
 
-        TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
+        TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(
             tile.tensor().storage(), stream);
 
         return norm(tile.tensor());
@@ -218,7 +218,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
   using PinnedTile =
       btas::Tensor<T, TA::Range,
                    ::btas::varray<typename Storage::value_type,
-                                  TiledArray::cuda_pinned_allocator<T>>>;
+                                  TiledArray::device_pinned_allocator<T>>>;
   using PinnedMatrix = TA::DistArray<TA::Tile<PinnedTile>>;
   // using TAMatrix = TA::DistArray<TA::Tensor<T>>;
 
@@ -339,7 +339,7 @@ int try_main(int argc, char **argv) {
               << std::endl
               << "Usage: " << argv[0]
               << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [scalar = double] "
-                 "[storage type = cuda_um_btas_varray]\n";
+                 "[storage type = device_um_btas_varray]\n";
     return 0;
   }
   const long Nm = atol(argv[1]);
@@ -376,15 +376,15 @@ int try_main(int argc, char **argv) {
     return 1;
   }
 
-  const auto storage_type =
-      (argc >= 10) ? std::string(argv[9]) : std::string{"cuda_um_btas_varray"};
+  const auto storage_type = (argc >= 10) ? std::string(argv[9])
+                                         : std::string{"device_um_btas_varray"};
 
-  if (storage_type != "cuda_um_btas_varray" &&
+  if (storage_type != "device_um_btas_varray" &&
       storage_type != "cuda_um_thrust_vector" &&
       storage_type != "cpu_cuda_vector") {
     std::cerr << "Error: invalid storage type: " << storage_type
               << "\n Valid option includes: cuda_um_vector or "
-                 "cuda_um_btas_varray or cuda_um_thrust_vector "
+                 "device_um_btas_varray or cuda_um_thrust_vector "
                  "or cpu_cuda_vector. \n";
   }
   std::cout << "Storage type: " << storage_type << "<" << scalar_type_str << ">"
@@ -407,13 +407,13 @@ int try_main(int argc, char **argv) {
             << runtimeVersion << std::endl;
 
   {  // print device properties
-    int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices();
+    int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices();
 
     if (num_cuda_devices <= 0) {
       throw std::runtime_error("No CUDA-Enabled GPUs Found!\n");
     }
 
-    int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id();
+    int cuda_device_id = TA::deviceEnv::instance()->current_device_id();
 
     int mpi_size = world.size();
     int mpi_rank = world.rank();
@@ -440,9 +440,9 @@ int try_main(int argc, char **argv) {
         error = cudaDeviceGetAttribute(
             &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id);
         std::cout << "  attrConcurrentManagedAccess = " << result << std::endl;
-        error = cudaSetDevice(cuda_device_id);
+        error = device::setDevice(cuda_device_id);
         if (error != cudaSuccess) {
-          std::cout << "error(cudaSetDevice) = " << error << std::endl;
+          std::cout << "error(device::setDevice) = " << error << std::endl;
         }
         size_t free_mem, total_mem;
         error = cudaMemGetInfo(&free_mem, &total_mem);
@@ -462,19 +462,19 @@ int try_main(int argc, char **argv) {
   //      do_main_body<TiledArray::cpu_cuda_vector<float>>(world, Nm, Bm, Nn,
   //      Bn,
   //                                                       Nk, Bk, nrepeat);
-  //  } else if (storage_type == "cuda_um_btas_varray") {
-  if (storage_type == "cuda_um_btas_varray") {
+  //  } else if (storage_type == "device_um_btas_varray") {
+  if (storage_type == "device_um_btas_varray") {
     if (scalar_type_str == "double")
-      do_main_body<TiledArray::cuda_um_btas_varray<double>>(
+      do_main_body<TiledArray::device_um_btas_varray<double>>(
           world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
     else if (scalar_type_str == "float")
-      do_main_body<TiledArray::cuda_um_btas_varray<float>>(world, Nm, Bm, Nn,
-                                                           Bn, Nk, Bk, nrepeat);
+      do_main_body<TiledArray::device_um_btas_varray<float>>(
+          world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
     else if (scalar_type_str == "zdouble")
-      do_main_body<TiledArray::cuda_um_btas_varray<std::complex<double>>>(
+      do_main_body<TiledArray::device_um_btas_varray<std::complex<double>>>(
           world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
     else if (scalar_type_str == "zfloat")
-      do_main_body<TiledArray::cuda_um_btas_varray<std::complex<float>>>(
+      do_main_body<TiledArray::device_um_btas_varray<std::complex<float>>>(
           world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
     else {
       abort();  // unreachable
diff --git a/examples/cuda/ta_reduce_cuda.cpp b/examples/cuda/ta_reduce_cuda.cpp
index e453069892..b475ff78ef 100644
--- a/examples/cuda/ta_reduce_cuda.cpp
+++ b/examples/cuda/ta_reduce_cuda.cpp
@@ -24,7 +24,7 @@
 // clang-format off
 
 #include <tiledarray.h>
-#include <TiledArray/cuda/btas_um_tensor.h>
+#include <TiledArray/device/btas_um_tensor.h>
 // clang-format on
 
 template <typename Tile>
@@ -298,13 +298,13 @@ int try_main(int argc, char **argv) {
             << runtimeVersion << std::endl;
 
   {  // print device properties
-    int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices();
+    int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices();
 
     if (num_cuda_devices <= 0) {
       throw std::runtime_error("No CUDA-Enabled GPUs Found!\n");
     }
 
-    int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id();
+    int cuda_device_id = TA::deviceEnv::instance()->current_device_id();
 
     int mpi_size = world.size();
     int mpi_rank = world.rank();
@@ -331,9 +331,9 @@ int try_main(int argc, char **argv) {
         error = cudaDeviceGetAttribute(
             &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id);
         std::cout << "  attrConcurrentManagedAccess = " << result << std::endl;
-        error = cudaSetDevice(cuda_device_id);
+        error = device::setDevice(cuda_device_id);
         if (error != cudaSuccess) {
-          std::cout << "error(cudaSetDevice) = " << error << std::endl;
+          std::cout << "error(device::setDevice) = " << error << std::endl;
         }
         size_t free_mem, total_mem;
         error = cudaMemGetInfo(&free_mem, &total_mem);
diff --git a/examples/cuda/ta_vector_cuda.cpp b/examples/cuda/ta_vector_cuda.cpp
index 1593a68e8b..a82a057807 100644
--- a/examples/cuda/ta_vector_cuda.cpp
+++ b/examples/cuda/ta_vector_cuda.cpp
@@ -24,8 +24,8 @@
 // clang-format off
 
 #include <tiledarray.h>
-#include <TiledArray/cuda/btas_um_tensor.h>
-#include "TiledArray/cuda/cpu_cuda_vector.h"
+#include <TiledArray/device/btas_um_tensor.h>
+#include "TiledArray/device/cpu_cuda_vector.h"
 #include <TiledArray/external/btas.h>
 // clang-format on
 
@@ -316,13 +316,13 @@ int try_main(int argc, char **argv) {
             << runtimeVersion << std::endl;
 
   {  // print device properties
-    int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices();
+    int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices();
 
     if (num_cuda_devices <= 0) {
       throw std::runtime_error("No CUDA-Enabled GPUs Found!\n");
     }
 
-    int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id();
+    int cuda_device_id = TA::deviceEnv::instance()->current_device_id();
 
     int mpi_size = world.size();
     int mpi_rank = world.rank();
@@ -349,9 +349,9 @@ int try_main(int argc, char **argv) {
         error = cudaDeviceGetAttribute(
             &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id);
         std::cout << "  attrConcurrentManagedAccess = " << result << std::endl;
-        error = cudaSetDevice(cuda_device_id);
+        error = device::setDevice(cuda_device_id);
         if (error != cudaSuccess) {
-          std::cout << "error(cudaSetDevice) = " << error << std::endl;
+          std::cout << "error(device::setDevice) = " << error << std::endl;
         }
         size_t free_mem, total_mem;
         error = cudaMemGetInfo(&free_mem, &total_mem);
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e4a3b0211e..5dd9c234fd 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -207,34 +207,30 @@ TiledArray/util/time.h
 TiledArray/util/vector.h
 )
 
-if(CUDA_FOUND)
-  list(APPEND TILEDARRAY_HEADER_FILES
-     TiledArray/external/cuda.h
-     TiledArray/cuda/cublas.h
-     TiledArray/cuda/btas_cublas.h
-     TiledArray/cuda/btas_um_tensor.h
-     TiledArray/cuda/cpu_cuda_vector.h
-     TiledArray/cuda/cuda_task_fn.h
-     TiledArray/cuda/kernel/mult_kernel.h
-     TiledArray/cuda/kernel/mult_kernel_impl.h
-     TiledArray/cuda/kernel/reduce_kernel.h
-     TiledArray/cuda/kernel/reduce_kernel_impl.h
-     TiledArray/cuda/platform.h
-     TiledArray/cuda/thrust.h
-     TiledArray/cuda/allocators.h
-     TiledArray/cuda/um_storage.h)
-endif(CUDA_FOUND)
-
-if(HIP_FOUND)
-  list(APPEND TILEDARRAY_HEADER_FILES
-          TiledArray/external/hip.h)
-endif(CUDA_FOUND)
-
 if(HIP_FOUND OR CUDA_FOUND)
   list(APPEND TILEDARRAY_HEADER_FILES
+          TiledArray/external/device.h
           TiledArray/external/librett.h)
 endif()
 
+if(CUDA_FOUND)
+  list(APPEND TILEDARRAY_HEADER_FILES
+     TiledArray/external/cuda.h
+     TiledArray/device/cublas.h
+     TiledArray/device/btas_cublas.h
+     TiledArray/device/btas_um_tensor.h
+     TiledArray/device/cpu_cuda_vector.h
+          TiledArray/device/device_task_fn.h
+     TiledArray/device/kernel/mult_kernel.h
+     TiledArray/device/kernel/mult_kernel_impl.h
+     TiledArray/device/kernel/reduce_kernel.h
+     TiledArray/device/kernel/reduce_kernel_impl.h
+     TiledArray/device/platform.h
+     TiledArray/device/thrust.h
+     TiledArray/device/allocators.h
+     TiledArray/device/um_storage.h)
+endif(CUDA_FOUND)
+
 set(TILEDARRAY_SOURCE_FILES
 TiledArray/tiledarray.cpp
 TiledArray/tensor/tensor.cpp
@@ -263,11 +259,11 @@ set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers
 if(CUDA_FOUND)
 
   set(TILEDARRAY_CUDA_SOURCE_FILES
-          TiledArray/cuda/btas_um_tensor.cpp
-          TiledArray/cuda/cpu_cuda_vector.cu
-          TiledArray/cuda/kernel/mult_kernel.cu
-          TiledArray/cuda/kernel/reduce_kernel.cu
-          TiledArray/cuda/um_storage.cu)
+          TiledArray/device/btas_um_tensor.cpp
+          TiledArray/device/cpu_cuda_vector.cu
+          TiledArray/device/kernel/mult_kernel.cu
+          TiledArray/device/kernel/reduce_kernel.cu
+          TiledArray/device/um_storage.cu)
 
   list(APPEND TILEDARRAY_SOURCE_FILES "${TILEDARRAY_CUDA_SOURCE_FILES}")
 
@@ -277,7 +273,7 @@ if(CUDA_FOUND)
           INCLUDE_DIRECTORIES "${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}")
   endforeach()
 
-  set_source_files_properties(TiledArray/cuda/btas_um_tensor.cpp
+  set_source_files_properties(TiledArray/device/btas_um_tensor.cpp
           PROPERTIES
           LANGUAGE CUDA)
 
diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in
index 4cba5ee840..1c38298623 100644
--- a/src/TiledArray/config.h.in
+++ b/src/TiledArray/config.h.in
@@ -80,6 +80,17 @@
 /* Define if TiledArray configured with HIP support */
 #cmakedefine TILEDARRAY_HAS_HIP @TILEDARRAY_HAS_HIP@
 
+// Umpire and LibreTT limited to 1 device runtime at a time, so is TA
+#if defined(TILEDARRAY_HAS_HIP)
+#  define TILEDARRAY_HAS_DEVICE 1
+#  define TILEDARRAY_DEVICE_RUNTIME HIP
+#  define TILEDARRAY_DEVICE_RUNTIME_STR "HIP"
+#elif defined(TILEDARRAY_HAS_CUDA)
+#  define TILEDARRAY_HAS_DEVICE 1
+#  define TILEDARRAY_DEVICE_RUNTIME CUDA
+#  define TILEDARRAY_DEVICE_RUNTIME_STR "CUDA"
+#endif
+
 /* Is TA::Tensor memory profiling enabled? */
 #cmakedefine TA_TENSOR_MEM_PROFILE 1
 
diff --git a/src/TiledArray/cuda/btas_um_tensor.cpp b/src/TiledArray/cuda/btas_um_tensor.cpp
deleted file mode 100644
index 9423e7563d..0000000000
--- a/src/TiledArray/cuda/btas_um_tensor.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-//
-// Created by Chong Peng on 7/24/18.
-//
-
-// clang-format off
-#include <btas/array_adaptor.h>  // provides c++17 features (stds::data, std::size) when compiling CUDA (i.e. c++14)
-#include <TiledArray/cuda/btas_um_tensor.h>
-// clang-format on
-
-#ifdef TILEDARRAY_HAS_CUDA
-
-template class btas::varray<double, TiledArray::cuda_um_allocator<double>>;
-template class btas::varray<float, TiledArray::cuda_um_allocator<float>>;
-template class btas::varray<
-    std::complex<double>, TiledArray::cuda_um_allocator<std::complex<double>>>;
-template class btas::varray<std::complex<float>,
-                            TiledArray::cuda_um_allocator<std::complex<float>>>;
-template class btas::varray<int, TiledArray::cuda_um_allocator<int>>;
-template class btas::varray<long, TiledArray::cuda_um_allocator<long>>;
-
-template class btas::Tensor<double, TiledArray::Range,
-                            TiledArray::cuda_um_btas_varray<double>>;
-template class btas::Tensor<float, TiledArray::Range,
-                            TiledArray::cuda_um_btas_varray<float>>;
-template class btas::Tensor<
-    std::complex<double>, TiledArray::Range,
-    TiledArray::cuda_um_btas_varray<std::complex<double>>>;
-template class btas::Tensor<
-    std::complex<float>, TiledArray::Range,
-    TiledArray::cuda_um_btas_varray<std::complex<float>>>;
-template class btas::Tensor<int, TiledArray::Range,
-                            TiledArray::cuda_um_btas_varray<int>>;
-template class btas::Tensor<long, TiledArray::Range,
-                            TiledArray::cuda_um_btas_varray<long>>;
-
-template class TiledArray::Tile<btas::Tensor<
-    double, TiledArray::Range, TiledArray::cuda_um_btas_varray<double>>>;
-template class TiledArray::Tile<btas::Tensor<
-    float, TiledArray::Range, TiledArray::cuda_um_btas_varray<float>>>;
-template class TiledArray::Tile<
-    btas::Tensor<std::complex<double>, TiledArray::Range,
-                 TiledArray::cuda_um_btas_varray<std::complex<double>>>>;
-template class TiledArray::Tile<
-    btas::Tensor<std::complex<float>, TiledArray::Range,
-                 TiledArray::cuda_um_btas_varray<std::complex<float>>>>;
-template class TiledArray::Tile<
-    btas::Tensor<int, TiledArray::Range, TiledArray::cuda_um_btas_varray<int>>>;
-template class TiledArray::Tile<btas::Tensor<
-    long, TiledArray::Range, TiledArray::cuda_um_btas_varray<long>>>;
-
-#endif  // TILEDARRAY_HAS_CUDA
diff --git a/src/TiledArray/cuda/allocators.h b/src/TiledArray/device/allocators.h
similarity index 63%
rename from src/TiledArray/cuda/allocators.h
rename to src/TiledArray/device/allocators.h
index 72c5ae3b0e..ff3ed6a3ac 100644
--- a/src/TiledArray/cuda/allocators.h
+++ b/src/TiledArray/device/allocators.h
@@ -26,9 +26,9 @@
 
 #include <TiledArray/config.h>
 
-#ifdef TILEDARRAY_HAS_CUDA
+#ifdef TILEDARRAY_HAS_DEVICE
 
-#include <TiledArray/external/cuda.h>
+#include <TiledArray/external/device.h>
 #include <TiledArray/external/umpire.h>
 
 #include <madness/world/archive.h>
@@ -39,37 +39,40 @@
 namespace TiledArray {
 
 template <class T, class StaticLock, typename UmpireAllocatorAccessor>
-class cuda_allocator_impl : public umpire_allocator_impl<T, StaticLock> {
+class umpire_based_allocator
+    : public umpire_based_allocator_impl<T, StaticLock> {
  public:
-  using base_type = umpire_allocator_impl<T, StaticLock>;
+  using base_type = umpire_based_allocator_impl<T, StaticLock>;
   using typename base_type::const_pointer;
   using typename base_type::const_reference;
   using typename base_type::pointer;
   using typename base_type::reference;
   using typename base_type::value_type;
 
-  cuda_allocator_impl() noexcept : base_type(&UmpireAllocatorAccessor{}()) {}
+  umpire_based_allocator() noexcept : base_type(&UmpireAllocatorAccessor{}()) {}
 
   template <class U>
-  cuda_allocator_impl(
-      const cuda_allocator_impl<U, StaticLock, UmpireAllocatorAccessor>&
+  umpire_based_allocator(
+      const umpire_based_allocator<U, StaticLock, UmpireAllocatorAccessor>&
           rhs) noexcept
       : base_type(
-            static_cast<const umpire_allocator_impl<U, StaticLock>&>(rhs)) {}
+            static_cast<const umpire_based_allocator_impl<U, StaticLock>&>(
+                rhs)) {}
 
   template <typename T1, typename T2, class StaticLock_,
             typename UmpireAllocatorAccessor_>
   friend bool operator==(
-      const cuda_allocator_impl<T1, StaticLock_, UmpireAllocatorAccessor_>& lhs,
-      const cuda_allocator_impl<T2, StaticLock_, UmpireAllocatorAccessor_>&
+      const umpire_based_allocator<T1, StaticLock_, UmpireAllocatorAccessor_>&
+          lhs,
+      const umpire_based_allocator<T2, StaticLock_, UmpireAllocatorAccessor_>&
           rhs) noexcept;
-};  // class cuda_allocator_impl
+};  // class umpire_based_allocator
 
 template <class T1, class T2, class StaticLock,
           typename UmpireAllocatorAccessor>
 bool operator==(
-    const cuda_allocator_impl<T1, StaticLock, UmpireAllocatorAccessor>& lhs,
-    const cuda_allocator_impl<T2, StaticLock, UmpireAllocatorAccessor>&
+    const umpire_based_allocator<T1, StaticLock, UmpireAllocatorAccessor>& lhs,
+    const umpire_based_allocator<T2, StaticLock, UmpireAllocatorAccessor>&
         rhs) noexcept {
   return lhs.umpire_allocator() == rhs.umpire_allocator();
 }
@@ -77,8 +80,8 @@ bool operator==(
 template <class T1, class T2, class StaticLock,
           typename UmpireAllocatorAccessor>
 bool operator!=(
-    const cuda_allocator_impl<T1, StaticLock, UmpireAllocatorAccessor>& lhs,
-    const cuda_allocator_impl<T2, StaticLock, UmpireAllocatorAccessor>&
+    const umpire_based_allocator<T1, StaticLock, UmpireAllocatorAccessor>& lhs,
+    const umpire_based_allocator<T2, StaticLock, UmpireAllocatorAccessor>&
         rhs) noexcept {
   return !(lhs == rhs);
 }
@@ -87,13 +90,13 @@ namespace detail {
 
 struct get_um_allocator {
   umpire::Allocator& operator()() {
-    return cudaEnv::instance()->um_allocator();
+    return deviceEnv::instance()->um_allocator();
   }
 };
 
 struct get_pinned_allocator {
   umpire::Allocator& operator()() {
-    return cudaEnv::instance()->pinned_allocator();
+    return deviceEnv::instance()->pinned_allocator();
   }
 };
 
@@ -106,30 +109,30 @@ namespace archive {
 
 template <class Archive, class T, class StaticLock,
           typename UmpireAllocatorAccessor>
-struct ArchiveLoadImpl<Archive, TiledArray::cuda_allocator_impl<
+struct ArchiveLoadImpl<Archive, TiledArray::umpire_based_allocator<
                                     T, StaticLock, UmpireAllocatorAccessor>> {
   static inline void load(
       const Archive& ar,
-      TiledArray::cuda_allocator_impl<T, StaticLock, UmpireAllocatorAccessor>&
-          allocator) {
-    allocator = TiledArray::cuda_allocator_impl<T, StaticLock,
-                                                UmpireAllocatorAccessor>{};
+      TiledArray::umpire_based_allocator<T, StaticLock,
+                                         UmpireAllocatorAccessor>& allocator) {
+    allocator = TiledArray::umpire_based_allocator<T, StaticLock,
+                                                   UmpireAllocatorAccessor>{};
   }
 };
 
 template <class Archive, class T, class StaticLock,
           typename UmpireAllocatorAccessor>
-struct ArchiveStoreImpl<Archive, TiledArray::cuda_allocator_impl<
+struct ArchiveStoreImpl<Archive, TiledArray::umpire_based_allocator<
                                      T, StaticLock, UmpireAllocatorAccessor>> {
   static inline void store(
       const Archive& ar,
-      const TiledArray::cuda_allocator_impl<
+      const TiledArray::umpire_based_allocator<
           T, StaticLock, UmpireAllocatorAccessor>& allocator) {}
 };
 
 }  // namespace archive
 }  // namespace madness
 
-#endif  // TILEDARRAY_HAS_CUDA
+#endif  // TILEDARRAY_HAS_DEVICE
 
 #endif  // TILEDARRAY_CUDA_ALLOCATORS_H___INCLUDED
diff --git a/src/TiledArray/cuda/btas_cublas.h b/src/TiledArray/device/btas_cublas.h
similarity index 66%
rename from src/TiledArray/cuda/btas_cublas.h
rename to src/TiledArray/device/btas_cublas.h
index ea073d0a78..9ac97ce649 100644
--- a/src/TiledArray/cuda/btas_cublas.h
+++ b/src/TiledArray/device/btas_cublas.h
@@ -24,7 +24,7 @@
 #ifndef TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED
 #define TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED
 
-#include <TiledArray/cuda/cublas.h>
+#include <TiledArray/device/cublas.h>
 #include <TiledArray/math/blas.h>
 
 #ifdef TILEDARRAY_HAS_CUDA
@@ -32,16 +32,16 @@
 #include <TiledArray/external/cuda.h>
 #include <btas/tensor.h>
 
-#include <TiledArray/cuda/kernel/mult_kernel.h>
-#include <TiledArray/cuda/kernel/reduce_kernel.h>
-#include <TiledArray/cuda/platform.h>
-#include <TiledArray/cuda/um_storage.h>
+#include <TiledArray/device/kernel/mult_kernel.h>
+#include <TiledArray/device/kernel/reduce_kernel.h>
+#include <TiledArray/device/platform.h>
+#include <TiledArray/device/um_storage.h>
 #include <TiledArray/math/gemm_helper.h>
 
 namespace TiledArray {
 
 template <typename T, typename Scalar, typename Range, typename Storage,
-    typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 btas::Tensor<T, Range, Storage> btas_tensor_gemm_cuda_impl(
     const btas::Tensor<T, Range, Storage> &left,
     const btas::Tensor<T, Range, Storage> &right, Scalar factor,
@@ -78,44 +78,44 @@ btas::Tensor<T, Range, Storage> btas_tensor_gemm_cuda_impl(
   T factor_t = T(factor);
   T zero(0);
 
-  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
+  DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id()));
 
   //  typedef typename Tensor::storage_type storage_type;
   auto result_range =
       gemm_helper.make_result_range<Range>(left.range(), right.range());
 
-  auto &cuda_stream = detail::get_stream_based_on_range(result_range);
+  auto &stream = detail::get_stream_based_on_range(result_range);
 
   // the result Tensor type
   typedef btas::Tensor<T, Range, Storage> Tensor;
   Tensor result;
 
   // check if stream is busy
-  //  auto stream_status = cudaStreamQuery(cuda_stream);
+  //  auto stream_status = cudaStreamQuery(stream);
 
   // if stream is completed, use GPU
   //  if (stream_status == cudaSuccess) {
   if (true) {
     Storage result_storage;
-    make_device_storage(result_storage, result_range.area(), cuda_stream);
+    make_device_storage(result_storage, result_range.area(), stream);
     result = Tensor(std::move(result_range), std::move(result_storage));
 
     // left and right are readonly!!
     //    cudaMemAdvise(device_data(left), left.size() * sizeof(T),
     //                  cudaMemAdviseSetReadMostly,
-    //                  cudaEnv::instance()->current_cuda_device_id());
+    //                  deviceEnv::instance()->current_device_id());
     //    cudaMemAdvise(device_data(right), right.size() * sizeof(T),
     //                  cudaMemAdviseSetReadMostly,
-    //                  cudaEnv::instance()->current_cuda_device_id());
+    //                  deviceEnv::instance()->current_device_id());
 
     // prefetch data
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
-        left.storage(), cuda_stream);
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
-        right.storage(), cuda_stream);
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(
+        left.storage(), stream);
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(
+        right.storage(), stream);
 
     const auto &handle = cuBLASHandlePool::handle();
-    CublasSafeCall(cublasSetStream(handle, cuda_stream));
+    CublasSafeCall(cublasSetStream(handle, stream));
 
     CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.right_op()),
                               to_cublas_op(gemm_helper.left_op()), n, m, k,
@@ -124,40 +124,41 @@ btas::Tensor<T, Range, Storage> btas_tensor_gemm_cuda_impl(
                               device_data(result.storage()), n));
 
     // wait for cuda calls to finish
-    //    detail::thread_wait_cuda_stream(cuda_stream);
-    synchronize_stream(&cuda_stream);
+    //    detail::thread_wait_stream(stream);
+    device::synchronize_stream(&stream);
   }
   // otherwise, use CPU
   else {
     Storage result_storage(result_range.area());
     result = Tensor(std::move(result_range), std::move(result_storage));
 
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
-        result.storage(), cuda_stream);
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
+        result.storage(), stream);
 
     // left and right are readonly!!
     cudaMemAdvise(device_data(left), left.size() * sizeof(T),
                   cudaMemAdviseSetReadMostly,
-                  cudaEnv::instance()->current_cuda_device_id());
+                  deviceEnv::instance()->current_device_id());
     cudaMemAdvise(device_data(right), right.size() * sizeof(T),
                   cudaMemAdviseSetReadMostly,
-                  cudaEnv::instance()->current_cuda_device_id());
+                  deviceEnv::instance()->current_device_id());
 
     // prefetch data
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
-        left.storage(), cuda_stream);
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
-        right.storage(), cuda_stream);
-
-    TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n,
-                           k, factor_t, left.data(), lda, right.data(), ldb,
-                           zero, result.data(), n);
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
+        left.storage(), stream);
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
+        right.storage(), stream);
+
+    TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(),
+                                 m, n, k, factor_t, left.data(), lda,
+                                 right.data(), ldb, zero, result.data(), n);
   }
 
   return result;
 }
 
-template <typename T, typename Scalar, typename Range, typename Storage, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+template <typename T, typename Scalar, typename Range, typename Storage,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 void btas_tensor_gemm_cuda_impl(
     btas::Tensor<T, Range, Storage> &result,
     const btas::Tensor<T, Range, Storage> &left,
@@ -224,13 +225,13 @@ void btas_tensor_gemm_cuda_impl(
   const integer ldb =
       (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k);
 
-  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
-  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
+  DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id()));
+  auto &stream = detail::get_stream_based_on_range(result.range());
 
   T factor_t = T(factor);
   T one(1);
   // check if stream is busy
-  //  auto stream_status = cudaStreamQuery(cuda_stream);
+  //  auto stream_status = cudaStreamQuery(stream);
 
   // if stream is completed, use GPU
   //  if (stream_status == cudaSuccess) {
@@ -238,50 +239,50 @@ void btas_tensor_gemm_cuda_impl(
     // left and right are readonly!!
     //    cudaMemAdvise(device_data(left), left.size() * sizeof(T),
     //                  cudaMemAdviseSetReadMostly,
-    //                  cudaEnv::instance()->current_cuda_device_id());
+    //                  deviceEnv::instance()->current_device_id());
     //    cudaMemAdvise(device_data(right), right.size() * sizeof(T),
     //                  cudaMemAdviseSetReadMostly,
-    //                  cudaEnv::instance()->current_cuda_device_id());
+    //                  deviceEnv::instance()->current_device_id());
 
     // prefetch all data
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
-        left.storage(), cuda_stream);
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
-        right.storage(), cuda_stream);
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
-        result.storage(), cuda_stream);
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(
+        left.storage(), stream);
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(
+        right.storage(), stream);
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(
+        result.storage(), stream);
 
     const auto &handle = cuBLASHandlePool::handle();
-    CublasSafeCall(cublasSetStream(handle, cuda_stream));
+    CublasSafeCall(cublasSetStream(handle, stream));
     CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.right_op()),
                               to_cublas_op(gemm_helper.left_op()), n, m, k,
                               &factor_t, device_data(right.storage()), ldb,
                               device_data(left.storage()), lda, &one,
                               device_data(result.storage()), n));
-    synchronize_stream(&cuda_stream);
+    device::synchronize_stream(&stream);
 
-    //    detail::thread_wait_cuda_stream(cuda_stream);
+    //    detail::thread_wait_stream(stream);
 
   } else {
     // left and right are readonly!!
     cudaMemAdvise(device_data(left), left.size() * sizeof(T),
                   cudaMemAdviseSetReadMostly,
-                  cudaEnv::instance()->current_cuda_device_id());
+                  deviceEnv::instance()->current_device_id());
     cudaMemAdvise(device_data(right), right.size() * sizeof(T),
                   cudaMemAdviseSetReadMostly,
-                  cudaEnv::instance()->current_cuda_device_id());
+                  deviceEnv::instance()->current_device_id());
 
     // prefetch data
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
-        left.storage(), cuda_stream);
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
-        right.storage(), cuda_stream);
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
-        result.storage(), cuda_stream);
-
-    TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n,
-                           k, factor_t, left.data(), lda, right.data(), ldb,
-                           one, result.data(), n);
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
+        left.storage(), stream);
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
+        right.storage(), stream);
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
+        result.storage(), stream);
+
+    TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(),
+                                 m, n, k, factor_t, left.data(), lda,
+                                 right.data(), ldb, one, result.data(), n);
   }
 }
 
@@ -289,69 +290,69 @@ void btas_tensor_gemm_cuda_impl(
 template <typename T, typename Range, typename Storage>
 btas::Tensor<T, Range, Storage> btas_tensor_clone_cuda_impl(
     const btas::Tensor<T, Range, Storage> &arg) {
-  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
+  DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id()));
 
   Storage result_storage;
   auto result_range = arg.range();
-  auto &cuda_stream = detail::get_stream_based_on_range(result_range);
+  auto &stream = detail::get_stream_based_on_range(result_range);
 
-  make_device_storage(result_storage, arg.size(), cuda_stream);
+  make_device_storage(result_storage, arg.size(), stream);
   btas::Tensor<T, Range, Storage> result(std::move(result_range),
                                          std::move(result_storage));
 
   // call cublasCopy
   const auto &handle = cuBLASHandlePool::handle();
-  CublasSafeCall(cublasSetStream(handle, cuda_stream));
+  CublasSafeCall(cublasSetStream(handle, stream));
 
   CublasSafeCall(cublasCopy(handle, result.size(), device_data(arg.storage()),
                             1, device_data(result.storage()), 1));
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
   return result;
 }
 
 /// result[i] = a * arg[i]
 template <typename T, typename Range, typename Storage, typename Scalar,
-    typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 btas::Tensor<T, Range, Storage> btas_tensor_scale_cuda_impl(
     const btas::Tensor<T, Range, Storage> &arg, const Scalar a) {
-  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
-  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
+  DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id()));
+  auto &stream = detail::get_stream_based_on_range(arg.range());
   //  std::cout << "scale, tile offset: " << arg.range().offset() << " stream: "
-  //  << arg.range().offset() % cudaEnv::instance()->num_cuda_streams() << "\n";
+  //  << arg.range().offset() % deviceEnv::instance()->num_streams() << "\n";
 
   auto result = btas_tensor_clone_cuda_impl(arg);
 
   // call cublasScale
   const auto &handle = cuBLASHandlePool::handle();
-  CublasSafeCall(cublasSetStream(handle, cuda_stream));
+  CublasSafeCall(cublasSetStream(handle, stream));
   CublasSafeCall(
       cublasScal(handle, result.size(), &a, device_data(result.storage()), 1));
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
 
   return result;
 }
 
 /// result[i] *= a
 template <typename T, typename Range, typename Storage, typename Scalar,
-    typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 void btas_tensor_scale_to_cuda_impl(btas::Tensor<T, Range, Storage> &result,
                                     const Scalar a) {
-  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
-  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
+  DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id()));
+  auto &stream = detail::get_stream_based_on_range(result.range());
   // call cublasScale
   const auto &handle = cuBLASHandlePool::handle();
-  CublasSafeCall(cublasSetStream(handle, cuda_stream));
+  CublasSafeCall(cublasSetStream(handle, stream));
   CublasSafeCall(
       cublasScal(handle, result.size(), &a, device_data(result.storage()), 1));
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
 }
 
 /// result[i] = arg1[i] - a * arg2[i]
 template <typename T, typename Scalar, typename Range, typename Storage,
-    typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 btas::Tensor<T, Range, Storage> btas_tensor_subt_cuda_impl(
     const btas::Tensor<T, Range, Storage> &arg1,
     const btas::Tensor<T, Range, Storage> &arg2, const Scalar a) {
@@ -360,12 +361,12 @@ btas::Tensor<T, Range, Storage> btas_tensor_subt_cuda_impl(
   // revert the sign of a
   auto b = -a;
 
-  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
-  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
+  DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id()));
+  auto &stream = detail::get_stream_based_on_range(result.range());
 
-  if (in_memory_space<MemorySpace::CUDA>(result.storage())) {
+  if (in_memory_space<MemorySpace::Device>(result.storage())) {
     const auto &handle = cuBLASHandlePool::handle();
-    CublasSafeCall(cublasSetStream(handle, cuda_stream));
+    CublasSafeCall(cublasSetStream(handle, stream));
     CublasSafeCall(cublasAxpy(handle, result.size(), &b,
                               device_data(arg2.storage()), 1,
                               device_data(result.storage()), 1));
@@ -374,82 +375,85 @@ btas::Tensor<T, Range, Storage> btas_tensor_subt_cuda_impl(
     //    btas::axpy(1.0, arg, result);
   }
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
   return result;
 }
 
 /// result[i] -= a * arg1[i]
-template <typename T, typename Scalar, typename Range, typename Storage, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+template <typename T, typename Scalar, typename Range, typename Storage,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 void btas_tensor_subt_to_cuda_impl(btas::Tensor<T, Range, Storage> &result,
                                    const btas::Tensor<T, Range, Storage> &arg1,
                                    const Scalar a) {
-  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
-  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
+  DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id()));
+  auto &stream = detail::get_stream_based_on_range(result.range());
 
   // revert the sign of a
   auto b = -a;
 
   const auto &handle = cuBLASHandlePool::handle();
-  CublasSafeCall(cublasSetStream(handle, cuda_stream));
+  CublasSafeCall(cublasSetStream(handle, stream));
   CublasSafeCall(cublasAxpy(handle, result.size(), &b,
                             device_data(arg1.storage()), 1,
                             device_data(result.storage()), 1));
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
 }
 
 /// result[i] = arg1[i] + a * arg2[i]
-template <typename T, typename Scalar, typename Range, typename Storage, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+template <typename T, typename Scalar, typename Range, typename Storage,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 btas::Tensor<T, Range, Storage> btas_tensor_add_cuda_impl(
     const btas::Tensor<T, Range, Storage> &arg1,
     const btas::Tensor<T, Range, Storage> &arg2, const Scalar a) {
   auto result = btas_tensor_clone_cuda_impl(arg1);
 
-  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
-  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
+  DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id()));
+  auto &stream = detail::get_stream_based_on_range(result.range());
 
   const auto &handle = cuBLASHandlePool::handle();
-  CublasSafeCall(cublasSetStream(handle, cuda_stream));
+  CublasSafeCall(cublasSetStream(handle, stream));
   CublasSafeCall(cublasAxpy(handle, result.size(), &a,
                             device_data(arg2.storage()), 1,
                             device_data(result.storage()), 1));
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
   return result;
 }
 
 /// result[i] += a * arg[i]
-template <typename T, typename Scalar, typename Range, typename Storage, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+template <typename T, typename Scalar, typename Range, typename Storage,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 void btas_tensor_add_to_cuda_impl(btas::Tensor<T, Range, Storage> &result,
                                   const btas::Tensor<T, Range, Storage> &arg,
                                   const Scalar a) {
-  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
-  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
+  DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id()));
+  auto &stream = detail::get_stream_based_on_range(result.range());
 
-  //   TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(result.storage(),cuda_stream);
-  //   TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(arg.storage(),cuda_stream);
+  //   TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(result.storage(),stream);
+  //   TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(arg.storage(),stream);
 
   const auto &handle = cuBLASHandlePool::handle();
-  CublasSafeCall(cublasSetStream(handle, cuda_stream));
+  CublasSafeCall(cublasSetStream(handle, stream));
   CublasSafeCall(cublasAxpy(handle, result.size(), &a,
                             device_data(arg.storage()), 1,
                             device_data(result.storage()), 1));
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
 }
 
 /// result[i] = result[i] * arg[i]
 template <typename T, typename Range, typename Storage>
 void btas_tensor_mult_to_cuda_impl(btas::Tensor<T, Range, Storage> &result,
                                    const btas::Tensor<T, Range, Storage> &arg) {
-  auto device_id = cudaEnv::instance()->current_cuda_device_id();
-  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
+  auto device_id = deviceEnv::instance()->current_device_id();
+  auto &stream = detail::get_stream_based_on_range(result.range());
 
   std::size_t n = result.size();
 
   TA_ASSERT(n == arg.size());
 
-  mult_to_cuda_kernel(result.data(), arg.data(), n, cuda_stream, device_id);
-  synchronize_stream(&cuda_stream);
+  mult_to_cuda_kernel(result.data(), arg.data(), n, stream, device_id);
+  device::synchronize_stream(&stream);
 }
 
 /// result[i] = arg1[i] * arg2[i]
@@ -461,19 +465,19 @@ btas::Tensor<T, Range, Storage> btas_tensor_mult_cuda_impl(
 
   TA_ASSERT(arg2.size() == n);
 
-  auto device_id = cudaEnv::instance()->current_cuda_device_id();
-  CudaSafeCall(cudaSetDevice(device_id));
-  auto &cuda_stream = detail::get_stream_based_on_range(arg1.range());
+  auto device_id = deviceEnv::instance()->current_device_id();
+  DeviceSafeCall(device::setDevice(device_id));
+  auto &stream = detail::get_stream_based_on_range(arg1.range());
 
   Storage result_storage;
-  make_device_storage(result_storage, n, cuda_stream);
+  make_device_storage(result_storage, n, stream);
   btas::Tensor<T, Range, Storage> result(arg1.range(),
                                          std::move(result_storage));
 
-  mult_cuda_kernel(result.data(), arg1.data(), arg2.data(), n, cuda_stream,
+  mult_cuda_kernel(result.data(), arg1.data(), arg2.data(), n, stream,
                    device_id);
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
   return result;
 }
 
@@ -481,24 +485,24 @@ btas::Tensor<T, Range, Storage> btas_tensor_mult_cuda_impl(
 template <typename T, typename Range, typename Storage>
 typename btas::Tensor<T, Range, Storage>::value_type
 btas_tensor_squared_norm_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
-  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
+  DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id()));
 
-  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
+  auto &stream = detail::get_stream_based_on_range(arg.range());
 
   auto &storage = arg.storage();
   using TiledArray::math::blas::integer;
   integer size = storage.size();
   T result = 0;
-  if (in_memory_space<MemorySpace::CUDA>(storage)) {
+  if (in_memory_space<MemorySpace::Device>(storage)) {
     const auto &handle = cuBLASHandlePool::handle();
-    CublasSafeCall(cublasSetStream(handle, cuda_stream));
+    CublasSafeCall(cublasSetStream(handle, stream));
     CublasSafeCall(cublasDot(handle, size, device_data(storage), 1,
                              device_data(storage), 1, &result));
   } else {
     TA_ASSERT(false);
     //    result = TiledArray::math::dot(size, storage.data(), storage.data());
   }
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
   return result;
 }
 
@@ -507,9 +511,9 @@ template <typename T, typename Range, typename Storage>
 typename btas::Tensor<T, Range, Storage>::value_type btas_tensor_dot_cuda_impl(
     const btas::Tensor<T, Range, Storage> &arg1,
     const btas::Tensor<T, Range, Storage> &arg2) {
-  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
+  DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id()));
 
-  auto &cuda_stream = detail::get_stream_based_on_range(arg1.range());
+  auto &stream = detail::get_stream_based_on_range(arg1.range());
 
   using TiledArray::math::blas::integer;
   integer size = arg1.storage().size();
@@ -517,101 +521,101 @@ typename btas::Tensor<T, Range, Storage>::value_type btas_tensor_dot_cuda_impl(
   TA_ASSERT(size == arg2.storage().size());
 
   T result = 0;
-  if (in_memory_space<MemorySpace::CUDA>(arg1.storage()) &&
-      in_memory_space<MemorySpace::CUDA>(arg2.storage())) {
+  if (in_memory_space<MemorySpace::Device>(arg1.storage()) &&
+      in_memory_space<MemorySpace::Device>(arg2.storage())) {
     const auto &handle = cuBLASHandlePool::handle();
-    CublasSafeCall(cublasSetStream(handle, cuda_stream));
+    CublasSafeCall(cublasSetStream(handle, stream));
     CublasSafeCall(cublasDot(handle, size, device_data(arg1.storage()), 1,
                              device_data(arg2.storage()), 1, &result));
   } else {
     TA_ASSERT(false);
     //    result = TiledArray::math::dot(size, storage.data(), storage.data());
   }
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
   return result;
 }
 
 template <typename T, typename Range, typename Storage>
 T btas_tensor_sum_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
-  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
-  auto device_id = cudaEnv::instance()->current_cuda_device_id();
+  auto &stream = detail::get_stream_based_on_range(arg.range());
+  auto device_id = deviceEnv::instance()->current_device_id();
 
   auto &storage = arg.storage();
   auto n = storage.size();
 
-  auto result = sum_cuda_kernel(arg.data(), n, cuda_stream, device_id);
+  auto result = sum_cuda_kernel(arg.data(), n, stream, device_id);
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
   return result;
 }
 
 template <typename T, typename Range, typename Storage>
 T btas_tensor_product_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
-  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
-  auto device_id = cudaEnv::instance()->current_cuda_device_id();
+  auto &stream = detail::get_stream_based_on_range(arg.range());
+  auto device_id = deviceEnv::instance()->current_device_id();
 
   auto &storage = arg.storage();
   auto n = storage.size();
 
-  auto result = product_cuda_kernel(arg.data(), n, cuda_stream, device_id);
+  auto result = product_cuda_kernel(arg.data(), n, stream, device_id);
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
   return result;
 }
 
 template <typename T, typename Range, typename Storage>
 T btas_tensor_min_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
-  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
-  auto device_id = cudaEnv::instance()->current_cuda_device_id();
+  auto &stream = detail::get_stream_based_on_range(arg.range());
+  auto device_id = deviceEnv::instance()->current_device_id();
 
   auto &storage = arg.storage();
   auto n = storage.size();
 
-  auto result = min_cuda_kernel(arg.data(), n, cuda_stream, device_id);
+  auto result = min_cuda_kernel(arg.data(), n, stream, device_id);
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
   return result;
 }
 
 template <typename T, typename Range, typename Storage>
 T btas_tensor_max_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
-  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
-  auto device_id = cudaEnv::instance()->current_cuda_device_id();
+  auto &stream = detail::get_stream_based_on_range(arg.range());
+  auto device_id = deviceEnv::instance()->current_device_id();
 
   auto &storage = arg.storage();
   auto n = storage.size();
 
-  auto result = max_cuda_kernel(arg.data(), n, cuda_stream, device_id);
+  auto result = max_cuda_kernel(arg.data(), n, stream, device_id);
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
   return result;
 }
 
 template <typename T, typename Range, typename Storage>
 T btas_tensor_absmin_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
-  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
-  auto device_id = cudaEnv::instance()->current_cuda_device_id();
+  auto &stream = detail::get_stream_based_on_range(arg.range());
+  auto device_id = deviceEnv::instance()->current_device_id();
 
   auto &storage = arg.storage();
   auto n = storage.size();
 
-  auto result = absmin_cuda_kernel(arg.data(), n, cuda_stream, device_id);
+  auto result = absmin_cuda_kernel(arg.data(), n, stream, device_id);
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
   return result;
 }
 
 template <typename T, typename Range, typename Storage>
 T btas_tensor_absmax_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
-  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
-  auto device_id = cudaEnv::instance()->current_cuda_device_id();
+  auto &stream = detail::get_stream_based_on_range(arg.range());
+  auto device_id = deviceEnv::instance()->current_device_id();
 
   auto &storage = arg.storage();
   auto n = storage.size();
 
-  auto result = absmax_cuda_kernel(arg.data(), n, cuda_stream, device_id);
+  auto result = absmax_cuda_kernel(arg.data(), n, stream, device_id);
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
   return result;
 }
 
diff --git a/src/TiledArray/device/btas_um_tensor.cpp b/src/TiledArray/device/btas_um_tensor.cpp
new file mode 100644
index 0000000000..270f30aad4
--- /dev/null
+++ b/src/TiledArray/device/btas_um_tensor.cpp
@@ -0,0 +1,52 @@
+//
+// Created by Chong Peng on 7/24/18.
+//
+
+// clang-format off
+#include <btas/array_adaptor.h>  // provides c++17 features (stds::data, std::size) when compiling CUDA (i.e. c++14)
+#include <TiledArray/device/btas_um_tensor.h>
+// clang-format on
+
+#ifdef TILEDARRAY_HAS_CUDA
+
+template class btas::varray<double, TiledArray::device_um_allocator<double>>;
+template class btas::varray<float, TiledArray::device_um_allocator<float>>;
+template class btas::varray<
+    std::complex<double>,
+    TiledArray::device_um_allocator<std::complex<double>>>;
+template class btas::varray<
+    std::complex<float>, TiledArray::device_um_allocator<std::complex<float>>>;
+template class btas::varray<int, TiledArray::device_um_allocator<int>>;
+template class btas::varray<long, TiledArray::device_um_allocator<long>>;
+
+template class btas::Tensor<double, TiledArray::Range,
+                            TiledArray::device_um_btas_varray<double>>;
+template class btas::Tensor<float, TiledArray::Range,
+                            TiledArray::device_um_btas_varray<float>>;
+template class btas::Tensor<
+    std::complex<double>, TiledArray::Range,
+    TiledArray::device_um_btas_varray<std::complex<double>>>;
+template class btas::Tensor<
+    std::complex<float>, TiledArray::Range,
+    TiledArray::device_um_btas_varray<std::complex<float>>>;
+template class btas::Tensor<int, TiledArray::Range,
+                            TiledArray::device_um_btas_varray<int>>;
+template class btas::Tensor<long, TiledArray::Range,
+                            TiledArray::device_um_btas_varray<long>>;
+
+template class TiledArray::Tile<btas::Tensor<
+    double, TiledArray::Range, TiledArray::device_um_btas_varray<double>>>;
+template class TiledArray::Tile<btas::Tensor<
+    float, TiledArray::Range, TiledArray::device_um_btas_varray<float>>>;
+template class TiledArray::Tile<
+    btas::Tensor<std::complex<double>, TiledArray::Range,
+                 TiledArray::device_um_btas_varray<std::complex<double>>>>;
+template class TiledArray::Tile<
+    btas::Tensor<std::complex<float>, TiledArray::Range,
+                 TiledArray::device_um_btas_varray<std::complex<float>>>>;
+template class TiledArray::Tile<btas::Tensor<
+    int, TiledArray::Range, TiledArray::device_um_btas_varray<int>>>;
+template class TiledArray::Tile<btas::Tensor<
+    long, TiledArray::Range, TiledArray::device_um_btas_varray<long>>>;
+
+#endif  // TILEDARRAY_HAS_CUDA
diff --git a/src/TiledArray/cuda/btas_um_tensor.h b/src/TiledArray/device/btas_um_tensor.h
similarity index 81%
rename from src/TiledArray/cuda/btas_um_tensor.h
rename to src/TiledArray/device/btas_um_tensor.h
index 6342c54771..0c448a24b2 100644
--- a/src/TiledArray/cuda/btas_um_tensor.h
+++ b/src/TiledArray/device/btas_um_tensor.h
@@ -27,11 +27,12 @@
 #include <tiledarray_fwd.h>
 
 #include <TiledArray/external/btas.h>
+#include <TiledArray/external/device.h>
 
-#ifdef TILEDARRAY_HAS_CUDA
+#ifdef TILEDARRAY_HAS_DEVICE
 
-#include <TiledArray/cuda/btas_cublas.h>
-#include <TiledArray/cuda/um_storage.h>
+#include <TiledArray/device/btas_cublas.h>
+#include <TiledArray/device/um_storage.h>
 #include <TiledArray/external/librett.h>
 #include <TiledArray/tile.h>
 
@@ -39,15 +40,15 @@ namespace TiledArray {
 
 namespace detail {
 template <typename T, typename Range>
-struct is_cuda_tile<
-    ::btas::Tensor<T, Range, TiledArray::cuda_um_btas_varray<T>>>
+struct is_device_tile<
+    ::btas::Tensor<T, Range, TiledArray::device_um_btas_varray<T>>>
     : public std::true_type {};
 
 template <typename T>
-void to_cuda(const TiledArray::btasUMTensorVarray<T> &tile) {
-  cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id());
+void to_device(const TiledArray::btasUMTensorVarray<T> &tile) {
+  device::setDevice(TiledArray::deviceEnv::instance()->current_device_id());
   auto &stream = TiledArray::detail::get_stream_based_on_range(tile.range());
-  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
+  TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(
       tile.storage(), stream);
 }
 
@@ -64,12 +65,12 @@ struct ArchiveLoadImpl<Archive, TiledArray::btasUMTensorVarray<T>> {
   static inline void load(const Archive &ar,
                           TiledArray::btasUMTensorVarray<T> &t) {
     TiledArray::Range range{};
-    TiledArray::cuda_um_btas_varray<T> store{};
+    TiledArray::device_um_btas_varray<T> store{};
     ar &range &store;
     t = TiledArray::btasUMTensorVarray<T>(std::move(range), std::move(store));
-    // cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id());
+    // device::setDevice(TiledArray::deviceEnv::instance()->current_device_id());
     // auto &stream = TiledArray::detail::get_stream_based_on_range(range);
-    // TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(t.storage(),
+    // TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(t.storage(),
     // stream);
   }
 };
@@ -78,11 +79,11 @@ template <class Archive, typename T>
 struct ArchiveStoreImpl<Archive, TiledArray::btasUMTensorVarray<T>> {
   static inline void store(const Archive &ar,
                            const TiledArray::btasUMTensorVarray<T> &t) {
-    CudaSafeCall(cudaSetDevice(
-        TiledArray::cudaEnv::instance()->current_cuda_device_id()));
+    DeviceSafeCall(TiledArray::device::setDevice(
+        TiledArray::deviceEnv::instance()->current_device_id()));
     auto &stream = TiledArray::detail::get_stream_based_on_range(t.range());
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(t.storage(),
-                                                                    stream);
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
+        t.storage(), stream);
     ar &t.range() & t.storage();
   }
 };
@@ -135,25 +136,25 @@ btasUMTensorVarray<T, Range> shift(const btasUMTensorVarray<T, Range> &arg,
   // shift the range
   result_range.inplace_shift(range_shift);
 
-  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
+  DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id()));
 
   // @important select the stream using the shifted range
-  auto &cuda_stream = detail::get_stream_based_on_range(result_range);
+  auto &stream = detail::get_stream_based_on_range(result_range);
 
   typename btasUMTensorVarray<T, Range>::storage_type result_storage;
 
-  make_device_storage(result_storage, result_range.volume(), cuda_stream);
+  make_device_storage(result_storage, result_range.volume(), stream);
   btasUMTensorVarray<T, Range> result(std::move(result_range),
                                       std::move(result_storage));
 
   // call cublasCopy
   const auto &handle = cuBLASHandlePool::handle();
-  CublasSafeCall(cublasSetStream(handle, cuda_stream));
+  CublasSafeCall(cublasSetStream(handle, stream));
 
   CublasSafeCall(cublasCopy(handle, result.size(), device_data(arg.storage()),
                             1, device_data(result.storage()), 1));
 
-  synchronize_stream(&cuda_stream);
+  device::synchronize_stream(&stream);
   return result;
 }
 
@@ -176,7 +177,7 @@ btasUMTensorVarray<T, Range> permute(const btasUMTensorVarray<T, Range> &arg,
                                      const TiledArray::Permutation &perm) {
   // compute result range
   auto result_range = perm * arg.range();
-  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
+  DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id()));
 
   // compute the stream to use
   auto &stream = detail::get_stream_based_on_range(result_range);
@@ -192,7 +193,7 @@ btasUMTensorVarray<T, Range> permute(const btasUMTensorVarray<T, Range> &arg,
   librett_permute(const_cast<T *>(device_data(arg.storage())),
                   device_data(result.storage()), arg.range(), perm, stream);
 
-  synchronize_stream(&stream);
+  device::synchronize_stream(&stream);
 
   return result;
 }
@@ -205,7 +206,7 @@ template <typename T, typename Range, typename Scalar,
           typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 btasUMTensorVarray<T, Range> scale(const btasUMTensorVarray<T, Range> &arg,
                                    const Scalar factor) {
-  detail::to_cuda(arg);
+  detail::to_device(arg);
   return btas_tensor_scale_cuda_impl(arg, factor);
 }
 
@@ -213,7 +214,7 @@ template <typename T, typename Range, typename Scalar,
           typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 btasUMTensorVarray<T, Range> &scale_to(btasUMTensorVarray<T, Range> &arg,
                                        const Scalar factor) {
-  detail::to_cuda(arg);
+  detail::to_device(arg);
   btas_tensor_scale_to_cuda_impl(arg, factor);
   return arg;
 }
@@ -227,8 +228,8 @@ btasUMTensorVarray<T, Range> scale(const btasUMTensorVarray<T, Range> &arg,
   auto result = scale(arg, factor);
 
   // wait to finish before switch stream
-  auto stream = tls_cudastream_accessor();
-  cudaStreamSynchronize(*stream);
+  auto stream = device::tls_stream_accessor();
+  device::streamSynchronize(*stream);
 
   return permute(result, perm);
 }
@@ -239,7 +240,7 @@ btasUMTensorVarray<T, Range> scale(const btasUMTensorVarray<T, Range> &arg,
 
 template <typename T, typename Range>
 btasUMTensorVarray<T, Range> neg(const btasUMTensorVarray<T, Range> &arg) {
-  detail::to_cuda(arg);
+  detail::to_device(arg);
   return btas_tensor_scale_cuda_impl(arg, T(-1.0));
 }
 
@@ -251,15 +252,15 @@ btasUMTensorVarray<T, Range> neg(const btasUMTensorVarray<T, Range> &arg,
   auto result = neg(arg);
 
   // wait to finish before switch stream
-  auto stream = tls_cudastream_accessor();
-  cudaStreamSynchronize(*stream);
+  auto stream = device::tls_stream_accessor();
+  device::streamSynchronize(*stream);
 
   return permute(result, perm);
 }
 
 template <typename T, typename Range>
 btasUMTensorVarray<T, Range> &neg_to(btasUMTensorVarray<T, Range> &arg) {
-  detail::to_cuda(arg);
+  detail::to_device(arg);
   btas_tensor_scale_to_cuda_impl(arg, T(-1.0));
   return arg;
 }
@@ -271,8 +272,8 @@ btasUMTensorVarray<T, Range> &neg_to(btasUMTensorVarray<T, Range> &arg) {
 template <typename T, typename Range>
 btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
                                   const btasUMTensorVarray<T, Range> &arg2) {
-  detail::to_cuda(arg1);
-  detail::to_cuda(arg2);
+  detail::to_device(arg1);
+  detail::to_device(arg2);
   return btas_tensor_subt_cuda_impl(arg1, arg2, T(1.0));
 }
 
@@ -295,8 +296,8 @@ btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
   auto result = subt(arg1, arg2);
 
   // wait to finish before switch stream
-  auto stream = tls_cudastream_accessor();
-  cudaStreamSynchronize(*stream);
+  auto stream = device::tls_stream_accessor();
+  device::streamSynchronize(*stream);
 
   return permute(result, perm);
 }
@@ -311,8 +312,8 @@ btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
   auto result = subt(arg1, arg2, factor);
 
   // wait to finish before switch stream
-  auto stream = tls_cudastream_accessor();
-  cudaStreamSynchronize(*stream);
+  auto stream = device::tls_stream_accessor();
+  device::streamSynchronize(*stream);
 
   return permute(result, perm);
 }
@@ -325,8 +326,8 @@ template <typename T, typename Range>
 btasUMTensorVarray<T, Range> &subt_to(
     btasUMTensorVarray<T, Range> &result,
     const btasUMTensorVarray<T, Range> &arg1) {
-  detail::to_cuda(result);
-  detail::to_cuda(arg1);
+  detail::to_device(result);
+  detail::to_device(arg1);
   btas_tensor_subt_to_cuda_impl(result, arg1, T(1.0));
   return result;
 }
@@ -348,8 +349,8 @@ btasUMTensorVarray<T, Range> &subt_to(btasUMTensorVarray<T, Range> &result,
 template <typename T, typename Range>
 btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
                                  const btasUMTensorVarray<T, Range> &arg2) {
-  detail::to_cuda(arg1);
-  detail::to_cuda(arg2);
+  detail::to_device(arg1);
+  detail::to_device(arg2);
   return btas_tensor_add_cuda_impl(arg1, arg2, T(1.0));
 }
 
@@ -373,8 +374,8 @@ btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
   auto result = add(arg1, arg2, factor);
 
   // wait to finish before switch stream
-  auto stream = tls_cudastream_accessor();
-  cudaStreamSynchronize(*stream);
+  auto stream = device::tls_stream_accessor();
+  device::streamSynchronize(*stream);
 
   return permute(result, perm);
 }
@@ -388,8 +389,8 @@ btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
   auto result = add(arg1, arg2);
 
   // wait to finish before switch stream
-  auto stream = tls_cudastream_accessor();
-  cudaStreamSynchronize(*stream);
+  auto stream = device::tls_stream_accessor();
+  device::streamSynchronize(*stream);
 
   return permute(result, perm);
 }
@@ -401,8 +402,8 @@ btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
 template <typename T, typename Range>
 btasUMTensorVarray<T, Range> &add_to(btasUMTensorVarray<T, Range> &result,
                                      const btasUMTensorVarray<T, Range> &arg) {
-  detail::to_cuda(result);
-  detail::to_cuda(arg);
+  detail::to_device(result);
+  detail::to_device(arg);
   btas_tensor_add_to_cuda_impl(result, arg, T(1.0));
   return result;
 }
@@ -424,8 +425,8 @@ template <typename T, typename Range>
 typename btasUMTensorVarray<T, Range>::value_type dot(
     const btasUMTensorVarray<T, Range> &arg1,
     const btasUMTensorVarray<T, Range> &arg2) {
-  detail::to_cuda(arg1);
-  detail::to_cuda(arg2);
+  detail::to_device(arg1);
+  detail::to_device(arg2);
   return btas_tensor_dot_cuda_impl(arg1, arg2);
 }
 
@@ -435,8 +436,8 @@ typename btasUMTensorVarray<T, Range>::value_type dot(
 template <typename T, typename Range>
 btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
                                   const btasUMTensorVarray<T, Range> &arg2) {
-  detail::to_cuda(arg1);
-  detail::to_cuda(arg2);
+  detail::to_device(arg1);
+  detail::to_device(arg2);
   return btas_tensor_mult_cuda_impl(arg1, arg2);
 }
 
@@ -459,8 +460,8 @@ btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
   auto result = mult(arg1, arg2);
 
   // wait to finish before switch stream
-  auto stream = tls_cudastream_accessor();
-  cudaStreamSynchronize(*stream);
+  auto stream = device::tls_stream_accessor();
+  device::streamSynchronize(*stream);
 
   return permute(result, perm);
 }
@@ -475,8 +476,8 @@ btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
   auto result = mult(arg1, arg2, factor);
 
   // wait to finish before switch stream
-  auto stream = tls_cudastream_accessor();
-  cudaStreamSynchronize(*stream);
+  auto stream = device::tls_stream_accessor();
+  device::streamSynchronize(*stream);
 
   return permute(result, perm);
 }
@@ -487,8 +488,8 @@ btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
 template <typename T, typename Range>
 btasUMTensorVarray<T, Range> &mult_to(btasUMTensorVarray<T, Range> &result,
                                       const btasUMTensorVarray<T, Range> &arg) {
-  detail::to_cuda(result);
-  detail::to_cuda(arg);
+  detail::to_device(result);
+  detail::to_device(arg);
   btas_tensor_mult_to_cuda_impl(result, arg);
   return result;
 }
@@ -514,7 +515,7 @@ btasUMTensorVarray<T, Range> &mult_to(btasUMTensorVarray<T, Range> &result,
 template <typename T, typename Range>
 typename btasUMTensorVarray<T, Range>::value_type squared_norm(
     const btasUMTensorVarray<T, Range> &arg) {
-  detail::to_cuda(arg);
+  detail::to_device(arg);
   return btas_tensor_squared_norm_cuda_impl(arg);
 }
 
@@ -525,7 +526,7 @@ typename btasUMTensorVarray<T, Range>::value_type squared_norm(
 template <typename T, typename Range>
 typename btasUMTensorVarray<T, Range>::value_type norm(
     const btasUMTensorVarray<T, Range> &arg) {
-  detail::to_cuda(arg);
+  detail::to_device(arg);
   return std::sqrt(btas_tensor_squared_norm_cuda_impl(arg));
 }
 
@@ -544,7 +545,7 @@ typename btasUMTensorVarray<T, Range>::value_type trace(
 template <typename T, typename Range>
 typename btasUMTensorVarray<T, Range>::value_type sum(
     const btasUMTensorVarray<T, Range> &arg) {
-  detail::to_cuda(arg);
+  detail::to_device(arg);
   return btas_tensor_sum_cuda_impl(arg);
 }
 
@@ -554,7 +555,7 @@ typename btasUMTensorVarray<T, Range>::value_type sum(
 template <typename T, typename Range>
 typename btasUMTensorVarray<T, Range>::value_type product(
     const btasUMTensorVarray<T, Range> &arg) {
-  detail::to_cuda(arg);
+  detail::to_device(arg);
   return btas_tensor_product_cuda_impl(arg);
 }
 
@@ -564,7 +565,7 @@ typename btasUMTensorVarray<T, Range>::value_type product(
 template <typename T, typename Range>
 typename btasUMTensorVarray<T, Range>::value_type max(
     const btasUMTensorVarray<T, Range> &arg) {
-  detail::to_cuda(arg);
+  detail::to_device(arg);
   return btas_tensor_max_cuda_impl(arg);
 }
 
@@ -574,7 +575,7 @@ typename btasUMTensorVarray<T, Range>::value_type max(
 template <typename T, typename Range>
 typename btasUMTensorVarray<T, Range>::value_type abs_max(
     const btasUMTensorVarray<T, Range> &arg) {
-  detail::to_cuda(arg);
+  detail::to_device(arg);
   return btas_tensor_absmax_cuda_impl(arg);
 }
 
@@ -584,7 +585,7 @@ typename btasUMTensorVarray<T, Range>::value_type abs_max(
 template <typename T, typename Range>
 typename btasUMTensorVarray<T, Range>::value_type min(
     const btasUMTensorVarray<T, Range> &arg) {
-  detail::to_cuda(arg);
+  detail::to_device(arg);
   return btas_tensor_min_cuda_impl(arg);
 }
 
@@ -594,7 +595,7 @@ typename btasUMTensorVarray<T, Range>::value_type min(
 template <typename T, typename Range>
 typename btasUMTensorVarray<T, Range>::value_type abs_min(
     const btasUMTensorVarray<T, Range> &arg) {
-  detail::to_cuda(arg);
+  detail::to_device(arg);
   return btas_tensor_absmin_cuda_impl(arg);
 }
 
@@ -603,10 +604,11 @@ template <typename UMTensor, typename Policy>
 void to_host(
     TiledArray::DistArray<TiledArray::Tile<UMTensor>, Policy> &um_array) {
   auto to_host = [](TiledArray::Tile<UMTensor> &tile) {
-    CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
+    DeviceSafeCall(
+        device::setDevice(deviceEnv::instance()->current_device_id()));
     auto &stream = detail::get_stream_based_on_range(tile.range());
 
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
         tile.tensor().storage(), stream);
   };
 
@@ -622,7 +624,7 @@ void to_host(
   }
 
   world.gop.fence();
-  CudaSafeCall(cudaDeviceSynchronize());
+  DeviceSafeCall(cudaDeviceSynchronize());
 };
 
 /// to device for UM Array
@@ -630,10 +632,11 @@ template <typename UMTensor, typename Policy>
 void to_device(
     TiledArray::DistArray<TiledArray::Tile<UMTensor>, Policy> &um_array) {
   auto to_device = [](TiledArray::Tile<UMTensor> &tile) {
-    CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
+    DeviceSafeCall(
+        device::setDevice(deviceEnv::instance()->current_device_id()));
     auto &stream = detail::get_stream_based_on_range(tile.range());
 
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(
         tile.tensor().storage(), stream);
   };
 
@@ -649,7 +652,7 @@ void to_device(
   }
 
   world.gop.fence();
-  CudaSafeCall(cudaDeviceSynchronize());
+  DeviceSafeCall(device::deviceSynchronize());
 };
 
 /// convert array from UMTensor to TiledArray::Tensor
@@ -661,12 +664,12 @@ um_tensor_to_ta_tensor(
   const auto convert_tile_memcpy = [](const UMTensor &tile) {
     TATensor result(tile.tensor().range());
 
-    auto &stream = cudaEnv::instance()->cuda_stream_d2h();
-    CudaSafeCall(
+    auto &stream = deviceEnv::instance()->stream_d2h();
+    DeviceSafeCall(
         cudaMemcpyAsync(result.data(), tile.data(),
                         tile.size() * sizeof(typename TATensor::value_type),
                         cudaMemcpyDefault, stream));
-    synchronize_stream(&stream);
+    device::synchronize_stream(&stream);
 
     return result;
   };
@@ -676,10 +679,11 @@ um_tensor_to_ta_tensor(
     using std::begin;
     const auto n = tile.tensor().size();
 
-    CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
+    DeviceSafeCall(
+        device::setDevice(deviceEnv::instance()->current_device_id()));
     auto &stream = detail::get_stream_based_on_range(tile.range());
 
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
         tile.tensor().storage(), stream);
 
     std::copy_n(tile.data(), n, result.data());
@@ -714,28 +718,30 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray<TATensor, Policy> &array) {
   auto convert_tile_memcpy = [](const TATensor &tile) {
     /// UMTensor must be wrapped into TA::Tile
 
-    CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
+    DeviceSafeCall(
+        device::setDevice(deviceEnv::instance()->current_device_id()));
 
     using Tensor = typename UMTensor::tensor_type;
 
-    auto &stream = cudaEnv::instance()->cuda_stream_h2d();
+    auto &stream = deviceEnv::instance()->stream_h2d();
     typename Tensor::storage_type storage;
     make_device_storage(storage, tile.range().area(), stream);
     Tensor result(tile.range(), std::move(storage));
 
-    CudaSafeCall(
+    DeviceSafeCall(
         cudaMemcpyAsync(result.data(), tile.data(),
                         tile.size() * sizeof(typename Tensor::value_type),
                         cudaMemcpyDefault, stream));
 
-    synchronize_stream(&stream);
+    device::synchronize_stream(&stream);
     return TiledArray::Tile<Tensor>(std::move(result));
   };
 
   auto convert_tile_um = [](const TATensor &tile) {
     /// UMTensor must be wrapped into TA::Tile
 
-    CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
+    DeviceSafeCall(
+        device::setDevice(deviceEnv::instance()->current_device_id()));
 
     using Tensor = typename UMTensor::tensor_type;
     typename Tensor::storage_type storage(tile.range().area());
@@ -749,7 +755,7 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray<TATensor, Policy> &array) {
     auto &stream = detail::get_stream_based_on_range(result.range());
 
     // prefetch data to GPU
-    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
+    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(
         result.storage(), stream);
 
     return TiledArray::Tile<Tensor>(std::move(result));
@@ -778,47 +784,49 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray<UMTensor, Policy> &array) {
 #ifndef TILEDARRAY_HEADER_ONLY
 
 extern template class btas::varray<double,
-                                   TiledArray::cuda_um_allocator<double>>;
-extern template class btas::varray<float, TiledArray::cuda_um_allocator<float>>;
+                                   TiledArray::device_um_allocator<double>>;
+extern template class btas::varray<float,
+                                   TiledArray::device_um_allocator<float>>;
 extern template class btas::varray<
-    std::complex<double>, TiledArray::cuda_um_allocator<std::complex<double>>>;
+    std::complex<double>,
+    TiledArray::device_um_allocator<std::complex<double>>>;
 extern template class btas::varray<
-    std::complex<float>, TiledArray::cuda_um_allocator<std::complex<float>>>;
-extern template class btas::varray<int, TiledArray::cuda_um_allocator<int>>;
-extern template class btas::varray<long, TiledArray::cuda_um_allocator<long>>;
+    std::complex<float>, TiledArray::device_um_allocator<std::complex<float>>>;
+extern template class btas::varray<int, TiledArray::device_um_allocator<int>>;
+extern template class btas::varray<long, TiledArray::device_um_allocator<long>>;
 
 extern template class btas::Tensor<double, TiledArray::Range,
-                                   TiledArray::cuda_um_btas_varray<double>>;
+                                   TiledArray::device_um_btas_varray<double>>;
 extern template class btas::Tensor<float, TiledArray::Range,
-                                   TiledArray::cuda_um_btas_varray<float>>;
+                                   TiledArray::device_um_btas_varray<float>>;
 extern template class btas::Tensor<
     std::complex<double>, TiledArray::Range,
-    TiledArray::cuda_um_btas_varray<std::complex<double>>>;
+    TiledArray::device_um_btas_varray<std::complex<double>>>;
 extern template class btas::Tensor<
     std::complex<float>, TiledArray::Range,
-    TiledArray::cuda_um_btas_varray<std::complex<float>>>;
+    TiledArray::device_um_btas_varray<std::complex<float>>>;
 extern template class btas::Tensor<int, TiledArray::Range,
-                                   TiledArray::cuda_um_btas_varray<int>>;
+                                   TiledArray::device_um_btas_varray<int>>;
 extern template class btas::Tensor<long, TiledArray::Range,
-                                   TiledArray::cuda_um_btas_varray<long>>;
+                                   TiledArray::device_um_btas_varray<long>>;
 
 extern template class TiledArray::Tile<btas::Tensor<
-    double, TiledArray::Range, TiledArray::cuda_um_btas_varray<double>>>;
+    double, TiledArray::Range, TiledArray::device_um_btas_varray<double>>>;
 extern template class TiledArray::Tile<btas::Tensor<
-    float, TiledArray::Range, TiledArray::cuda_um_btas_varray<float>>>;
+    float, TiledArray::Range, TiledArray::device_um_btas_varray<float>>>;
 extern template class TiledArray::Tile<
     btas::Tensor<std::complex<double>, TiledArray::Range,
-                 TiledArray::cuda_um_btas_varray<std::complex<double>>>>;
+                 TiledArray::device_um_btas_varray<std::complex<double>>>>;
 extern template class TiledArray::Tile<
     btas::Tensor<std::complex<float>, TiledArray::Range,
-                 TiledArray::cuda_um_btas_varray<std::complex<float>>>>;
-extern template class TiledArray::Tile<
-    btas::Tensor<int, TiledArray::Range, TiledArray::cuda_um_btas_varray<int>>>;
+                 TiledArray::device_um_btas_varray<std::complex<float>>>>;
+extern template class TiledArray::Tile<btas::Tensor<
+    int, TiledArray::Range, TiledArray::device_um_btas_varray<int>>>;
 extern template class TiledArray::Tile<btas::Tensor<
-    long, TiledArray::Range, TiledArray::cuda_um_btas_varray<long>>>;
+    long, TiledArray::Range, TiledArray::device_um_btas_varray<long>>>;
 
 #endif  // TILEDARRAY_HEADER_ONLY
 
-#endif  // TILEDARRAY_HAS_CUDA
+#endif  // TILEDARRAY_HAS_DEVICE
 
 #endif  // TILEDARRAY_CUDA_CUDA_UM_TENSOR_H
diff --git a/src/TiledArray/cuda/cpu_cuda_vector.cu b/src/TiledArray/device/cpu_cuda_vector.cu
similarity index 98%
rename from src/TiledArray/cuda/cpu_cuda_vector.cu
rename to src/TiledArray/device/cpu_cuda_vector.cu
index 6c58fdd123..639cc56acc 100644
--- a/src/TiledArray/cuda/cpu_cuda_vector.cu
+++ b/src/TiledArray/device/cpu_cuda_vector.cu
@@ -1,5 +1,5 @@
 
-#include <TiledArray/cuda/cpu_cuda_vector.h>
+#include <TiledArray/device/cpu_cuda_vector.h>
 
 
 namespace thrust {
diff --git a/src/TiledArray/cuda/cpu_cuda_vector.h b/src/TiledArray/device/cpu_cuda_vector.h
similarity index 96%
rename from src/TiledArray/cuda/cpu_cuda_vector.h
rename to src/TiledArray/device/cpu_cuda_vector.h
index 5a6e52beb5..8c7b32900a 100644
--- a/src/TiledArray/cuda/cpu_cuda_vector.h
+++ b/src/TiledArray/device/cpu_cuda_vector.h
@@ -4,8 +4,8 @@
 
 #include <btas/array_adaptor.h>
 
-#include <TiledArray/cuda/platform.h>
-#include <TiledArray/cuda/thrust.h>
+#include <TiledArray/device/platform.h>
+#include <TiledArray/device/thrust.h>
 
 #include <madness/world/archive.h>
 
@@ -165,8 +165,8 @@ template <MemorySpace Space, typename T, typename HostAlloc,
           typename DeviceAlloc>
 bool in_memory_space(
     const cpu_cuda_vector<T, HostAlloc, DeviceAlloc>& vec) noexcept {
-  return (vec.on_host() && overlap(MemorySpace::CPU, Space)) ||
-         (vec.on_device() && overlap(MemorySpace::CUDA, Space));
+  return (vec.on_host() && overlap(MemorySpace::Host, Space)) ||
+         (vec.on_device() && overlap(MemorySpace::Device, Space));
 }
 
 template <ExecutionSpace Space, typename T, typename HostAlloc,
@@ -174,11 +174,11 @@ template <ExecutionSpace Space, typename T, typename HostAlloc,
 void to_execution_space(cpu_cuda_vector<T, HostAlloc, DeviceAlloc>& vec,
                         cudaStream_t stream = 0) {
   switch (Space) {
-    case ExecutionSpace::CPU: {
+    case ExecutionSpace::Host: {
       vec.to_host();
       break;
     }
-    case ExecutionSpace::CUDA: {
+    case ExecutionSpace::Device: {
       vec.to_device();
       break;
     }
diff --git a/src/TiledArray/cuda/cublas.h b/src/TiledArray/device/cublas.h
similarity index 100%
rename from src/TiledArray/cuda/cublas.h
rename to src/TiledArray/device/cublas.h
diff --git a/src/TiledArray/cuda/cuda_task_fn.h b/src/TiledArray/device/device_task_fn.h
similarity index 72%
rename from src/TiledArray/cuda/cuda_task_fn.h
rename to src/TiledArray/device/device_task_fn.h
index 8de133b3bd..a4b9db92e4 100644
--- a/src/TiledArray/cuda/cuda_task_fn.h
+++ b/src/TiledArray/device/device_task_fn.h
@@ -7,23 +7,22 @@
 
 #include <TiledArray/config.h>
 
-#ifdef TILEDARRAY_HAS_CUDA
+#ifdef TILEDARRAY_HAS_DEVICE
 
-#include <TiledArray/external/cuda.h>
+#include <TiledArray/external/device.h>
 #include <TiledArray/util/time.h>
-#include <cuda_runtime.h>
 #include <madness/world/taskfn.h>
 
 namespace TiledArray {
 namespace detail {
 
 template <int64_t CallabackId>
-std::atomic<int64_t>& cuda_callback_duration_ns() {
+std::atomic<int64_t>& device_callback_duration_ns() {
   static std::atomic<int64_t> value{0};
   return value;
 }
 
-inline std::atomic<int64_t>& cuda_taskfn_callback_duration_ns() {
+inline std::atomic<int64_t>& device_taskfn_callback_duration_ns() {
   static std::atomic<int64_t> value{0};
   return value;
 }
@@ -34,8 +33,8 @@ inline std::atomic<int64_t>& cuda_taskfn_callback_duration_ns() {
 namespace madness {
 
 ///
-/// cudaTaskFn class
-/// represent a task that calls an async cuda kernel
+/// deviceTaskFn class
+/// represent a task that calls an async device kernel
 /// the task must call synchronize_stream function to tell which stream it
 /// used
 ///
@@ -44,55 +43,55 @@ template <typename fnT, typename arg1T = void, typename arg2T = void,
           typename arg3T = void, typename arg4T = void, typename arg5T = void,
           typename arg6T = void, typename arg7T = void, typename arg8T = void,
           typename arg9T = void>
-struct cudaTaskFn : public TaskInterface {
-  static_assert(not(std::is_const<arg1T>::value ||
-                    std::is_reference<arg1T>::value),
-                "improper instantiation of cudaTaskFn, arg1T cannot be a const "
-                "or reference type");
-  static_assert(not(std::is_const<arg2T>::value ||
-                    std::is_reference<arg2T>::value),
-                "improper instantiation of cudaTaskFn, arg2T cannot be a const "
-                "or reference type");
-  static_assert(not(std::is_const<arg3T>::value ||
-                    std::is_reference<arg3T>::value),
-                "improper instantiation of cudaTaskFn, arg3T cannot be a const "
-                "or reference type");
-  static_assert(not(std::is_const<arg4T>::value ||
-                    std::is_reference<arg4T>::value),
-                "improper instantiation of cudaTaskFn, arg4T cannot be a const "
-                "or reference type");
-  static_assert(not(std::is_const<arg5T>::value ||
-                    std::is_reference<arg5T>::value),
-                "improper instantiation of cudaTaskFn, arg5T cannot be a const "
-                "or reference type");
-  static_assert(not(std::is_const<arg6T>::value ||
-                    std::is_reference<arg6T>::value),
-                "improper instantiation of cudaTaskFn, arg6T cannot be a const "
-                "or reference type");
-  static_assert(not(std::is_const<arg7T>::value ||
-                    std::is_reference<arg7T>::value),
-                "improper instantiation of cudaTaskFn, arg7T cannot be a const "
-                "or reference type");
-  static_assert(not(std::is_const<arg8T>::value ||
-                    std::is_reference<arg8T>::value),
-                "improper instantiation of cudaTaskFn, arg8T cannot be a const "
-                "or reference type");
-  static_assert(not(std::is_const<arg9T>::value ||
-                    std::is_reference<arg9T>::value),
-                "improper instantiation of cudaTaskFn, arg9T cannot be a const "
-                "or reference type");
+struct deviceTaskFn : public TaskInterface {
+  static_assert(
+      not(std::is_const<arg1T>::value || std::is_reference<arg1T>::value),
+      "improper instantiation of deviceTaskFn, arg1T cannot be a const "
+      "or reference type");
+  static_assert(
+      not(std::is_const<arg2T>::value || std::is_reference<arg2T>::value),
+      "improper instantiation of deviceTaskFn, arg2T cannot be a const "
+      "or reference type");
+  static_assert(
+      not(std::is_const<arg3T>::value || std::is_reference<arg3T>::value),
+      "improper instantiation of deviceTaskFn, arg3T cannot be a const "
+      "or reference type");
+  static_assert(
+      not(std::is_const<arg4T>::value || std::is_reference<arg4T>::value),
+      "improper instantiation of deviceTaskFn, arg4T cannot be a const "
+      "or reference type");
+  static_assert(
+      not(std::is_const<arg5T>::value || std::is_reference<arg5T>::value),
+      "improper instantiation of deviceTaskFn, arg5T cannot be a const "
+      "or reference type");
+  static_assert(
+      not(std::is_const<arg6T>::value || std::is_reference<arg6T>::value),
+      "improper instantiation of deviceTaskFn, arg6T cannot be a const "
+      "or reference type");
+  static_assert(
+      not(std::is_const<arg7T>::value || std::is_reference<arg7T>::value),
+      "improper instantiation of deviceTaskFn, arg7T cannot be a const "
+      "or reference type");
+  static_assert(
+      not(std::is_const<arg8T>::value || std::is_reference<arg8T>::value),
+      "improper instantiation of deviceTaskFn, arg8T cannot be a const "
+      "or reference type");
+  static_assert(
+      not(std::is_const<arg9T>::value || std::is_reference<arg9T>::value),
+      "improper instantiation of deviceTaskFn, arg9T cannot be a const "
+      "or reference type");
 
  private:
   /// This class type
-  typedef cudaTaskFn<fnT, arg1T, arg2T, arg3T, arg4T, arg5T, arg6T, arg7T,
-                     arg8T, arg9T>
-      cudaTaskFn_;
+  typedef deviceTaskFn<fnT, arg1T, arg2T, arg3T, arg4T, arg5T, arg6T, arg7T,
+                       arg8T, arg9T>
+      deviceTaskFn_;
 
   friend class AsyncTaskInterface;
 
-  /// internal Task structure that wraps the Async cuda function
+  /// internal Task structure that wraps the Async device function
   struct AsyncTaskInterface : public madness::TaskInterface {
-    AsyncTaskInterface(cudaTaskFn_* task, int ndepend = 0,
+    AsyncTaskInterface(deviceTaskFn_* task, int ndepend = 0,
                        const TaskAttributes attr = TaskAttributes())
         : TaskInterface(ndepend, attr), task_(task) {}
 
@@ -105,7 +104,7 @@ struct cudaTaskFn : public TaskInterface {
       task_->run_async();
 
       // get the stream used by async function
-      auto stream = TiledArray::tls_cudastream_accessor();
+      auto stream = TiledArray::device::tls_stream_accessor();
 
       //      TA_ASSERT(stream != nullptr);
 
@@ -113,32 +112,33 @@ struct cudaTaskFn : public TaskInterface {
       if (stream == nullptr) {
         task_->notify();
       } else {
-        // TODO should we use cuda callback or cuda events??
-        // insert cuda callback
-        cudaLaunchHostFunc(*stream, cuda_callback, task_);
+        // TODO should we use device callback or device events??
+        // insert device callback
+        TiledArray::device::launchHostFunc(*stream, device_callback, task_);
         // reset stream to nullptr
-        TiledArray::synchronize_stream(nullptr);
+        TiledArray::device::synchronize_stream(nullptr);
       }
     }
 
    private:
-    static void CUDART_CB cuda_callback(void* userData) {
+    static void DEVICERT_CB device_callback(void* userData) {
       TA_ASSERT(!madness::is_madness_thread());
       const auto t0 = TiledArray::now();
       // convert void * to AsyncTaskInterface*
-      auto* callback = static_cast<cudaTaskFn_*>(userData);
+      auto* callback = static_cast<deviceTaskFn_*>(userData);
       //      std::stringstream address;
       //      address << (void*) callback;
-      //      std::string message = "callback on cudaTaskFn: " + address.str() +
+      //      std::string message = "callback on deviceTaskFn: " + address.str()
+      //      +
       //        '\n'; std::cout << message;
       callback->notify();
       const auto t1 = TiledArray::now();
 
-      TiledArray::detail::cuda_taskfn_callback_duration_ns() +=
+      TiledArray::detail::device_taskfn_callback_duration_ns() +=
           TiledArray::duration_in_ns(t0, t1);
     }
 
-    cudaTaskFn_* task_;
+    deviceTaskFn_* task_;
   };
 
  public:
@@ -160,7 +160,7 @@ struct cudaTaskFn : public TaskInterface {
   futureT result_;             ///< The task Future result
   const functionT func_;       ///< The task function
   TaskInterface* async_task_;  ///< The internal AsyncTaskInterface that wraps
-  ///< the async cuda function
+  ///< the async device function
   futureT async_result_;  ///< the future returned from the async task
 
   // If the value of the argument is known at the time the
@@ -258,7 +258,7 @@ struct cudaTaskFn : public TaskInterface {
 
   /// Check dependencies and register callbacks where necessary
   void check_dependencies() {
-    this->inc();  // the current cudaTaskFn depends on the internal
+    this->inc();  // the current deviceTaskFn depends on the internal
     // AsyncTaskInterface, dependency = 1
     check_dependency(arg1_);
     check_dependency(arg2_);
@@ -272,13 +272,14 @@ struct cudaTaskFn : public TaskInterface {
   }
 
   // Copies are not allowed.
-  cudaTaskFn(const cudaTaskFn_&);
-  cudaTaskFn_ operator=(cudaTaskFn_&);
+  deviceTaskFn(const deviceTaskFn_&);
+  deviceTaskFn_ operator=(deviceTaskFn_&);
 
  public:
 #if MADNESS_TASKQ_VARIADICS
 
-  cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func,
+               const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -298,8 +299,8 @@ struct cudaTaskFn : public TaskInterface {
   }
 
   template <typename a1T>
-  cudaTaskFn(const futureT& result, functionT func, a1T&& a1,
-             const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, a1T&& a1,
+               const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -319,8 +320,8 @@ struct cudaTaskFn : public TaskInterface {
   }
 
   template <typename a1T, typename a2T>
-  cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
-             const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
+               const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -340,8 +341,8 @@ struct cudaTaskFn : public TaskInterface {
   }
 
   template <typename a1T, typename a2T, typename a3T>
-  cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
-             a3T&& a3, const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
+               a3T&& a3, const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -361,8 +362,8 @@ struct cudaTaskFn : public TaskInterface {
   }
 
   template <typename a1T, typename a2T, typename a3T, typename a4T>
-  cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
-             a3T&& a3, a4T&& a4, const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
+               a3T&& a3, a4T&& a4, const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -383,8 +384,8 @@ struct cudaTaskFn : public TaskInterface {
 
   template <typename a1T, typename a2T, typename a3T, typename a4T,
             typename a5T>
-  cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
-             a3T&& a3, a4T&& a4, a5T&& a5, const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
+               a3T&& a3, a4T&& a4, a5T&& a5, const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -405,8 +406,9 @@ struct cudaTaskFn : public TaskInterface {
 
   template <typename a1T, typename a2T, typename a3T, typename a4T,
             typename a5T, typename a6T>
-  cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
-             a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
+               a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6,
+               const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -427,9 +429,9 @@ struct cudaTaskFn : public TaskInterface {
 
   template <typename a1T, typename a2T, typename a3T, typename a4T,
             typename a5T, typename a6T, typename a7T>
-  cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
-             a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7,
-             const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
+               a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7,
+               const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -450,9 +452,9 @@ struct cudaTaskFn : public TaskInterface {
 
   template <typename a1T, typename a2T, typename a3T, typename a4T,
             typename a5T, typename a6T, typename a7T, typename a8T>
-  cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
-             a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8,
-             const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
+               a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8,
+               const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -474,9 +476,9 @@ struct cudaTaskFn : public TaskInterface {
   template <typename a1T, typename a2T, typename a3T, typename a4T,
             typename a5T, typename a6T, typename a7T, typename a8T,
             typename a9T>
-  cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
-             a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8,
-             a9T&& a9, const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2,
+               a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8,
+               a9T&& a9, const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -495,8 +497,9 @@ struct cudaTaskFn : public TaskInterface {
     check_dependencies();
   }
 
-  cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr,
-             archive::BufferInputArchive& input_arch)
+  deviceTaskFn(const futureT& result, functionT func,
+               const TaskAttributes& attr,
+               archive::BufferInputArchive& input_arch)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -514,7 +517,8 @@ struct cudaTaskFn : public TaskInterface {
     check_dependencies();
   }
 #else   // MADNESS_TASKQ_VARIADICS
-  cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func,
+               const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -534,8 +538,8 @@ struct cudaTaskFn : public TaskInterface {
   }
 
   template <typename a1T>
-  cudaTaskFn(const futureT& result, functionT func, const a1T& a1,
-             const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, const a1T& a1,
+               const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -555,8 +559,8 @@ struct cudaTaskFn : public TaskInterface {
   }
 
   template <typename a1T, typename a2T>
-  cudaTaskFn(const futureT& result, functionT func, const a1T& a1,
-             const a2T& a2, const TaskAttributes& attr = TaskAttributes())
+  deviceTaskFn(const futureT& result, functionT func, const a1T& a1,
+               const a2T& a2, const TaskAttributes& attr = TaskAttributes())
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -576,8 +580,8 @@ struct cudaTaskFn : public TaskInterface {
   }
 
   template <typename a1T, typename a2T, typename a3T>
-  cudaTaskFn(const futureT& result, functionT func, const a1T& a1,
-             const a2T& a2, const a3T& a3, const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, const a1T& a1,
+               const a2T& a2, const a3T& a3, const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -597,9 +601,9 @@ struct cudaTaskFn : public TaskInterface {
   }
 
   template <typename a1T, typename a2T, typename a3T, typename a4T>
-  cudaTaskFn(const futureT& result, functionT func, const a1T& a1,
-             const a2T& a2, const a3T& a3, const a4T& a4,
-             const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, const a1T& a1,
+               const a2T& a2, const a3T& a3, const a4T& a4,
+               const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -620,9 +624,9 @@ struct cudaTaskFn : public TaskInterface {
 
   template <typename a1T, typename a2T, typename a3T, typename a4T,
             typename a5T>
-  cudaTaskFn(const futureT& result, functionT func, const a1T& a1,
-             const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5,
-             const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, const a1T& a1,
+               const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5,
+               const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -643,9 +647,9 @@ struct cudaTaskFn : public TaskInterface {
 
   template <typename a1T, typename a2T, typename a3T, typename a4T,
             typename a5T, typename a6T>
-  cudaTaskFn(const futureT& result, functionT func, const a1T& a1,
-             const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5,
-             const a6T& a6, const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, const a1T& a1,
+               const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5,
+               const a6T& a6, const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -666,9 +670,9 @@ struct cudaTaskFn : public TaskInterface {
 
   template <typename a1T, typename a2T, typename a3T, typename a4T,
             typename a5T, typename a6T, typename a7T>
-  cudaTaskFn(const futureT& result, functionT func, const a1T& a1,
-             const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5,
-             const a6T& a6, const a7T& a7, const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, const a1T& a1,
+               const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5,
+               const a6T& a6, const a7T& a7, const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -689,10 +693,10 @@ struct cudaTaskFn : public TaskInterface {
 
   template <typename a1T, typename a2T, typename a3T, typename a4T,
             typename a5T, typename a6T, typename a7T, typename a8T>
-  cudaTaskFn(const futureT& result, functionT func, const a1T& a1,
-             const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5,
-             const a6T& a6, const a7T& a7, const a8T& a8,
-             const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, const a1T& a1,
+               const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5,
+               const a6T& a6, const a7T& a7, const a8T& a8,
+               const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -714,10 +718,10 @@ struct cudaTaskFn : public TaskInterface {
   template <typename a1T, typename a2T, typename a3T, typename a4T,
             typename a5T, typename a6T, typename a7T, typename a8T,
             typename a9T>
-  cudaTaskFn(const futureT& result, functionT func, const a1T& a1,
-             const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5,
-             const a6T& a6, const a7T& a7, const a8T& a8, const a9T& a9,
-             const TaskAttributes& attr)
+  deviceTaskFn(const futureT& result, functionT func, const a1T& a1,
+               const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5,
+               const a6T& a6, const a7T& a7, const a8T& a8, const a9T& a9,
+               const TaskAttributes& attr)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -736,8 +740,9 @@ struct cudaTaskFn : public TaskInterface {
     check_dependencies();
   }
 
-  cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr,
-             archive::BufferInputArchive& input_arch)
+  deviceTaskFn(const futureT& result, functionT func,
+               const TaskAttributes& attr,
+               archive::BufferInputArchive& input_arch)
       : TaskInterface(attr),
         result_(result),
         func_(func),
@@ -757,7 +762,7 @@ struct cudaTaskFn : public TaskInterface {
 #endif  // MADNESS_TASKQ_VARIADICS
 
   // no need to delete async_task_, as it will be deleted by the TaskQueue
-  virtual ~cudaTaskFn() = default;
+  virtual ~deviceTaskFn() = default;
 
   const futureT& result() const { return result_; }
 
@@ -770,16 +775,16 @@ struct cudaTaskFn : public TaskInterface {
   }
 #else
  protected:
-  /// when this cudaTaskFn gets run, it means the AsyncTaskInterface is done
+  /// when this deviceTaskFn gets run, it means the AsyncTaskInterface is done
   /// set the result with async_result_, which is finished
   void run(const TaskThreadEnv& env) override {
     result_.set(std::move(async_result_));
   }
 #endif  // HAVE_INTEL_TBB
 
-};  // class cudaTaskFn
+};  // class deviceTaskFn
 
-/// add a cudaTaskFn object to World
+/// add a deviceTaskFn object to World
 /// \tparam fnT A function pointer or functor
 /// \tparam a1T Type of argument 1.
 /// \tparam a2T Type of argument 2.
@@ -794,15 +799,15 @@ struct cudaTaskFn : public TaskInterface {
 /// \return Description needed.
 template <typename fnT, typename a1T, typename a2T, typename a3T, typename a4T,
           typename a5T, typename a6T, typename a7T, typename a8T, typename a9T>
-typename cudaTaskFn<fnT, a1T, a2T, a3T, a4T, a5T, a6T, a7T, a8T, a9T>::futureT
-add_cuda_taskfn(
+typename deviceTaskFn<fnT, a1T, a2T, a3T, a4T, a5T, a6T, a7T, a8T, a9T>::futureT
+add_device_taskfn(
     madness::World& world,
-    cudaTaskFn<fnT, a1T, a2T, a3T, a4T, a5T, a6T, a7T, a8T, a9T>* t) {
-  typename cudaTaskFn<fnT, a1T, a2T, a3T, a4T, a5T, a6T, a7T, a8T, a9T>::futureT
-      res(t->result());
-  // add the internal async task in cuda task as well
+    deviceTaskFn<fnT, a1T, a2T, a3T, a4T, a5T, a6T, a7T, a8T, a9T>* t) {
+  typename deviceTaskFn<fnT, a1T, a2T, a3T, a4T, a5T, a6T, a7T, a8T,
+                        a9T>::futureT res(t->result());
+  // add the internal async task in device task as well
   world.taskq.add(t->async_task());
-  // add the cuda task
+  // add the device task
   world.taskq.add(static_cast<TaskInterface*>(t));
   return res;
 }
@@ -815,13 +820,13 @@ template <
     typename fnT, typename... argsT,
     typename = std::enable_if_t<!meta::taskattr_is_last_arg<argsT...>::value>>
 typename detail::function_enabler<fnT(future_to_ref_t<argsT>...)>::type
-add_cuda_task(madness::World& world, fnT&& fn, argsT&&... args) {
-  /// type of cudaTaskFn object
+add_device_task(madness::World& world, fnT&& fn, argsT&&... args) {
+  /// type of deviceTaskFn object
   using taskT =
-      cudaTaskFn<std::decay_t<fnT>,
-                 std::remove_const_t<std::remove_reference_t<argsT>>...>;
+      deviceTaskFn<std::decay_t<fnT>,
+                   std::remove_const_t<std::remove_reference_t<argsT>>...>;
 
-  return add_cuda_taskfn(
+  return add_device_taskfn(
       world, new taskT(typename taskT::futureT(), std::forward<fnT>(fn),
                        std::forward<argsT>(args)..., TaskAttributes()));
 }
@@ -835,13 +840,13 @@ template <
     typename = std::enable_if_t<meta::taskattr_is_last_arg<argsT...>::value>>
 typename meta::drop_last_arg_and_apply_callable<
     detail::function_enabler, fnT, future_to_ref_t<argsT>...>::type::type
-add_cuda_task(madness::World& world, fnT&& fn, argsT&&... args) {
-  /// type of cudaTaskFn object
+add_device_task(madness::World& world, fnT&& fn, argsT&&... args) {
+  /// type of deviceTaskFn object
   using taskT = typename meta::drop_last_arg_and_apply<
-      cudaTaskFn, std::decay_t<fnT>,
+      deviceTaskFn, std::decay_t<fnT>,
       std::remove_const_t<std::remove_reference_t<argsT>>...>::type;
 
-  return add_cuda_taskfn(
+  return add_device_taskfn(
       world, new taskT(typename taskT::futureT(), std::forward<fnT>(fn),
                        std::forward<argsT>(args)...));
 }
@@ -852,14 +857,14 @@ add_cuda_task(madness::World& world, fnT&& fn, argsT&&... args) {
 /// \tparam argsT   variadic template for arguments
 /// \return A future to the result
 template <typename objT, typename memfnT, typename... argsT>
-typename detail::memfunc_enabler<objT, memfnT>::type add_cuda_task(
+typename detail::memfunc_enabler<objT, memfnT>::type add_device_task(
     madness::World& world, objT&& obj, memfnT memfn, argsT&&... args) {
-  return add_cuda_task(world,
-                       detail::wrap_mem_fn(std::forward<objT>(obj), memfn),
-                       std::forward<argsT>(args)...);
+  return add_device_task(world,
+                         detail::wrap_mem_fn(std::forward<objT>(obj), memfn),
+                         std::forward<argsT>(args)...);
 }
 
 }  // namespace madness
 
-#endif  // TILDARRAY_HAS_CUDA
+#endif  // TILDARRAY_HAS_DEVICE
 #endif  // TILEDARRAY_CUDA_CUDA_TASK_FN_H__INCLUDED
diff --git a/src/TiledArray/cuda/kernel/mult_kernel.cu b/src/TiledArray/device/kernel/mult_kernel.cu
similarity index 96%
rename from src/TiledArray/cuda/kernel/mult_kernel.cu
rename to src/TiledArray/device/kernel/mult_kernel.cu
index aa3cadbc72..ca2d86d4b9 100644
--- a/src/TiledArray/cuda/kernel/mult_kernel.cu
+++ b/src/TiledArray/device/kernel/mult_kernel.cu
@@ -21,8 +21,8 @@
  *
  */
 
-#include <TiledArray/cuda/kernel/mult_kernel.h>
-#include <TiledArray/cuda/kernel/mult_kernel_impl.h>
+#include <TiledArray/device/kernel/mult_kernel.h>
+#include <TiledArray/device/kernel/mult_kernel_impl.h>
 
 
 #ifdef TILEDARRAY_HAS_CUDA
diff --git a/src/TiledArray/cuda/kernel/mult_kernel.h b/src/TiledArray/device/kernel/mult_kernel.h
similarity index 100%
rename from src/TiledArray/cuda/kernel/mult_kernel.h
rename to src/TiledArray/device/kernel/mult_kernel.h
diff --git a/src/TiledArray/cuda/kernel/mult_kernel_impl.h b/src/TiledArray/device/kernel/mult_kernel_impl.h
similarity index 95%
rename from src/TiledArray/cuda/kernel/mult_kernel_impl.h
rename to src/TiledArray/device/kernel/mult_kernel_impl.h
index b237dfab1e..a1c217ce3d 100644
--- a/src/TiledArray/cuda/kernel/mult_kernel_impl.h
+++ b/src/TiledArray/device/kernel/mult_kernel_impl.h
@@ -34,7 +34,7 @@ namespace TiledArray {
 template <typename T>
 void mult_to_cuda_kernel_impl(T *result, const T *arg, std::size_t n,
                               cudaStream_t stream, int device_id) {
-  CudaSafeCall(cudaSetDevice(device_id));
+  DeviceSafeCall(device::setDevice(device_id));
 
   thrust::multiplies<T> mul_op;
   thrust::transform(
@@ -47,7 +47,7 @@ void mult_to_cuda_kernel_impl(T *result, const T *arg, std::size_t n,
 template <typename T>
 void mult_cuda_kernel_impl(T *result, const T *arg1, const T *arg2,
                            std::size_t n, cudaStream_t stream, int device_id) {
-  CudaSafeCall(cudaSetDevice(device_id));
+  DeviceSafeCall(device::setDevice(device_id));
 
   thrust::multiplies<T> mul_op;
   thrust::transform(
diff --git a/src/TiledArray/cuda/kernel/reduce_kernel.cu b/src/TiledArray/device/kernel/reduce_kernel.cu
similarity index 98%
rename from src/TiledArray/cuda/kernel/reduce_kernel.cu
rename to src/TiledArray/device/kernel/reduce_kernel.cu
index d24669b920..a09b3f7a41 100644
--- a/src/TiledArray/cuda/kernel/reduce_kernel.cu
+++ b/src/TiledArray/device/kernel/reduce_kernel.cu
@@ -21,8 +21,8 @@
  *
  */
 
-#include <TiledArray/cuda/kernel/reduce_kernel.h>
-#include <TiledArray/cuda/kernel/reduce_kernel_impl.h>
+#include <TiledArray/device/kernel/reduce_kernel.h>
+#include <TiledArray/device/kernel/reduce_kernel_impl.h>
 
 
 #ifdef TILEDARRAY_HAS_CUDA
diff --git a/src/TiledArray/cuda/kernel/reduce_kernel.h b/src/TiledArray/device/kernel/reduce_kernel.h
similarity index 100%
rename from src/TiledArray/cuda/kernel/reduce_kernel.h
rename to src/TiledArray/device/kernel/reduce_kernel.h
diff --git a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h b/src/TiledArray/device/kernel/reduce_kernel_impl.h
similarity index 95%
rename from src/TiledArray/cuda/kernel/reduce_kernel_impl.h
rename to src/TiledArray/device/kernel/reduce_kernel_impl.h
index 9dc6507cca..f03e333dbb 100644
--- a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h
+++ b/src/TiledArray/device/kernel/reduce_kernel_impl.h
@@ -26,7 +26,7 @@
 
 #include <limits>
 
-#include <TiledArray/external/cuda.h>
+#include <TiledArray/external/device.h>
 #include <TiledArray/type_traits.h>
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
@@ -57,7 +57,7 @@ struct absolute_value
 template <typename T, typename ReduceOp>
 T reduce_cuda_kernel_impl(ReduceOp &&op, const T *arg, std::size_t n, T init,
                           cudaStream_t stream, int device_id) {
-  CudaSafeCall(cudaSetDevice(device_id));
+  DeviceSafeCall(device::setDevice(device_id));
 
   auto arg_p = thrust::device_pointer_cast(arg);
 
@@ -107,7 +107,7 @@ TiledArray::detail::scalar_t<T> absmax_reduce_cuda_kernel_impl(
   thrust::maximum<TR> max_op;
   detail::absolute_value<T> abs_op;
 
-  CudaSafeCall(cudaSetDevice(device_id));
+  DeviceSafeCall(device::setDevice(device_id));
 
   auto arg_p = thrust::device_pointer_cast(arg);
 
@@ -125,7 +125,7 @@ TiledArray::detail::scalar_t<T> absmin_reduce_cuda_kernel_impl(
   thrust::minimum<TR> min_op;
   detail::absolute_value<T> abs_op;
 
-  CudaSafeCall(cudaSetDevice(device_id));
+  DeviceSafeCall(device::setDevice(device_id));
 
   auto arg_p = thrust::device_pointer_cast(arg);
 
diff --git a/src/TiledArray/cuda/platform.h b/src/TiledArray/device/platform.h
similarity index 93%
rename from src/TiledArray/cuda/platform.h
rename to src/TiledArray/device/platform.h
index f94226b39e..9d0cac3cdd 100644
--- a/src/TiledArray/cuda/platform.h
+++ b/src/TiledArray/device/platform.h
@@ -31,9 +31,9 @@ enum class MemorySpace {
   // MemorySpace is represented as a bitfield to compute unions and
   // intersections easier
   Null = 0b00,
-  CPU = 0b01,
-  CUDA = 0b10,
-  CUDA_UM = CPU | CUDA  // union of CPU and CUDA spaces
+  Host = 0b01,
+  Device = 0b10,
+  Device_UM = Host | Device  // union of host and device spaces
 };
 
 // customization point: in_memory_space<S>(O) -> bool
@@ -55,7 +55,7 @@ constexpr bool overlap(MemorySpace space1, MemorySpace space2) {
 }
 
 /// enumerates the execution spaces
-enum class ExecutionSpace { CPU, CUDA };
+enum class ExecutionSpace { Host, Device };
 
 // customization point: to_execution_space<S>(O) -> void
 // "moves" O to execution space S
diff --git a/src/TiledArray/cuda/thrust.h b/src/TiledArray/device/thrust.h
similarity index 100%
rename from src/TiledArray/cuda/thrust.h
rename to src/TiledArray/device/thrust.h
diff --git a/src/TiledArray/cuda/um_storage.cu b/src/TiledArray/device/um_storage.cu
similarity index 66%
rename from src/TiledArray/cuda/um_storage.cu
rename to src/TiledArray/device/um_storage.cu
index a16811e91b..cc3a1aae55 100644
--- a/src/TiledArray/cuda/um_storage.cu
+++ b/src/TiledArray/device/um_storage.cu
@@ -22,29 +22,29 @@
  */
 
 
-#include <TiledArray/cuda/allocators.h>
-#include <TiledArray/cuda/thrust.h>
+#include <TiledArray/device/allocators.h>
+#include <TiledArray/device/thrust.h>
 
 #ifdef TILEDARRAY_HAS_CUDA
 
 namespace thrust {
 template<>
-void resize<double,TiledArray::cuda_um_allocator<double>>(
-    thrust::device_vector<double, TiledArray::cuda_um_allocator<double>>& dev_vec,
+void resize<double,TiledArray::device_um_allocator<double>>(
+    thrust::device_vector<double, TiledArray::device_um_allocator<double>>& dev_vec,
     size_t size) {
     dev_vec.resize(size);
 }
 template<>
-void resize<float,TiledArray::cuda_um_allocator<float>>(
-    thrust::device_vector<float, TiledArray::cuda_um_allocator<float>>& dev_vec,
+void resize<float,TiledArray::device_um_allocator<float>>(
+    thrust::device_vector<float, TiledArray::device_um_allocator<float>>& dev_vec,
     size_t size) {
     dev_vec.resize(size);
 }
 }
 
 namespace thrust {
-template class device_vector<double, TiledArray::cuda_um_allocator<double>>;
-template class device_vector<float, TiledArray::cuda_um_allocator<float>>;
+template class device_vector<double, TiledArray::device_um_allocator<double>>;
+template class device_vector<float, TiledArray::device_um_allocator<float>>;
 }
 
 #endif //TILEDARRAY_HAS_CUDA
diff --git a/src/TiledArray/cuda/um_storage.h b/src/TiledArray/device/um_storage.h
similarity index 78%
rename from src/TiledArray/cuda/um_storage.h
rename to src/TiledArray/device/um_storage.h
index bea591cbb2..e4318e5666 100644
--- a/src/TiledArray/cuda/um_storage.h
+++ b/src/TiledArray/device/um_storage.h
@@ -24,15 +24,15 @@
 #ifndef TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED
 #define TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED
 
-#include <TiledArray/cuda/allocators.h>
-#include <TiledArray/cuda/thrust.h>
+#include <TiledArray/device/allocators.h>
+#include <TiledArray/device/thrust.h>
 
-#ifdef TILEDARRAY_HAS_CUDA
+#ifdef TILEDARRAY_HAS_DEVICE
 
 #include <btas/array_adaptor.h>
 #include <btas/varray/varray.h>
 
-#include <TiledArray/cuda/platform.h>
+#include <TiledArray/device/platform.h>
 #include <TiledArray/utility.h>
 
 #include <madness/world/archive.h>
@@ -41,40 +41,41 @@ namespace TiledArray {
 
 template <typename T>
 using cuda_um_thrust_vector =
-    thrust::device_vector<T, TiledArray::cuda_um_allocator<T>>;
+    thrust::device_vector<T, TiledArray::device_um_allocator<T>>;
 
 /// @return true if @c dev_vec is present in space @space
 template <MemorySpace Space, typename Storage>
 bool in_memory_space(const Storage& vec) noexcept {
-  return overlap(MemorySpace::CUDA_UM, Space);
+  return overlap(MemorySpace::Device_UM, Space);
 }
 
 /**
  * @tparam Space
- * @tparam Storage  the Storage type of the vector, such as cuda_um_btas_varray
+ * @tparam Storage  the Storage type of the vector, such as
+ * device_um_btas_varray
  */
 template <ExecutionSpace Space, typename Storage>
 void to_execution_space(Storage& vec, cudaStream_t stream = 0) {
   switch (Space) {
-    case ExecutionSpace::CPU: {
+    case ExecutionSpace::Host: {
       using std::data;
       using std::size;
       using value_type = typename Storage::value_type;
-      if (cudaEnv::instance()->concurrent_managed_access()) {
-        CudaSafeCall(cudaMemPrefetchAsync(data(vec),
-                                          size(vec) * sizeof(value_type),
-                                          cudaCpuDeviceId, stream));
+      if (deviceEnv::instance()->concurrent_managed_access()) {
+        DeviceSafeCall(cudaMemPrefetchAsync(data(vec),
+                                            size(vec) * sizeof(value_type),
+                                            cudaCpuDeviceId, stream));
       }
       break;
     }
-    case ExecutionSpace::CUDA: {
+    case ExecutionSpace::Device: {
       using std::data;
       using std::size;
       using value_type = typename Storage::value_type;
       int device = -1;
-      if (cudaEnv::instance()->concurrent_managed_access()) {
-        CudaSafeCall(cudaGetDevice(&device));
-        CudaSafeCall(cudaMemPrefetchAsync(
+      if (deviceEnv::instance()->concurrent_managed_access()) {
+        DeviceSafeCall(cudaGetDevice(&device));
+        DeviceSafeCall(cudaMemPrefetchAsync(
             data(vec), size(vec) * sizeof(value_type), device, stream));
       }
       break;
@@ -95,8 +96,8 @@ template <typename Storage>
 void make_device_storage(Storage& storage, std::size_t n,
                          const cudaStream_t& stream = 0) {
   storage = Storage(n);
-  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(storage,
-                                                                   stream);
+  TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(storage,
+                                                                     stream);
 }
 
 /**
@@ -131,7 +132,7 @@ struct ArchiveLoadImpl<Archive, TiledArray::cuda_um_thrust_vector<T>> {
   static inline void load(const Archive& ar,
                           TiledArray::cuda_um_thrust_vector<T>& x) {
     typename thrust::device_vector<
-        T, TiledArray::cuda_um_allocator<T>>::size_type n(0);
+        T, TiledArray::device_um_allocator<T>>::size_type n(0);
     ar& n;
     x.resize(n);
     for (auto& xi : x) ar& xi;
diff --git a/src/TiledArray/dist_eval/binary_eval.h b/src/TiledArray/dist_eval/binary_eval.h
index a4c203d3dd..fa33d74d9c 100644
--- a/src/TiledArray/dist_eval/binary_eval.h
+++ b/src/TiledArray/dist_eval/binary_eval.h
@@ -123,12 +123,12 @@ class BinaryEvalImpl : public DistEvalImpl<typename Op::result_type, Policy>,
  private:
   /// Task function for evaluating tiles
 
-#ifdef TILEDARRAY_HAS_CUDA
+#ifdef TILEDARRAY_HAS_DEVICE
   /// \param i The tile index
   /// \param left The left-hand tile
   /// \param right The right-hand tile
   template <typename L, typename R, typename U = value_type>
-  std::enable_if_t<!detail::is_cuda_tile_v<U>, void> eval_tile(
+  std::enable_if_t<!detail::is_device_tile_v<U>, void> eval_tile(
       const ordinal_type i, L left, R right) {
     DistEvalImpl_::set_tile(i, op_(left, right));
   }
@@ -137,11 +137,11 @@ class BinaryEvalImpl : public DistEvalImpl<typename Op::result_type, Policy>,
   /// \param left The left-hand tile
   /// \param right The right-hand tile
   template <typename L, typename R, typename U = value_type>
-  std::enable_if_t<detail::is_cuda_tile_v<U>, void> eval_tile(
+  std::enable_if_t<detail::is_device_tile_v<U>, void> eval_tile(
       const ordinal_type i, L left, R right) {
     // TODO avoid copy the Op object
     auto result_tile =
-        madness::add_cuda_task(DistEvalImpl_::world(), op_, left, right);
+        madness::add_device_task(DistEvalImpl_::world(), op_, left, right);
     DistEvalImpl_::set_tile(i, result_tile);
   }
 #else
diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h
index ae3b456bc0..0d07821a4e 100644
--- a/src/TiledArray/dist_eval/contraction_eval.h
+++ b/src/TiledArray/dist_eval/contraction_eval.h
@@ -31,11 +31,11 @@
 
 #include <TiledArray/tensor/type_traits.h>
 
-//#define TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL 1
-//#define TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE 1
-//#define TILEDARRAY_ENABLE_SUMMA_TRACE_STEP 1
-//#define TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST 1
-//#define TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE 1
+// #define TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL 1
+// #define TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE 1
+// #define TILEDARRAY_ENABLE_SUMMA_TRACE_STEP 1
+// #define TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST 1
+// #define TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE 1
 
 namespace TiledArray {
 namespace detail {
@@ -479,7 +479,7 @@ class Summa
   static typename std::enable_if<
       is_lazy_tile<typename Arg::value_type>::value
 #ifdef TILEDARRAY_HAS_CUDA
-          && !detail::is_cuda_tile_v<typename Arg::value_type>
+          && !detail::is_device_tile_v<typename Arg::value_type>
 #endif
       ,
       Future<typename Arg::eval_type>>::type
@@ -502,13 +502,14 @@ class Summa
   template <typename Arg>
   static typename std::enable_if<
       is_lazy_tile<typename Arg::value_type>::value &&
-          detail::is_cuda_tile_v<typename Arg::value_type>,
+          detail::is_device_tile_v<typename Arg::value_type>,
       Future<typename Arg::eval_type>>::type
   get_tile(Arg& arg, const typename Arg::ordinal_type index) {
     auto convert_tile_fn =
         &Summa_::template convert_tile<typename Arg::value_type>;
-    return madness::add_cuda_task(arg.world(), convert_tile_fn, arg.get(index),
-                                  madness::TaskAttributes::hipri());
+    return madness::add_device_task(arg.world(), convert_tile_fn,
+                                    arg.get(index),
+                                    madness::TaskAttributes::hipri());
   }
 #endif
 
diff --git a/src/TiledArray/dist_eval/dist_eval.h b/src/TiledArray/dist_eval/dist_eval.h
index b1056e0ac1..b45a85bda0 100644
--- a/src/TiledArray/dist_eval/dist_eval.h
+++ b/src/TiledArray/dist_eval/dist_eval.h
@@ -26,7 +26,7 @@
 #include <TiledArray/tensor_impl.h>
 #include <TiledArray/type_traits.h>
 #ifdef TILEDARRAY_HAS_CUDA
-#include <TiledArray/cuda/cuda_task_fn.h>
+#include <TiledArray/device/device_task_fn.h>
 #include <TiledArray/external/cuda.h>
 #endif
 
diff --git a/src/TiledArray/dist_eval/unary_eval.h b/src/TiledArray/dist_eval/unary_eval.h
index b3707b92c2..32052d2700 100644
--- a/src/TiledArray/dist_eval/unary_eval.h
+++ b/src/TiledArray/dist_eval/unary_eval.h
@@ -115,18 +115,18 @@ class UnaryEvalImpl
   /// \param i The tile index
   /// \param tile The tile to be evaluated
   template <typename U = value_type>
-  std::enable_if_t<detail::is_cuda_tile_v<U>, void> eval_tile(
+  std::enable_if_t<detail::is_device_tile_v<U>, void> eval_tile(
       const ordinal_type i, tile_argument_type tile) {
     // TODO avoid copy Op object
     auto result_tile =
-        madness::add_cuda_task(DistEvalImpl_::world(), op_, tile);
+        madness::add_device_task(DistEvalImpl_::world(), op_, tile);
     DistEvalImpl_::set_tile(i, result_tile);
   }
 
   /// \param i The tile index
   /// \param tile The tile to be evaluated
   template <typename U = value_type>
-  std::enable_if_t<!detail::is_cuda_tile_v<U>, void> eval_tile(
+  std::enable_if_t<!detail::is_device_tile_v<U>, void> eval_tile(
       const ordinal_type i, tile_argument_type tile) {
     DistEvalImpl_::set_tile(i, op_(tile));
   }
diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h
index 1a7bc2ff05..d2bbe7c673 100644
--- a/src/TiledArray/expressions/expr.h
+++ b/src/TiledArray/expressions/expr.h
@@ -41,7 +41,7 @@
 #include "TiledArray/tile_interface/trace.h"
 #include "expr_engine.h"
 #ifdef TILEDARRAY_HAS_CUDA
-#include <TiledArray/cuda/cuda_task_fn.h>
+#include <TiledArray/device/device_task_fn.h>
 #include <TiledArray/external/cuda.h>
 #endif
 
@@ -187,7 +187,7 @@ class Expr {
       typename std::enable_if<!std::is_same<typename A::value_type, T>::value &&
                               is_lazy_tile<T>::value
 #ifdef TILEDARRAY_HAS_CUDA
-                              && !::TiledArray::detail::is_cuda_tile_v<T>
+                              && !::TiledArray::detail::is_device_tile_v<T>
 #endif
                               >::type* = nullptr>
   void set_tile(A& array, const I& index, const Future<T>& tile) const {
@@ -210,9 +210,9 @@ class Expr {
             typename std::enable_if<
                 !std::is_same<typename A::value_type, T>::value &&
                 is_lazy_tile<T>::value &&
-                ::TiledArray::detail::is_cuda_tile_v<T>>::type* = nullptr>
+                ::TiledArray::detail::is_device_tile_v<T>>::type* = nullptr>
   void set_tile(A& array, const I& index, const Future<T>& tile) const {
-    array.set(index, madness::add_cuda_task(
+    array.set(index, madness::add_device_task(
                          array.world(),
                          TiledArray::Cast<typename A::value_type, T>(), tile));
   }
@@ -247,7 +247,7 @@ class Expr {
       typename A, typename I, typename T, typename Op,
       typename std::enable_if<!std::is_same<typename A::value_type, T>::value
 #ifdef TILEDARRAY_HAS_CUDA
-                              && !::TiledArray::detail::is_cuda_tile_v<T>
+                              && !::TiledArray::detail::is_device_tile_v<T>
 #endif
                               >::type* = nullptr>
   void set_tile(A& array, const I index, const Future<T>& tile,
@@ -275,14 +275,14 @@ class Expr {
   template <typename A, typename I, typename T, typename Op,
             typename std::enable_if<
                 !std::is_same<typename A::value_type, T>::value &&
-                ::TiledArray::detail::is_cuda_tile_v<T>>::type* = nullptr>
+                ::TiledArray::detail::is_device_tile_v<T>>::type* = nullptr>
   void set_tile(A& array, const I index, const Future<T>& tile,
                 const std::shared_ptr<Op>& op) const {
     auto eval_tile_fn = &Expr_::template eval_tile<
         typename A::value_type, const T&,
         TiledArray::Cast<typename Op::argument_type, T>, Op>;
     array.set(index,
-              madness::add_cuda_task(
+              madness::add_device_task(
                   array.world(), eval_tile_fn, tile,
                   TiledArray::Cast<typename Op::argument_type, T>(), op));
   }
@@ -304,7 +304,7 @@ class Expr {
       typename A, typename I, typename T, typename Op,
       typename std::enable_if<std::is_same<typename A::value_type, T>::value
 #ifdef TILEDARRAY_HAS_CUDA
-                              && !::TiledArray::detail::is_cuda_tile_v<T>
+                              && !::TiledArray::detail::is_device_tile_v<T>
 #endif
                               >::type* = nullptr>
   void set_tile(A& array, const I index, const Future<T>& tile,
@@ -332,7 +332,7 @@ class Expr {
   template <typename A, typename I, typename T, typename Op,
             typename std::enable_if<
                 std::is_same<typename A::value_type, T>::value&& ::TiledArray::
-                    detail::is_cuda_tile_v<T>>::type* = nullptr>
+                    detail::is_device_tile_v<T>>::type* = nullptr>
   void set_tile(A& array, const I index, const Future<T>& tile,
                 const std::shared_ptr<Op>& op) const {
     auto eval_tile_fn_ptr = &Expr_::template eval_tile<const T&, Op>;
@@ -340,8 +340,8 @@ class Expr {
     static_assert(madness::detail::function_traits<fn_ptr_type(
                       const T&, const std::shared_ptr<Op>&)>::value,
                   "ouch");
-    array.set(index, madness::add_cuda_task(array.world(), eval_tile_fn_ptr,
-                                            tile, op));
+    array.set(index, madness::add_device_task(array.world(), eval_tile_fn_ptr,
+                                              tile, op));
   }
 #endif
 
diff --git a/src/TiledArray/external/cuda.h b/src/TiledArray/external/cuda.h
index 1d169b1098..7a6e4d50e1 100644
--- a/src/TiledArray/external/cuda.h
+++ b/src/TiledArray/external/cuda.h
@@ -1,509 +1,3 @@
-/*
- *  This file is a part of TiledArray.
- *  Copyright (C) 2018  Virginia Tech
- *
- *  This program is free software: you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation, either version 3 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- *  Chong Peng
- *  Department of Chemistry, Virginia Tech
- *  July 23, 2018
- *
- */
-
-#ifndef TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED
-#define TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED
-
-#include <cassert>
-#include <cstdlib>
-#include <vector>
-
-#include <TiledArray/config.h>
-
-#ifdef TILEDARRAY_HAS_CUDA
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <nvToolsExt.h>
-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
-
-#include <TiledArray/external/umpire.h>
-
-#include <TiledArray/external/madness.h>
-#include <madness/world/print.h>
-#include <madness/world/safempi.h>
-#include <madness/world/thread.h>
-
-#include <TiledArray/error.h>
-
-#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
-#define CudaSafeCallNoThrow(err) __cudaSafeCallNoThrow(err, __FILE__, __LINE__)
-#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__)
-
-inline void __cudaSafeCall(cudaError err, const char* file, const int line) {
-  if (cudaSuccess != err) {
-    std::stringstream ss;
-    ss << "cudaSafeCall() failed at: " << file << ":" << line;
-    std::string what = ss.str();
-    throw thrust::system_error(err, thrust::cuda_category(), what);
-  }
-}
-
-inline void __cudaSafeCallNoThrow(cudaError err, const char* file,
-                                  const int line) {
-  if (cudaSuccess != err) {
-    madness::print_error("cudaSafeCallNoThrow() failed at: ", file, ":", line);
-  }
-}
-
-inline void __cudaCheckError(const char* file, const int line) {
-  cudaError err = cudaGetLastError();
-  if (cudaSuccess != err) {
-    std::stringstream ss;
-    ss << "cudaCheckError() failed at: " << file << ":" << line;
-    std::string what = ss.str();
-    throw thrust::system_error(err, thrust::cuda_category(), what);
-  }
-}
-
-namespace TiledArray {
-
-namespace detail {
-
-inline int num_cuda_streams() {
-  int num_streams = -1;
-  char* num_stream_char = std::getenv("TA_CUDA_NUM_STREAMS");
-  /// default num of streams is 3
-  if (num_stream_char) {
-    num_streams = std::atoi(num_stream_char);
-  } else {
-    num_streams = 3;
-  }
-  return num_streams;
-}
-
-inline int num_cuda_devices() {
-  int num_devices = -1;
-  CudaSafeCall(cudaGetDeviceCount(&num_devices));
-  return num_devices;
-}
-
-inline int current_cuda_device_id(World& world) {
-  int mpi_local_size = -1;
-  int mpi_local_rank = -1;
-  std::tie(mpi_local_rank, mpi_local_size) = mpi_local_rank_size(world);
-
-  int num_devices = detail::num_cuda_devices();
-
-  int cuda_device_id = -1;
-  // devices may already be pre-mapped
-  // if mpi_local_size <= num_devices : all ranks are in same resource set, map
-  // round robin
-  if (mpi_local_size <= num_devices) {
-    cuda_device_id = mpi_local_rank % num_devices;
-  } else {  // mpi_local_size > num_devices
-    char* cvd_cstr = std::getenv("CUDA_VISIBLE_DEVICES");
-    if (cvd_cstr) {  // CUDA_VISIBLE_DEVICES is set, assume that pre-mapped
-      // make sure that there is only 1 device available here
-      if (num_devices != 1) {
-        throw std::runtime_error(
-            std::string(
-                "CUDA_VISIBLE_DEVICES environment variable is set, hence using "
-                "the provided device-to-rank mapping; BUT TiledArray found ") +
-            std::to_string(num_devices) +
-            " CUDA devices; only 1 CUDA device / MPI process is supported");
-      }
-      cuda_device_id = 0;
-    } else {  // not enough devices + devices are not pre-mapped
-      throw std::runtime_error(
-          std::string("TiledArray found ") + std::to_string(mpi_local_size) +
-          " MPI ranks on a node with " + std::to_string(num_devices) +
-          " CUDA devices; only 1 MPI process / CUDA device model is currently "
-          "supported");
-    }
-  }
-
-  return cuda_device_id;
-}
-
-inline void CUDART_CB cuda_readyflag_callback(void* userData) {
-  // convert void * to std::atomic<bool>
-  std::atomic<bool>* flag = static_cast<std::atomic<bool>*>(userData);
-  // set the flag to be true
-  flag->store(true);
-}
-
-struct ProbeFlag {
-  ProbeFlag(std::atomic<bool>* f) : flag(f) {}
-
-  bool operator()() const { return flag->load(); }
-
-  std::atomic<bool>* flag;
-};
-
-inline void thread_wait_cuda_stream(const cudaStream_t& stream) {
-  std::atomic<bool>* flag = new std::atomic<bool>(false);
-
-  CudaSafeCall(
-      cudaLaunchHostFunc(stream, detail::cuda_readyflag_callback, flag));
-
-  detail::ProbeFlag probe(flag);
-
-  // wait with sleep and do not do work
-  madness::ThreadPool::await(probe, false, true);
-  //    madness::ThreadPool::await(probe, true, true);
-
-  delete flag;
-}
-
-}  // namespace detail
-
-inline const cudaStream_t*& tls_cudastream_accessor() {
-  static thread_local const cudaStream_t* thread_local_stream_ptr{nullptr};
-  return thread_local_stream_ptr;
-}
-
-inline void synchronize_stream(const cudaStream_t* stream) {
-  tls_cudastream_accessor() = stream;
-}
-
-/**
- * cudaEnv maintains the CUDA-related part of the runtime environment,
- * such as CUDA-specific memory allocators
- *
- * \note this is a Singleton
- */
-class cudaEnv {
- public:
-  ~cudaEnv() {
-    // destroy cuda streams on current device
-    for (auto& stream : cuda_streams_) {
-      CudaSafeCallNoThrow(cudaStreamDestroy(stream));
-    }
-  }
-
-  cudaEnv(const cudaEnv&) = delete;
-  cudaEnv(cudaEnv&&) = delete;
-  cudaEnv& operator=(const cudaEnv&) = delete;
-  cudaEnv& operator=(cudaEnv&&) = delete;
-
-  /// access the singleton instance; if not initialized will be
-  /// initialized via cudaEnv::initialize() with the default params
-  static std::unique_ptr<cudaEnv>& instance() {
-    if (!instance_accessor()) {
-      initialize();
-    }
-    return instance_accessor();
-  }
-
-  // clang-format off
-  /// initialize the instance using explicit params
-  /// \param world the world to use for initialization
-  /// \param page_size memory added to the pools supporting `this->um_allocator()`, `this->device_allocator()`, and `this->pinned_allocator()` in chunks of at least
-  ///                  this size (bytes) [default=2^25]
-  /// \param pinned_alloc_limit the maximum total amount of memory (in bytes) that
-  ///        allocator returned by `this->pinned_allocator()` can allocate;
-  ///        this allocator is not used by default [default=0]
-  // clang-format on
-  static void initialize(World& world = TiledArray::get_default_world(),
-                         const std::uint64_t page_size = (1ul << 25),
-                         const std::uint64_t pinned_alloc_limit = (1ul << 40)) {
-    static std::mutex mtx;  // to make initialize() reentrant
-    std::scoped_lock lock{mtx};
-    // only the winner of the lock race gets to initialize
-    if (instance_accessor() == nullptr) {
-      int num_streams = detail::num_cuda_streams();
-      int num_devices = detail::num_cuda_devices();
-      int device_id = detail::current_cuda_device_id(world);
-      // set device for current MPI process .. will be set in the ctor as well
-      CudaSafeCall(cudaSetDevice(device_id));
-      CudaSafeCall(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
-
-      // uncomment to debug umpire ops
-      //
-      //      umpire::util::Logger::getActiveLogger()->setLoggingMsgLevel(
-      //          umpire::util::message::Debug);
-
-      //       make Thread Safe UM Dynamic POOL
-
-      auto& rm = umpire::ResourceManager::getInstance();
-
-      auto mem_total_free = cudaEnv::memory_total_and_free_device();
-
-      // turn off Umpire introspection for non-Debug builds
-#ifndef NDEBUG
-      constexpr auto introspect = true;
-#else
-      constexpr auto introspect = false;
-#endif
-
-      // allocate all currently-free memory for UM pool
-      auto um_dynamic_pool =
-          rm.makeAllocator<umpire::strategy::QuickPool, introspect>(
-              "UMDynamicPool", rm.getAllocator("UM"), mem_total_free.second,
-              pinned_alloc_limit);
-
-      // allocate zero memory for device pool
-      auto dev_size_limited_alloc =
-          rm.makeAllocator<umpire::strategy::SizeLimiter, introspect>(
-              "size_limited_alloc", rm.getAllocator("DEVICE"),
-              mem_total_free.first);
-      auto dev_dynamic_pool =
-          rm.makeAllocator<umpire::strategy::QuickPool, introspect>(
-              "CUDADynamicPool", dev_size_limited_alloc, 0, pinned_alloc_limit);
-
-      // allocate pinned_alloc_limit in pinned memory
-      auto pinned_size_limited_alloc =
-          rm.makeAllocator<umpire::strategy::SizeLimiter, introspect>(
-              "SizeLimited_PINNED", rm.getAllocator("PINNED"),
-              pinned_alloc_limit);
-      auto pinned_dynamic_pool =
-          rm.makeAllocator<umpire::strategy::QuickPool, introspect>(
-              "QuickPool_SizeLimited_PINNED", pinned_size_limited_alloc,
-              page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE);
-
-      auto cuda_env = std::unique_ptr<cudaEnv>(
-          new cudaEnv(world, num_devices, device_id, num_streams,
-                      um_dynamic_pool, dev_dynamic_pool, pinned_dynamic_pool));
-      instance_accessor() = std::move(cuda_env);
-    }
-  }
-
-  World& world() const { return *world_; }
-
-  int num_cuda_devices() const { return num_cuda_devices_; }
-
-  int current_cuda_device_id() const { return current_cuda_device_id_; }
-
-  int num_cuda_streams() const { return num_cuda_streams_; }
-
-  bool concurrent_managed_access() const {
-    return cuda_device_concurrent_managed_access_;
-  }
-
-  size_t stream_id(const cudaStream_t& stream) const {
-    auto it = std::find(cuda_streams_.begin(), cuda_streams_.end(), stream);
-    if (it == cuda_streams_.end()) abort();
-    return it - cuda_streams_.begin();
-  }
-
-  /// @return the total size of all and free device memory on the current device
-  static std::pair<size_t, size_t> memory_total_and_free_device() {
-    std::pair<size_t, size_t> result;
-    // N.B. cudaMemGetInfo returns {free,total}
-    CudaSafeCall(cudaMemGetInfo(&result.second, &result.first));
-    return result;
-  }
-
-  /// Collective call to probe CUDA {total,free} memory
-
-  /// @return the total size of all and free device memory on every rank's
-  /// device
-  std::vector<std::pair<size_t, size_t>> memory_total_and_free() const {
-    auto world_size = world_->size();
-    std::vector<size_t> total_memory(world_size, 0), free_memory(world_size, 0);
-    auto rank = world_->rank();
-    std::tie(total_memory.at(rank), free_memory.at(rank)) =
-        cudaEnv::memory_total_and_free_device();
-    world_->gop.sum(total_memory.data(), total_memory.size());
-    world_->gop.sum(free_memory.data(), free_memory.size());
-    std::vector<std::pair<size_t, size_t>> result(world_size);
-    for (int r = 0; r != world_size; ++r) {
-      result.at(r) = {total_memory.at(r), free_memory.at(r)};
-    }
-    return result;
-  }
-
-  const cudaStream_t& cuda_stream(std::size_t i) const {
-    return cuda_streams_.at(i);
-  }
-
-  const cudaStream_t& cuda_stream_h2d() const {
-    return cuda_streams_[num_cuda_streams_];
-  }
-
-  const cudaStream_t& cuda_stream_d2h() const {
-    return cuda_streams_[num_cuda_streams_ + 1];
-  }
-
-  /// @return a (non-thread-safe) Umpire allocator for CUDA UM
-  umpire::Allocator& um_allocator() { return um_allocator_; }
-
-  // clang-format off
-  /// @return the max actual amount of memory held by um_allocator()
-  /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()`
-  /// @note if there is only 1 Umpire allocator using UM memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("UM").getHighWatermark()`
-  // clang-format on
-  std::size_t um_allocator_getActualHighWatermark() {
-    TA_ASSERT(dynamic_cast<umpire::strategy::QuickPool*>(
-                  um_allocator_.getAllocationStrategy()) != nullptr);
-    return dynamic_cast<umpire::strategy::QuickPool*>(
-               um_allocator_.getAllocationStrategy())
-        ->getActualHighwaterMark();
-  }
-
-  /// @return a (non-thread-safe) Umpire allocator for CUDA device memory
-  umpire::Allocator& device_allocator() { return device_allocator_; }
-
-  // clang-format off
-  /// @return the max actual amount of memory held by um_allocator()
-  /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()`
-  /// @note if there is only 1 Umpire allocator using DEVICE memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("DEVICE").getHighWatermark()`
-  // clang-format on
-  std::size_t device_allocator_getActualHighWatermark() {
-    TA_ASSERT(dynamic_cast<umpire::strategy::QuickPool*>(
-                  device_allocator_.getAllocationStrategy()) != nullptr);
-    return dynamic_cast<umpire::strategy::QuickPool*>(
-               device_allocator_.getAllocationStrategy())
-        ->getActualHighwaterMark();
-  }
-
-  /// @return an Umpire allocator that allocates from a
-  ///         pinned memory pool
-  /// @warning this is not a thread-safe allocator, should be only used when
-  ///          wrapped into umpire_allocator_impl
-  umpire::Allocator& pinned_allocator() { return pinned_allocator_; }
-
-  // clang-format off
-  /// @return the max actual amount of memory held by pinned_allocator()
-  /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()`
-  /// @note if there is only 1 Umpire allocator using PINNED memory this should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("PINNED").getHighWatermark()`
-  // clang-format on
-  std::size_t pinned_allocator_getActualHighWatermark() {
-    TA_ASSERT(dynamic_cast<umpire::strategy::QuickPool*>(
-                  pinned_allocator_.getAllocationStrategy()) != nullptr);
-    return dynamic_cast<umpire::strategy::QuickPool*>(
-               pinned_allocator_.getAllocationStrategy())
-        ->getActualHighwaterMark();
-  }
-
- protected:
-  cudaEnv(World& world, int num_devices, int device_id, int num_streams,
-          umpire::Allocator um_alloc, umpire::Allocator device_alloc,
-          umpire::Allocator pinned_alloc)
-      : world_(&world),
-        um_allocator_(um_alloc),
-        device_allocator_(device_alloc),
-        pinned_allocator_(pinned_alloc),
-        num_cuda_devices_(num_devices),
-        current_cuda_device_id_(device_id),
-        num_cuda_streams_(num_streams) {
-    if (num_devices <= 0) {
-      throw std::runtime_error("No CUDA-Enabled GPUs Found!\n");
-    }
-
-    // set device for current MPI process
-    CudaSafeCall(cudaSetDevice(current_cuda_device_id_));
-
-    /// check the capability of CUDA device
-    cudaDeviceProp prop;
-    CudaSafeCall(cudaGetDeviceProperties(&prop, device_id));
-    if (!prop.managedMemory) {
-      throw std::runtime_error("CUDA Device doesn't support managedMemory\n");
-    }
-    int concurrent_managed_access;
-    CudaSafeCall(cudaDeviceGetAttribute(&concurrent_managed_access,
-                                        cudaDevAttrConcurrentManagedAccess,
-                                        device_id));
-    cuda_device_concurrent_managed_access_ = concurrent_managed_access;
-    if (!cuda_device_concurrent_managed_access_) {
-      std::cout << "\nWarning: CUDA Device doesn't support "
-                   "ConcurrentManagedAccess!\n\n";
-    }
-
-    // creates cuda streams on current device
-    cuda_streams_.resize(num_cuda_streams_ + 2);
-    for (auto& stream : cuda_streams_) {
-      CudaSafeCall(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-    }
-    std::cout << "created " << num_cuda_streams_
-              << " CUDA streams + 2 I/O streams" << std::endl;
-  }
-
- private:
-  // the world used to initialize this
-  World* world_;
-
-  /// allocator backed by a (non-thread-safe) dynamically-sized pool for CUDA UM
-  umpire::Allocator um_allocator_;
-  /// allocator backed by a (non-thread-safe) dynamically-sized pool for device
-  /// memory
-  umpire::Allocator device_allocator_;
-  // allocates from a dynamic, size-limited pinned memory pool
-  // N.B. not thread safe, so must be wrapped into umpire_allocator_impl
-  umpire::Allocator pinned_allocator_;
-
-  int num_cuda_devices_;
-  int current_cuda_device_id_;
-  bool cuda_device_concurrent_managed_access_;
-
-  int num_cuda_streams_;
-  std::vector<cudaStream_t> cuda_streams_;
-
-  inline static std::unique_ptr<cudaEnv>& instance_accessor() {
-    static std::unique_ptr<cudaEnv> instance_{nullptr};
-    return instance_;
-  }
-};
-
-namespace detail {
-
-template <typename Range>
-const cudaStream_t& get_stream_based_on_range(const Range& range) {
-  // TODO better way to get stream based on the id of tensor
-  auto stream_id = range.offset() % cudaEnv::instance()->num_cuda_streams();
-  auto& stream = cudaEnv::instance()->cuda_stream(stream_id);
-  return stream;
-}
-
-}  // namespace detail
-
-namespace nvidia {
-
-// Color definitions for nvtxcalls
-enum class argbColor : uint32_t {
-  red = 0xFFFF0000,
-  blue = 0xFF0000FF,
-  green = 0xFF008000,
-  yellow = 0xFFFFFF00,
-  cyan = 0xFF00FFFF,
-  magenta = 0xFFFF00FF,
-  gray = 0xFF808080,
-  purple = 0xFF800080
-};
-
-/// enter a profiling range by calling nvtxRangePushEx
-/// \param[in] range_title a char string containing the range title
-/// \param[in] range_color the range color
-inline void range_push(const char* range_title, argbColor range_color) {
-  nvtxEventAttributes_t eventAttrib = {0};
-  eventAttrib.version = NVTX_VERSION;
-  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
-  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
-  eventAttrib.colorType = NVTX_COLOR_ARGB;
-  eventAttrib.color = static_cast<uint32_t>(range_color);
-  eventAttrib.message.ascii = range_title;
-  nvtxRangePushEx(&eventAttrib);
-}
-
-/// exits the current profiling range by calling nvtxRangePopEx
-inline void range_pop() { nvtxRangePop(); }
-
-}  // namespace nvidia
-
-}  // namespace TiledArray
-
-#endif  // TILEDARRAY_HAS_CUDA
-
-#endif  // TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED
+#warning \
+    "This header is deprecated. Please use TiledArray/external/device.h instead."
+#include <TiledArray/external/device.h>
diff --git a/src/TiledArray/external/hip.h b/src/TiledArray/external/device.h
similarity index 53%
rename from src/TiledArray/external/hip.h
rename to src/TiledArray/external/device.h
index 75dbfc6955..5016d55b23 100644
--- a/src/TiledArray/external/hip.h
+++ b/src/TiledArray/external/device.h
@@ -21,8 +21,8 @@
  *
  */
 
-#ifndef TILEDARRAY_EXTERNAL_HIP_H__INCLUDED
-#define TILEDARRAY_EXTERNAL_HIP_H__INCLUDED
+#ifndef TILEDARRAY_EXTERNAL_DEVICE_H__INCLUDED
+#define TILEDARRAY_EXTERNAL_DEVICE_H__INCLUDED
 
 #include <cassert>
 #include <cstdlib>
@@ -30,9 +30,15 @@
 
 #include <TiledArray/config.h>
 
-#ifdef TILEDARRAY_HAS_HIP
-
+#if defined(TILEDARRAY_HAS_HIP)
 #include <hip/hip_runtime.h>
+#elif defined(TILEDARRAY_HAS_CUDA)
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+#endif
 
 #include <TiledArray/external/umpire.h>
 
@@ -43,9 +49,40 @@
 
 #include <TiledArray/error.h>
 
-#define HipSafeCall(err) __hipSafeCall(err, __FILE__, __LINE__)
-#define HipSafeCallNoThrow(err) __hipSafeCallNoThrow(err, __FILE__, __LINE__)
-#define HipCheckError() __hipCheckError(__FILE__, __LINE__)
+#if defined(TILEDARRAY_HAS_CUDA)
+
+inline void __DeviceSafeCall(cudaError err, const char* file, const int line) {
+  if (cudaSuccess != err) {
+    std::stringstream ss;
+    ss << "DeviceSafeCall() failed at: " << file << ":" << line;
+    std::string what = ss.str();
+    throw thrust::system_error(err, thrust::cuda_category(), what);
+  }
+}
+
+inline void __cudaSafeCallNoThrow(cudaError err, const char* file,
+                                  const int line) {
+  if (cudaSuccess != err) {
+    madness::print_error("cudaSafeCallNoThrow() failed at: ", file, ":", line);
+  }
+}
+
+inline void __cudaCheckError(const char* file, const int line) {
+  cudaError err = cudaGetLastError();
+  if (cudaSuccess != err) {
+    std::stringstream ss;
+    ss << "cudaCheckError() failed at: " << file << ":" << line;
+    std::string what = ss.str();
+    throw thrust::system_error(err, thrust::cuda_category(), what);
+  }
+}
+
+#define DeviceSafeCall(err) __DeviceSafeCall(err, __FILE__, __LINE__)
+#define DeviceSafeCallNoThrow(err) \
+  __cudaSafeCallNoThrow(err, __FILE__, __LINE__)
+#define DeviceCheckError() __cudaCheckError(__FILE__, __LINE__)
+
+#elif defined(TILEDARRAY_HAS_HIP)
 
 inline void __hipSafeCall(hipError_t err, const char* file, const int line) {
   if (hipSuccess != err) {
@@ -74,34 +111,158 @@ inline void __hipCheckError(const char* file, const int line) {
   }
 }
 
+#define DeviceSafeCall(err) __hipSafeCall(err, __FILE__, __LINE__)
+#define DeviceSafeCallNoThrow(err) __hipSafeCallNoThrow(err, __FILE__, __LINE__)
+#define DeviceCheckError() __hipCheckError(__FILE__, __LINE__)
+
+#endif
+
 namespace TiledArray {
+namespace device {
+
+#if defined(TILEDARRAY_HAS_CUDA)
+inline namespace cuda {
+using stream_t = cudaStream_t;
+using error_t = cudaError_t;
+using hostFn_t = cudaHostFn_t;
+using deviceProp_t = cudaDeviceProp;
+using deviceAttr_t = cudaDeviceAttr;
+#define DeviceAttributeConcurrentManagedAccess \
+  cudaDevAttrConcurrentManagedAccess
+#define DEVICERT_CB CUDART_CB
+
+enum FuncCache {
+  FuncCachePreferNone = cudaFuncCachePreferNone,
+  FuncCachePreferShared = cudaFuncCachePreferShared,
+  FuncCachePreferL1 = cudaFuncCachePreferL1,
+  FuncCachePreferEqual = cudaFuncCachePreferEqual
+};
 
-namespace detail {
+enum StreamCreateFlags {
+  StreamDefault = cudaStreamDefault,
+  StreamNonBlocking = cudaStreamNonBlocking
+};
+
+inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) {
+  return cudaLaunchHostFunc(stream, fn, userData);
+}
+inline error_t streamDestroy(stream_t stream) {
+  return cudaStreamDestroy(stream);
+}
+inline error_t setDevice(int device) { return device::setDevice(device); }
+inline error_t deviceSetCacheConfig(FuncCache cache_config) {
+  return cudaDeviceSetCacheConfig(static_cast<cudaFuncCache>(cache_config));
+}
+inline error_t memGetInfo(size_t* free, size_t* total) {
+  return cudaMemGetInfo(free, total);
+}
+inline error_t getDeviceProperties(deviceProp_t* prop, int device) {
+  return cudaGetDeviceProperties(prop, device);
+}
+inline error_t deviceGetAttribute(int* value, deviceAttr_t attr, int device) {
+  return cudaDeviceGetAttribute(value, attr, device);
+}
+inline error_t streamCreateWithFlags(stream_t* pStream,
+                                     StreamCreateFlags flags) {
+  return cudaStreamCreateWithFlags(pStream, flags);
+}
+inline error_t deviceSynchronize() { return cudaDeviceSynchronize(); }
+inline error_t streamSynchronize(stream_t stream) {
+  return cudaStreamSynchronize(stream);
+}
+}  // namespace cuda
+#elif defined(TILEDARRAY_HAS_HIP)
+inline namespace hip {
+using stream_t = hipStream_t;
+using error_t = hipError_t;
+using hostFn_t = hipHostFn_t;
+using deviceProp_t = hipDeviceProp;
+using deviceAttr_t = hipDeviceAttr;
+#define DeviceAttributeConcurrentManagedAccess \
+  hipDeviceAttributeConcurrentManagedAccess
+#define DEVICERT_CD HIPRT_CB
+
+enum FuncCache {
+  FuncCachePreferNone = hipFuncCachePreferNone,
+  FuncCachePreferShared = hipFuncCachePreferShared,
+  FuncCachePreferL1 = hipFuncCachePreferL1,
+  FuncCachePreferEqual = hipFuncCachePreferEqual
+};
+
+enum StreamCreateFlags {
+  StreamDefault = hipStreamDefault,
+  StreamNonBlocking = hipStreamNonBlocking
+};
+
+inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) {
+  return hipLaunchHostFunc(stream, fn, userData);
+}
+inline error_t streamDestroy(stream_t stream) {
+  return hipStreamDestroy(stream);
+}
+inline error_t setDevice(int device) { return hipSetDevice(device); }
+inline error_t deviceSetCacheConfig(FuncCache cache_config) {
+  return hipDeviceSetCacheConfig(static_cast<hipFuncCache>(cache_config));
+}
+inline error_t memGetInfo(size_t* free, size_t* total) {
+  return hipMemGetInfo(free, total);
+}
+inline error_t getDeviceProperties(deviceProp* prop, int device) {
+  return hipGetDeviceProperties(prop, device);
+}
+inline error_t deviceGetAttribute(int* value, deviceAttr_t attr, int device) {
+  return hipDeviceGetAttribute(value, attr, device);
+}
+inline error_t streamCreateWithFlags(stream_t* pStream,
+                                     StreamCreateFlags flags) {
+  return hipStreamCreateWithFlags(pStream, flags);
+}
+inline error_t deviceSynchronize() { return hipDeviceSynchronize(); }
+inline error_t streamSynchronize(stream_t stream) {
+  return hipStreamSynchronize(stream);
+}
+}  // namespace hip
+#endif
+
+#ifdef TILEDARRAY_HAS_DEVICE
 
 inline int num_streams() {
   int num_streams = -1;
-  char* num_stream_char = std::getenv("TA_HIP_NUM_STREAMS");
-  /// default num of streams is 3
+  char* num_stream_char = std::getenv("TA_DEVICE_NUM_STREAMS");
   if (num_stream_char) {
     num_streams = std::atoi(num_stream_char);
   } else {
-    num_streams = 3;
+#if defined(TILEDARRAY_HAS_CUDA)
+    char* num_stream_char = std::getenv("TA_CUDA_NUM_STREAMS");
+#elif defined(TILEDARRAY_HAS_HIP)
+    char* num_stream_char = std::getenv("TA_HIP_NUM_STREAMS");
+#endif
+    if (num_stream_char) {
+      num_streams = std::atoi(num_stream_char);
+    } else {
+      /// default num of streams is 3
+      num_streams = 3;
+    }
   }
   return num_streams;
 }
 
 inline int num_devices() {
   int num_devices = -1;
-  HipSafeCall(hipGetDeviceCount(&num_devices));
+#if defined(TILEDARRAY_HAS_CUDA)
+  DeviceSafeCall(cudaGetDeviceCount(&num_devices));
+#elif defined(TILEDARRAY_HAS_HIP)
+  DeviceSafeCall(hipGetDeviceCount(&num_devices));
+#endif
   return num_devices;
 }
 
 inline int current_device_id(World& world) {
   int mpi_local_size = -1;
   int mpi_local_rank = -1;
-  std::tie(mpi_local_rank, mpi_local_size) = mpi_local_rank_size(world);
+  std::tie(mpi_local_rank, mpi_local_size) = detail::mpi_local_rank_size(world);
 
-  int num_devices = detail::num_devices();
+  int num_devices = device::num_devices();
 
   int device_id = -1;
   // devices may already be pre-mapped
@@ -110,23 +271,25 @@ inline int current_device_id(World& world) {
   if (mpi_local_size <= num_devices) {
     device_id = mpi_local_rank % num_devices;
   } else {  // mpi_local_size > num_devices
-    char* cvd_cstr = std::getenv("HIP_VISIBLE_DEVICES");
-    if (cvd_cstr) {  // HIP_VISIBLE_DEVICES is set, assume that pre-mapped
+    const char* vd_cstr =
+        std::getenv(TILEDARRAY_DEVICE_RUNTIME_STR "_VISIBLE_DEVICES");
+    if (vd_cstr) {  // *_VISIBLE_DEVICES is set, assume that pre-mapped
       // make sure that there is only 1 device available here
       if (num_devices != 1) {
         throw std::runtime_error(
             std::string(
-                "HIP_VISIBLE_DEVICES environment variable is set, hence using "
+                TILEDARRAY_DEVICE_RUNTIME_STR
+                "_VISIBLE_DEVICES environment variable is set, hence using "
                 "the provided device-to-rank mapping; BUT TiledArray found ") +
             std::to_string(num_devices) +
-            " HIP devices; only 1 HIP device / MPI process is supported");
+            " devices; only 1 device / MPI process is supported");
       }
       device_id = 0;
     } else {  // not enough devices + devices are not pre-mapped
       throw std::runtime_error(
           std::string("TiledArray found ") + std::to_string(mpi_local_size) +
           " MPI ranks on a node with " + std::to_string(num_devices) +
-          " HIP devices; only 1 MPI process / HIP device model is currently "
+          " devices; only 1 MPI process / device model is currently "
           "supported");
     }
   }
@@ -134,7 +297,7 @@ inline int current_device_id(World& world) {
   return device_id;
 }
 
-inline void HIPRT_CB hip_readyflag_callback(void* userData) {
+inline void DEVICERT_CB readyflag_callback(void* userData) {
   // convert void * to std::atomic<bool>
   std::atomic<bool>* flag = static_cast<std::atomic<bool>*>(userData);
   // set the flag to be true
@@ -149,12 +312,12 @@ struct ProbeFlag {
   std::atomic<bool>* flag;
 };
 
-inline void thread_wait_stream(const hipStream_t& stream) {
+inline void thread_wait_stream(const stream_t& stream) {
   std::atomic<bool>* flag = new std::atomic<bool>(false);
 
-  HipSafeCall(hipLaunchHostFunc(stream, detail::hip_readyflag_callback, flag));
+  DeviceSafeCall(launchHostFunc(stream, readyflag_callback, flag));
 
-  detail::ProbeFlag probe(flag);
+  ProbeFlag probe(flag);
 
   // wait with sleep and do not do work
   madness::ThreadPool::await(probe, false, true);
@@ -163,40 +326,38 @@ inline void thread_wait_stream(const hipStream_t& stream) {
   delete flag;
 }
 
-}  // namespace detail
-
-inline const hipStream_t*& tls_stream_accessor() {
-  static thread_local const hipStream_t* thread_local_stream_ptr{nullptr};
+inline const stream_t*& tls_stream_accessor() {
+  static thread_local const stream_t* thread_local_stream_ptr{nullptr};
   return thread_local_stream_ptr;
 }
 
-inline void synchronize_stream(const hipStream_t* stream) {
+inline void synchronize_stream(const stream_t* stream) {
   tls_stream_accessor() = stream;
 }
 
 /**
- * hipEnv maintains the HIP-related part of the runtime environment,
- * such as HIP-specific memory allocators
+ * Env maintains the device-related part of the runtime environment,
+ * such as specialized data structures like device streams and memory allocators
  *
  * \note this is a Singleton
  */
-class hipEnv {
+class Env {
  public:
-  ~hipEnv() {
+  ~Env() {
     // destroy streams on current device
     for (auto& stream : streams_) {
-      HipSafeCallNoThrow(hipStreamDestroy(stream));
+      DeviceSafeCallNoThrow(streamDestroy(stream));
     }
   }
 
-  hipEnv(const hipEnv&) = delete;
-  hipEnv(hipEnv&&) = delete;
-  hipEnv& operator=(const hipEnv&) = delete;
-  hipEnv& operator=(hipEnv&&) = delete;
+  Env(const Env&) = delete;
+  Env(Env&&) = delete;
+  Env& operator=(const Env&) = delete;
+  Env& operator=(Env&&) = delete;
 
   /// access the singleton instance; if not initialized will be
-  /// initialized via hipEnv::initialize() with the default params
-  static std::unique_ptr<hipEnv>& instance() {
+  /// initialized via Env::initialize() with the default params
+  static std::unique_ptr<Env>& instance() {
     if (!instance_accessor()) {
       initialize();
     }
@@ -219,12 +380,12 @@ class hipEnv {
     std::scoped_lock lock{mtx};
     // only the winner of the lock race gets to initialize
     if (instance_accessor() == nullptr) {
-      int num_streams = detail::num_streams();
-      int num_devices = detail::num_devices();
-      int device_id = detail::current_device_id(world);
+      int num_streams = device::num_streams();
+      int num_devices = device::num_devices();
+      int device_id = device::current_device_id(world);
       // set device for current MPI process .. will be set in the ctor as well
-      HipSafeCall(hipSetDevice(device_id));
-      HipSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared));
+      DeviceSafeCall(setDevice(device_id));
+      DeviceSafeCall(deviceSetCacheConfig(FuncCachePreferShared));
 
       // uncomment to debug umpire ops
       //
@@ -235,7 +396,7 @@ class hipEnv {
 
       auto& rm = umpire::ResourceManager::getInstance();
 
-      auto mem_total_free = hipEnv::memory_total_and_free_device();
+      auto mem_total_free = Env::memory_total_and_free_device();
 
       // turn off Umpire introspection for non-Debug builds
 #ifndef NDEBUG
@@ -257,7 +418,8 @@ class hipEnv {
               mem_total_free.first);
       auto dev_dynamic_pool =
           rm.makeAllocator<umpire::strategy::QuickPool, introspect>(
-              "HIPDynamicPool", dev_size_limited_alloc, 0, pinned_alloc_limit);
+              "DEVICEDynamicPool", dev_size_limited_alloc, 0,
+              pinned_alloc_limit);
 
       // allocate pinned_alloc_limit in pinned memory
       auto pinned_size_limited_alloc =
@@ -269,10 +431,10 @@ class hipEnv {
               "QuickPool_SizeLimited_PINNED", pinned_size_limited_alloc,
               page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE);
 
-      auto hip_env = std::unique_ptr<hipEnv>(
-          new hipEnv(world, num_devices, device_id, num_streams,
-                     um_dynamic_pool, dev_dynamic_pool, pinned_dynamic_pool));
-      instance_accessor() = std::move(hip_env);
+      auto env = std::unique_ptr<Env>(
+          new Env(world, num_devices, device_id, num_streams, um_dynamic_pool,
+                  dev_dynamic_pool, pinned_dynamic_pool));
+      instance_accessor() = std::move(env);
     }
   }
 
@@ -288,7 +450,7 @@ class hipEnv {
     return device_concurrent_managed_access_;
   }
 
-  size_t stream_id(const hipStream_t& stream) const {
+  size_t stream_id(const stream_t& stream) const {
     auto it = std::find(streams_.begin(), streams_.end(), stream);
     if (it == streams_.end()) abort();
     return it - streams_.begin();
@@ -297,12 +459,12 @@ class hipEnv {
   /// @return the total size of all and free device memory on the current device
   static std::pair<size_t, size_t> memory_total_and_free_device() {
     std::pair<size_t, size_t> result;
-    // N.B. hipMemGetInfo returns {free,total}
-    HipSafeCall(hipMemGetInfo(&result.second, &result.first));
+    // N.B. *MemGetInfo returns {free,total}
+    DeviceSafeCall(memGetInfo(&result.second, &result.first));
     return result;
   }
 
-  /// Collective call to probe HIP {total,free} memory
+  /// Collective call to probe device {total,free} memory
 
   /// @return the total size of all and free device memory on every rank's
   /// device
@@ -311,7 +473,7 @@ class hipEnv {
     std::vector<size_t> total_memory(world_size, 0), free_memory(world_size, 0);
     auto rank = world_->rank();
     std::tie(total_memory.at(rank), free_memory.at(rank)) =
-        hipEnv::memory_total_and_free_device();
+        Env::memory_total_and_free_device();
     world_->gop.sum(total_memory.data(), total_memory.size());
     world_->gop.sum(free_memory.data(), free_memory.size());
     std::vector<std::pair<size_t, size_t>> result(world_size);
@@ -321,11 +483,11 @@ class hipEnv {
     return result;
   }
 
-  const hipStream_t& stream(std::size_t i) const { return streams_.at(i); }
+  const stream_t& stream(std::size_t i) const { return streams_.at(i); }
 
-  const hipStream_t& stream_h2d() const { return streams_[num_streams_]; }
+  const stream_t& stream_h2d() const { return streams_[num_streams_]; }
 
-  const hipStream_t& stream_d2h() const { return streams_[num_streams_ + 1]; }
+  const stream_t& stream_d2h() const { return streams_[num_streams_ + 1]; }
 
   /// @return a (non-thread-safe) Umpire allocator for device UM
   umpire::Allocator& um_allocator() { return um_allocator_; }
@@ -362,7 +524,7 @@ class hipEnv {
   /// @return an Umpire allocator that allocates from a
   ///         pinned memory pool
   /// @warning this is not a thread-safe allocator, should be only used when
-  ///          wrapped into umpire_allocator_impl
+  ///          wrapped into umpire_based_allocator_impl
   umpire::Allocator& pinned_allocator() { return pinned_allocator_; }
 
   // clang-format off
@@ -379,9 +541,9 @@ class hipEnv {
   }
 
  protected:
-  hipEnv(World& world, int num_devices, int device_id, int num_streams,
-         umpire::Allocator um_alloc, umpire::Allocator device_alloc,
-         umpire::Allocator pinned_alloc)
+  Env(World& world, int num_devices, int device_id, int num_streams,
+      umpire::Allocator um_alloc, umpire::Allocator device_alloc,
+      umpire::Allocator pinned_alloc)
       : world_(&world),
         um_allocator_(um_alloc),
         device_allocator_(device_alloc),
@@ -390,34 +552,38 @@ class hipEnv {
         current_device_id_(device_id),
         num_streams_(num_streams) {
     if (num_devices <= 0) {
-      throw std::runtime_error("No HIP-Enabled GPUs Found!\n");
+      throw std::runtime_error("No " TILEDARRAY_DEVICE_RUNTIME_STR
+                               " compute devices found!\n");
     }
 
     // set device for current MPI process
-    HipSafeCall(hipSetDevice(current_device_id_));
+    DeviceSafeCall(setDevice(current_device_id_));
 
-    /// check the capability of HIP device
-    hipDeviceProp prop;
-    HipSafeCall(hipGetDeviceProperties(&prop, device_id));
+    /// check the capability of device
+    deviceProp_t prop;
+    DeviceSafeCall(getDeviceProperties(&prop, device_id));
     if (!prop.managedMemory) {
-      throw std::runtime_error("HIP Device doesn't support managedMemory\n");
+      throw std::runtime_error(TILEDARRAY_DEVICE_RUNTIME_STR
+                               "device doesn't support managedMemory\n");
     }
     int concurrent_managed_access;
-    HipSafeCall(hipDeviceGetAttribute(&concurrent_managed_access,
-                                      hipDeviceAttributeConcurrentManagedAccess,
+    DeviceSafeCall(deviceGetAttribute(&concurrent_managed_access,
+                                      DeviceAttributeConcurrentManagedAccess,
                                       device_id));
     device_concurrent_managed_access_ = concurrent_managed_access;
     if (!device_concurrent_managed_access_) {
-      std::cout << "\nWarning: HIP Device doesn't support "
+      std::cout << "\nWarning: " TILEDARRAY_DEVICE_RUNTIME_STR
+                   " device doesn't support "
                    "ConcurrentManagedAccess!\n\n";
     }
 
     // creates streams on current device
     streams_.resize(num_streams_ + 2);
     for (auto& stream : streams_) {
-      HipSafeCall(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+      DeviceSafeCall(streamCreateWithFlags(&stream, StreamNonBlocking));
     }
-    std::cout << "created " << num_streams_ << " HIP streams + 2 I/O streams"
+    std::cout << "created " << num_streams_
+              << " " TILEDARRAY_DEVICE_RUNTIME_STR " streams + 2 I/O streams"
               << std::endl;
   }
 
@@ -431,7 +597,7 @@ class hipEnv {
   /// memory
   umpire::Allocator device_allocator_;
   // allocates from a dynamic, size-limited pinned memory pool
-  // N.B. not thread safe, so must be wrapped into umpire_allocator_impl
+  // N.B. not thread safe, so must be wrapped into umpire_based_allocator_impl
   umpire::Allocator pinned_allocator_;
 
   int num_devices_;
@@ -439,28 +605,63 @@ class hipEnv {
   bool device_concurrent_managed_access_;
 
   int num_streams_;
-  std::vector<hipStream_t> streams_;
+  std::vector<stream_t> streams_;
 
-  inline static std::unique_ptr<hipEnv>& instance_accessor() {
-    static std::unique_ptr<hipEnv> instance_{nullptr};
+  inline static std::unique_ptr<Env>& instance_accessor() {
+    static std::unique_ptr<Env> instance_{nullptr};
     return instance_;
   }
 };
 
-namespace detail {
+}  // namespace device
 
+namespace detail {
 template <typename Range>
-const hipStream_t& get_stream_based_on_range(const Range& range) {
+const device::stream_t& get_stream_based_on_range(const Range& range) {
   // TODO better way to get stream based on the id of tensor
-  auto stream_id = range.offset() % hipEnv::instance()->num_streams();
-  auto& stream = hipEnv::instance()->stream(stream_id);
+  auto stream_id = range.offset() % device::Env::instance()->num_streams();
+  auto& stream = device::Env::instance()->stream(stream_id);
   return stream;
 }
-
 }  // namespace detail
 
-}  // namespace TiledArray
+#endif  // TILEDARRAY_HAS_DEVICE
+
+#ifdef TILEDARRAY_HAS_CUDA
+namespace nvidia {
+
+// Color definitions for nvtxcalls
+enum class argbColor : uint32_t {
+  red = 0xFFFF0000,
+  blue = 0xFF0000FF,
+  green = 0xFF008000,
+  yellow = 0xFFFFFF00,
+  cyan = 0xFF00FFFF,
+  magenta = 0xFFFF00FF,
+  gray = 0xFF808080,
+  purple = 0xFF800080
+};
+
+/// enter a profiling range by calling nvtxRangePushEx
+/// \param[in] range_title a char string containing the range title
+/// \param[in] range_color the range color
+inline void range_push(const char* range_title, argbColor range_color) {
+  nvtxEventAttributes_t eventAttrib = {0};
+  eventAttrib.version = NVTX_VERSION;
+  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+  eventAttrib.colorType = NVTX_COLOR_ARGB;
+  eventAttrib.color = static_cast<uint32_t>(range_color);
+  eventAttrib.message.ascii = range_title;
+  nvtxRangePushEx(&eventAttrib);
+}
 
-#endif  // TILEDARRAY_HAS_HIP
+/// exits the current profiling range by calling nvtxRangePopEx
+inline void range_pop() { nvtxRangePop(); }
+
+}  // namespace nvidia
+#endif  // #ifdef TILEDARRAY_HAS_DEVICE
+
+}  // namespace TiledArray
 
-#endif  // TILEDARRAY_EXTERNAL_HIP_H__INCLUDED
+#endif  // TILEDARRAY_EXTERNAL_DEVICE_H__INCLUDED
diff --git a/src/TiledArray/external/umpire.h b/src/TiledArray/external/umpire.h
index 71508226a4..e8d0d48632 100644
--- a/src/TiledArray/external/umpire.h
+++ b/src/TiledArray/external/umpire.h
@@ -71,7 +71,7 @@ std::mutex MutexLock<Tag>::mtx_;
 /// \tparam StaticLock a type providing static `lock()` and `unlock()` methods ;
 ///         defaults to NullLock which does not lock
 template <class T, class StaticLock = detail::NullLock>
-class umpire_allocator_impl {
+class umpire_based_allocator_impl {
  public:
   using value_type = T;
   using pointer = value_type*;
@@ -89,12 +89,12 @@ class umpire_allocator_impl {
       typename std::pointer_traits<pointer>::difference_type;
   using size_type = std::make_unsigned_t<difference_type>;
 
-  umpire_allocator_impl(umpire::Allocator* umpalloc) noexcept
+  umpire_based_allocator_impl(umpire::Allocator* umpalloc) noexcept
       : umpalloc_(umpalloc) {}
 
   template <class U>
-  umpire_allocator_impl(
-      const umpire_allocator_impl<U, StaticLock>& rhs) noexcept
+  umpire_based_allocator_impl(
+      const umpire_based_allocator_impl<U, StaticLock>& rhs) noexcept
       : umpalloc_(rhs.umpalloc_) {}
 
   /// allocates memory using umpire dynamic pool
@@ -140,17 +140,19 @@ class umpire_allocator_impl {
 
  private:
   umpire::Allocator* umpalloc_;
-};  // class umpire_allocator_impl
+};  // class umpire_based_allocator_impl
 
 template <class T1, class T2, class StaticLock>
-bool operator==(const umpire_allocator_impl<T1, StaticLock>& lhs,
-                const umpire_allocator_impl<T2, StaticLock>& rhs) noexcept {
+bool operator==(
+    const umpire_based_allocator_impl<T1, StaticLock>& lhs,
+    const umpire_based_allocator_impl<T2, StaticLock>& rhs) noexcept {
   return lhs.umpire_allocator() == rhs.umpire_allocator();
 }
 
 template <class T1, class T2, class StaticLock>
-bool operator!=(const umpire_allocator_impl<T1, StaticLock>& lhs,
-                const umpire_allocator_impl<T2, StaticLock>& rhs) noexcept {
+bool operator!=(
+    const umpire_based_allocator_impl<T1, StaticLock>& lhs,
+    const umpire_based_allocator_impl<T2, StaticLock>& rhs) noexcept {
   return !(lhs == rhs);
 }
 
@@ -195,23 +197,23 @@ namespace archive {
 
 template <class Archive, class T, class StaticLock>
 struct ArchiveLoadImpl<Archive,
-                       TiledArray::umpire_allocator_impl<T, StaticLock>> {
+                       TiledArray::umpire_based_allocator_impl<T, StaticLock>> {
   static inline void load(
       const Archive& ar,
-      TiledArray::umpire_allocator_impl<T, StaticLock>& allocator) {
+      TiledArray::umpire_based_allocator_impl<T, StaticLock>& allocator) {
     std::string allocator_name;
     ar& allocator_name;
-    allocator = TiledArray::umpire_allocator_impl<T, StaticLock>(
+    allocator = TiledArray::umpire_based_allocator_impl<T, StaticLock>(
         umpire::ResourceManager::getInstance().getAllocator(allocator_name));
   }
 };
 
 template <class Archive, class T, class StaticLock>
-struct ArchiveStoreImpl<Archive,
-                        TiledArray::umpire_allocator_impl<T, StaticLock>> {
+struct ArchiveStoreImpl<
+    Archive, TiledArray::umpire_based_allocator_impl<T, StaticLock>> {
   static inline void store(
       const Archive& ar,
-      const TiledArray::umpire_allocator_impl<T, StaticLock>& allocator) {
+      const TiledArray::umpire_based_allocator_impl<T, StaticLock>& allocator) {
     ar& allocator.umpire_allocator()->getName();
   }
 };
diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h
index 6c364be113..7f411eaeba 100644
--- a/src/TiledArray/fwd.h
+++ b/src/TiledArray/fwd.h
@@ -81,13 +81,14 @@ typedef Tensor<long> TensorL;
 typedef Tensor<std::complex<double>> TensorZ;
 typedef Tensor<std::complex<float>> TensorC;
 
-// CUDA tensor
-#ifdef TILEDARRAY_HAS_CUDA
-
-class cudaEnv;
+#ifdef TILEDARRAY_HAS_DEVICE
+namespace device {
+class Env;
+}
+using deviceEnv = device::Env;
 
 template <class T, class StaticLock, typename UmpireAllocatorAccessor>
-class cuda_allocator_impl;
+class umpire_based_allocator;
 
 template <typename T, typename A = std::allocator<T>>
 class default_init_allocator;
@@ -100,32 +101,32 @@ template <typename Tag = void>
 class MutexLock;
 }  // namespace detail
 
-/// pooled thread-safe CUDA UM allocator
+/// pooled thread-safe unified memory (UM) allocator for device computing
 template <typename T>
-using cuda_um_allocator =
-    default_init_allocator<T, cuda_allocator_impl<T, detail::MutexLock<cudaEnv>,
-                                                  detail::get_um_allocator>>;
+using device_um_allocator = default_init_allocator<
+    T, umpire_based_allocator<T, detail::MutexLock<deviceEnv>,
+                              detail::get_um_allocator>>;
 
-/// pooled thread-safe CUDA-based pinned host memory allocator
+/// pooled thread-safe pinned host memory allocator for device computing
 template <typename T>
-using cuda_pinned_allocator =
-    default_init_allocator<T,
-                           cuda_allocator_impl<T, detail::MutexLock<cudaEnv>,
-                                               detail::get_pinned_allocator>>;
+using device_pinned_allocator = default_init_allocator<
+    T, umpire_based_allocator<T, detail::MutexLock<deviceEnv>,
+                              detail::get_pinned_allocator>>;
 
-/// \brief a vector that lives in CUDA Unified Memory, with most operations
+/// \brief a vector that lives in UM, with most operations
 /// implemented on the CPU
 template <typename T>
-using cuda_um_btas_varray = ::btas::varray<T, TiledArray::cuda_um_allocator<T>>;
+using device_um_btas_varray =
+    ::btas::varray<T, TiledArray::device_um_allocator<T>>;
 
 /**
- * btas::Tensor with UM storage cuda_um_btas_varray
+ * btas::Tensor with UM storage device_um_btas_varray
  */
 template <typename T, typename Range = TiledArray::Range>
 using btasUMTensorVarray =
-    ::btas::Tensor<T, Range, TiledArray::cuda_um_btas_varray<T>>;
+    ::btas::Tensor<T, Range, TiledArray::device_um_btas_varray<T>>;
 
-#endif
+#endif  // TILEDARRAY_HAS_DEVICE
 
 template <typename>
 class Tile;
diff --git a/src/TiledArray/host/allocator.h b/src/TiledArray/host/allocator.h
index dbb8f53b55..a22613fb38 100644
--- a/src/TiledArray/host/allocator.h
+++ b/src/TiledArray/host/allocator.h
@@ -39,9 +39,9 @@ namespace TiledArray {
 /// pooled, thread-safe allocator for host memory
 template <class T>
 class host_allocator_impl
-    : public umpire_allocator_impl<T, detail::MutexLock<hostEnv>> {
+    : public umpire_based_allocator_impl<T, detail::MutexLock<hostEnv>> {
  public:
-  using base_type = umpire_allocator_impl<T, detail::MutexLock<hostEnv>>;
+  using base_type = umpire_based_allocator_impl<T, detail::MutexLock<hostEnv>>;
   using typename base_type::const_pointer;
   using typename base_type::const_reference;
   using typename base_type::pointer;
@@ -53,9 +53,8 @@ class host_allocator_impl
 
   template <class U>
   host_allocator_impl(const host_allocator_impl<U>& rhs) noexcept
-      : base_type(static_cast<
-                  const umpire_allocator_impl<U, detail::MutexLock<hostEnv>>&>(
-            rhs)) {}
+      : base_type(static_cast<const umpire_based_allocator_impl<
+                      U, detail::MutexLock<hostEnv>>&>(rhs)) {}
 
   template <typename T1, typename T2>
   friend bool operator==(const host_allocator_impl<T1>& lhs,
diff --git a/src/TiledArray/host/env.h b/src/TiledArray/host/env.h
index 5852cf6a20..1b3c4f277f 100644
--- a/src/TiledArray/host/env.h
+++ b/src/TiledArray/host/env.h
@@ -114,7 +114,7 @@ class hostEnv {
   /// @return an Umpire allocator that allocates from a
   ///         host memory pool
   /// @warning this is not a thread-safe allocator, should be only used when
-  ///          wrapped into umpire_allocator_impl
+  ///          wrapped into umpire_based_allocator_impl
   umpire::Allocator& host_allocator() { return host_allocator_; }
 
   // clang-format off
@@ -139,7 +139,7 @@ class hostEnv {
   World* world_;
 
   // allocates from a dynamic, size-limited host memory pool
-  // N.B. not thread safe, so must be wrapped into umpire_allocator_impl
+  // N.B. not thread safe, so must be wrapped into umpire_based_allocator_impl
   umpire::Allocator host_allocator_;
 
   inline static std::unique_ptr<hostEnv>& instance_accessor() {
diff --git a/src/TiledArray/reduce_task.h b/src/TiledArray/reduce_task.h
index 753ac5df58..36cd7cf93f 100644
--- a/src/TiledArray/reduce_task.h
+++ b/src/TiledArray/reduce_task.h
@@ -24,8 +24,8 @@
 #include <TiledArray/error.h>
 #include <TiledArray/external/madness.h>
 
-#ifdef TILEDARRAY_HAS_CUDA
-#include <TiledArray/cuda/cuda_task_fn.h>
+#ifdef TILEDARRAY_HAS_DEVICE
+#include <TiledArray/device/cuda_task_fn.h>
 #include <TiledArray/external/cuda.h>
 #include <TiledArray/tensor/type_traits.h>
 #include <TiledArray/util/time.h>
@@ -304,9 +304,10 @@ class ReduceTask {
 
     };  // class ReduceObject
 
-#ifdef TILEDARRAY_HAS_CUDA
+#ifdef TILEDARRAY_HAS_DEVICE
 
-    static void CUDART_CB cuda_reduceobject_delete_callback(void* userData) {
+    static void DEVICERT_CB
+    device_reduceobject_delete_callback(void* userData) {
       TA_ASSERT(!madness::is_madness_thread());
 
       const auto t0 = TiledArray::now();
@@ -338,11 +339,11 @@ class ReduceTask {
       world->taskq.add(destroy_vector, objects, TaskAttributes::hipri());
 
       const auto t1 = TiledArray::now();
-      TiledArray::detail::cuda_callback_duration_ns<0>() +=
+      TiledArray::detail::device_callback_duration_ns<0>() +=
           TiledArray::duration_in_ns(t0, t1);
     }
 
-    static void CUDART_CB cuda_dependency_dec_callback(void* userData) {
+    static void DEVICERT_CB device_dependency_dec_callback(void* userData) {
       TA_ASSERT(!madness::is_madness_thread());
 
       const auto t0 = TiledArray::now();
@@ -361,12 +362,12 @@ class ReduceTask {
       //                           " call 2\n";
 
       const auto t1 = TiledArray::now();
-      TiledArray::detail::cuda_callback_duration_ns<1>() +=
+      TiledArray::detail::device_callback_duration_ns<1>() +=
           TiledArray::duration_in_ns(t0, t1);
     }
 
-    static void CUDART_CB
-    cuda_dependency_dec_reduceobject_delete_callback(void* userData) {
+    static void DEVICERT_CB
+    device_dependency_dec_reduceobject_delete_callback(void* userData) {
       TA_ASSERT(!madness::is_madness_thread());
 
       const auto t0 = TiledArray::now();
@@ -399,11 +400,11 @@ class ReduceTask {
       delete objects;
 
       const auto t1 = TiledArray::now();
-      TiledArray::detail::cuda_callback_duration_ns<2>() +=
+      TiledArray::detail::device_callback_duration_ns<2>() +=
           TiledArray::duration_in_ns(t0, t1);
     }
 
-    static void CUDART_CB cuda_readyresult_reset_callback(void* userData) {
+    static void DEVICERT_CB device_readyresult_reset_callback(void* userData) {
       TA_ASSERT(!madness::is_madness_thread());
 
       const auto t0 = TiledArray::now();
@@ -429,7 +430,7 @@ class ReduceTask {
       world->taskq.add(reset, objects, TaskAttributes::hipri());
 
       const auto t1 = TiledArray::now();
-      TiledArray::detail::cuda_callback_duration_ns<3>() +=
+      TiledArray::detail::device_callback_duration_ns<3>() +=
           TiledArray::duration_in_ns(t0, t1);
     }
 
@@ -459,8 +460,8 @@ class ReduceTask {
           op_(*result, ready_object->arg());
 
           // cleanup the argument
-#ifdef TILEDARRAY_HAS_CUDA
-          auto stream_ptr = tls_cudastream_accessor();
+#ifdef TILEDARRAY_HAS_DEVICE
+          auto stream_ptr = device::tls_stream_accessor();
 
           /// non-CUDA op
           if (stream_ptr == nullptr) {
@@ -471,12 +472,12 @@ class ReduceTask {
             (*callback_object)[0] = &world_;
             (*callback_object)[1] = this;
             (*callback_object)[2] = ready_object;
-            CudaSafeCall(
-                cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
-            CudaSafeCall(cudaLaunchHostFunc(
-                *stream_ptr, cuda_dependency_dec_reduceobject_delete_callback,
+            DeviceSafeCall(
+                device::setDevice(deviceEnv::instance()->current_device_id()));
+            DeviceSafeCall(cudaLaunchHostFunc(
+                *stream_ptr, device_dependency_dec_reduceobject_delete_callback,
                 callback_object));
-            synchronize_stream(nullptr);
+            device::synchronize_stream(nullptr);
             //                std::cout << std::to_string(world().rank()) + "
             //                add 3\n";
           }
@@ -494,8 +495,8 @@ class ReduceTask {
           op_(*result, *ready_result);
 
           // cleanup the result
-#ifdef TILEDARRAY_HAS_CUDA
-          auto stream_ptr = tls_cudastream_accessor();
+#ifdef TILEDARRAY_HAS_DEVICE
+          auto stream_ptr = device::tls_stream_accessor();
           if (stream_ptr == nullptr) {
             ready_result.reset();
           } else {
@@ -504,11 +505,12 @@ class ReduceTask {
             auto callback_object = new std::vector<void*>(2);
             (*callback_object)[0] = &world_;
             (*callback_object)[1] = ready_result_heap;
-            CudaSafeCall(
-                cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
-            CudaSafeCall(cudaLaunchHostFunc(
-                *stream_ptr, cuda_readyresult_reset_callback, callback_object));
-            synchronize_stream(nullptr);
+            DeviceSafeCall(
+                device::setDevice(deviceEnv::instance()->current_device_id()));
+            DeviceSafeCall(cudaLaunchHostFunc(*stream_ptr,
+                                              device_readyresult_reset_callback,
+                                              callback_object));
+            device::synchronize_stream(nullptr);
             //                std::cout << std::to_string(world().rank()) + "
             //                add 4\n";
           }
@@ -534,19 +536,19 @@ class ReduceTask {
       op_(*result, object->arg());
 
       // Cleanup the argument
-#ifdef TILEDARRAY_HAS_CUDA
-      auto stream_ptr = tls_cudastream_accessor();
+#ifdef TILEDARRAY_HAS_DEVICE
+      auto stream_ptr = device::tls_stream_accessor();
       if (stream_ptr == nullptr) {
         ReduceObject::destroy(object);
       } else {
         auto callback_object = new std::vector<void*>(2);
         (*callback_object)[0] = &world_;
         (*callback_object)[1] = const_cast<ReduceObject*>(object);
-        CudaSafeCall(
-            cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
-        CudaSafeCall(cudaLaunchHostFunc(
-            *stream_ptr, cuda_reduceobject_delete_callback, callback_object));
-        synchronize_stream(nullptr);
+        DeviceSafeCall(
+            device::setDevice(deviceEnv::instance()->current_device_id()));
+        DeviceSafeCall(cudaLaunchHostFunc(
+            *stream_ptr, device_reduceobject_delete_callback, callback_object));
+        device::synchronize_stream(nullptr);
         //            std::cout << std::to_string(world().rank()) + " add 1\n";
       }
 #else
@@ -557,16 +559,16 @@ class ReduceTask {
 
       // Decrement the dependency counter for the argument. This must
       // be done after the reduce call to avoid a race condition.
-#ifdef TILEDARRAY_HAS_CUDA
+#ifdef TILEDARRAY_HAS_DEVICE
       if (stream_ptr == nullptr) {
         this->dec();
       } else {
         auto callback_object2 = new std::vector<void*>(1);
         (*callback_object2)[0] = this;
-        CudaSafeCall(
-            cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
-        CudaSafeCall(cudaLaunchHostFunc(
-            *stream_ptr, cuda_dependency_dec_callback, callback_object2));
+        DeviceSafeCall(
+            device::setDevice(deviceEnv::instance()->current_device_id()));
+        DeviceSafeCall(device::launchHostFunc(
+            *stream_ptr, device_dependency_dec_callback, callback_object2));
         //            std::cout << std::to_string(world().rank()) + " add 2\n";
       }
 #else
@@ -585,8 +587,8 @@ class ReduceTask {
       op_(*result, object2->arg());
 
       // Cleanup arguments
-#ifdef TILEDARRAY_HAS_CUDA
-      auto stream_ptr = tls_cudastream_accessor();
+#ifdef TILEDARRAY_HAS_DEVICE
+      auto stream_ptr = device::tls_stream_accessor();
       if (stream_ptr == nullptr) {
         ReduceObject::destroy(object1);
         ReduceObject::destroy(object2);
@@ -595,11 +597,12 @@ class ReduceTask {
         (*callback_object1)[0] = &world_;
         (*callback_object1)[1] = const_cast<ReduceObject*>(object1);
         (*callback_object1)[2] = const_cast<ReduceObject*>(object2);
-        CudaSafeCall(
-            cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
-        CudaSafeCall(cudaLaunchHostFunc(
-            *stream_ptr, cuda_reduceobject_delete_callback, callback_object1));
-        synchronize_stream(nullptr);
+        DeviceSafeCall(
+            device::setDevice(deviceEnv::instance()->current_device_id()));
+        DeviceSafeCall(cudaLaunchHostFunc(*stream_ptr,
+                                          device_reduceobject_delete_callback,
+                                          callback_object1));
+        device::synchronize_stream(nullptr);
         //            std::cout << std::to_string(world().rank()) + " add 1\n";
       }
 #else
@@ -612,7 +615,7 @@ class ReduceTask {
 
       // Decrement the dependency counter for the two arguments. This
       // must be done after the reduce call to avoid a race condition.
-#ifdef TILEDARRAY_HAS_CUDA
+#ifdef TILEDARRAY_HAS_DEVICE
       if (stream_ptr == nullptr) {
         this->dec();
         this->dec();
@@ -620,10 +623,10 @@ class ReduceTask {
         auto callback_object2 = new std::vector<void*>(2);
         (*callback_object2)[0] = this;
         (*callback_object2)[1] = this;
-        CudaSafeCall(
-            cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
-        CudaSafeCall(cudaLaunchHostFunc(
-            *stream_ptr, cuda_dependency_dec_callback, callback_object2));
+        DeviceSafeCall(
+            device::setDevice(deviceEnv::instance()->current_device_id()));
+        DeviceSafeCall(cudaLaunchHostFunc(
+            *stream_ptr, device_dependency_dec_callback, callback_object2));
         //            std::cout << std::to_string(world().rank()) + " add 2\n";
       }
 
@@ -633,13 +636,13 @@ class ReduceTask {
 #endif
     }
 
-#ifdef TILEDARRAY_HAS_CUDA
+#ifdef TILEDARRAY_HAS_DEVICE
     template <typename Result = result_type>
-    std::enable_if_t<detail::is_cuda_tile_v<Result>, void> internal_run(
+    std::enable_if_t<detail::is_device_tile_v<Result>, void> internal_run(
         const madness::TaskThreadEnv&) {
       TA_ASSERT(ready_result_);
 
-      auto post_result = madness::add_cuda_task(world_, op_, *ready_result_);
+      auto post_result = madness::add_device_task(world_, op_, *ready_result_);
       result_.set(post_result);
 
       if (callback_) {
@@ -648,7 +651,7 @@ class ReduceTask {
     }
 
     template <typename Result = result_type>
-    std::enable_if_t<!detail::is_cuda_tile_v<Result>, void>
+    std::enable_if_t<!detail::is_device_tile_v<Result>, void>
 #else
     void
 #endif
diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h
index 2903e5e7f7..413b784d22 100644
--- a/src/TiledArray/tensor/type_traits.h
+++ b/src/TiledArray/tensor/type_traits.h
@@ -317,17 +317,17 @@ constexpr const bool is_reduce_op_v =
 /// detect cuda tile
 #ifdef TILEDARRAY_HAS_CUDA
 template <typename T>
-struct is_cuda_tile : public std::false_type {};
+struct is_device_tile : public std::false_type {};
 
 template <typename T>
-struct is_cuda_tile<Tile<T>> : public is_cuda_tile<T> {};
+struct is_device_tile<Tile<T>> : public is_device_tile<T> {};
 
 template <typename T, typename Op>
-struct is_cuda_tile<LazyArrayTile<T, Op>>
-    : public is_cuda_tile<typename LazyArrayTile<T, Op>::eval_type> {};
+struct is_device_tile<LazyArrayTile<T, Op>>
+    : public is_device_tile<typename LazyArrayTile<T, Op>::eval_type> {};
 
 template <typename T>
-static constexpr const auto is_cuda_tile_v = is_cuda_tile<T>::value;
+static constexpr const auto is_device_tile_v = is_device_tile<T>::value;
 
 #endif
 
diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp
index 47ccc00d8e..51cfc02825 100644
--- a/src/TiledArray/tiledarray.cpp
+++ b/src/TiledArray/tiledarray.cpp
@@ -6,9 +6,11 @@
 
 #include <madness/world/safempi.h>
 
+#ifdef TILEDARRAY_HAS_DEVICE
 #ifdef TILEDARRAY_HAS_CUDA
-#include <TiledArray/cuda/cublas.h>
-#include <TiledArray/external/cuda.h>
+#include <TiledArray/device/cublas.h>
+#endif
+#include <TiledArray/external/device.h>
 #include <librett.h>
 #endif
 
@@ -22,28 +24,32 @@
 namespace TiledArray {
 namespace {
 
-#ifdef TILEDARRAY_HAS_CUDA
+#ifdef TILEDARRAY_HAS_DEVICE
 /// initialize cuda environment
-inline void cuda_initialize() {
-  /// initialize cudaGlobal
-  cudaEnv::instance();
-  //
+inline void device_initialize() {
+  /// initialize deviceEnv
+  deviceEnv::instance();
+#if defined(TILEDARRAY_HAS_CUDA)
   cuBLASHandlePool::handle();
+#endif
   // initialize LibreTT
   librettInitialize();
 }
 
 /// finalize cuda environment
-inline void cuda_finalize() {
-  CudaSafeCall(cudaDeviceSynchronize());
+inline void device_finalize() {
+  DeviceSafeCall(device::deviceSynchronize());
   librettFinalize();
+#if defined(TILEDARRAY_HAS_CUDA)
   cublasDestroy(cuBLASHandlePool::handle());
   delete &cuBLASHandlePool::handle();
-  // although TA::cudaEnv is a singleton, must explicitly delete it so
-  // that CUDA runtime is not finalized before the cudaEnv dtor is called
-  cudaEnv::instance().reset(nullptr);
-}
 #endif
+  // although TA::deviceEnv is a singleton, must explicitly delete it so
+  // that the device runtime is not finalized before the deviceEnv dtor is
+  // called
+  deviceEnv::instance().reset(nullptr);
+}
+#endif  // TILEDARRAY_HAS_DEVICE
 
 inline bool& initialized_madworld_accessor() {
   static bool flag = false;
@@ -102,8 +108,8 @@ TiledArray::World& TiledArray::initialize(int& argc, char**& argv,
                               ? madness::initialize(argc, argv, comm, quiet)
                               : *madness::World::find_instance(comm);
     TiledArray::set_default_world(default_world);
-#ifdef TILEDARRAY_HAS_CUDA
-    TiledArray::cuda_initialize();
+#ifdef TILEDARRAY_HAS_DEVICE
+    TiledArray::device_initialize();
 #endif
     TiledArray::max_threads = TiledArray::get_num_threads();
     TiledArray::set_num_threads(1);
@@ -164,8 +170,8 @@ void TiledArray::finalize() {
   TiledArray::set_num_threads(TiledArray::max_threads);
   TiledArray::get_default_world().gop.fence();  // this should ensure no pending
                                                 // tasks using cuda allocators
-#ifdef TILEDARRAY_HAS_CUDA
-  TiledArray::cuda_finalize();
+#ifdef TILEDARRAY_HAS_DEVICE
+  TiledArray::device_finalize();
 #endif
   if (initialized_madworld()) {
     madness::finalize();
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 217c522018..d17dd8345b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -102,8 +102,12 @@ set(ta_test_src_files  ta_test.cpp
     cp.cpp
 )
 
+if(CUDA_FOUND OR HIP_FOUND)
+  list(APPEND ta_test_src_files librett.cpp)
+endif()
+
 if(CUDA_FOUND)
-    list(APPEND ta_test_src_files librett.cpp expressions_cuda_um.cpp tensor_um.cpp)
+    list(APPEND ta_test_src_files expressions_cuda_um.cpp tensor_um.cpp)
 endif()
 
 # if using C++20 must use Boost 1.74 or later:
diff --git a/tests/expressions_cuda_um.cpp b/tests/expressions_cuda_um.cpp
index a17b749789..29408c27c2 100644
--- a/tests/expressions_cuda_um.cpp
+++ b/tests/expressions_cuda_um.cpp
@@ -27,7 +27,7 @@
 
 #ifdef TILEDARRAY_HAS_CUDA
 
-#include <TiledArray/cuda/btas_um_tensor.h>
+#include <TiledArray/device/btas_um_tensor.h>
 #include <range_fixture.h>
 #include <tiledarray.h>
 #include "unit_test_config.h"
@@ -76,8 +76,8 @@ struct UMExpressionsFixture : public TiledRangeFixture {
 
   static UMTensor permute_fn(const madness::Future<UMTensor>& tensor_f,
                              const Permutation& perm) {
-    return madness::add_cuda_task(*GlobalFixture::world, permute_task, tensor_f,
-                                  perm)
+    return madness::add_device_task(*GlobalFixture::world, permute_task,
+                                    tensor_f, perm)
         .get();
   }
 
diff --git a/tests/librett.cpp b/tests/librett.cpp
index 3785071071..ced23239fd 100644
--- a/tests/librett.cpp
+++ b/tests/librett.cpp
@@ -22,9 +22,9 @@
 
 #include <TiledArray/config.h>
 
-#ifdef TILEDARRAY_HAS_CUDA
+#ifdef TILEDARRAY_HAS_DEVICE
 
-#include <TiledArray/cuda/btas_um_tensor.h>
+#include <TiledArray/device/btas_um_tensor.h>
 #include "unit_test_config.h"
 
 struct LibreTTFixture {
@@ -69,7 +69,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) {
   TiledArray::permutation_to_col_major(perm);
 
   librettHandle plan;
-  auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0);
+  auto stream = TiledArray::deviceEnv::instance()->stream(0);
   librettResult status;
 
   status =
@@ -118,7 +118,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) {
   cudaMemcpy(a_device, a_host, A * B * sizeof(int), cudaMemcpyHostToDevice);
 
   librettHandle plan;
-  auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0);
+  auto stream = TiledArray::deviceEnv::instance()->stream(0);
   librettResult status;
 
   std::vector<int> extent({B, A});
@@ -177,7 +177,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) {
   // b(j,i,k) = a(i,j,k)
 
   librettHandle plan;
-  auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0);
+  auto stream = TiledArray::deviceEnv::instance()->stream(0);
   librettResult status;
 
   std::vector<int> extent3{int(A), int(B), int(C)};
@@ -240,7 +240,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) {
   // b(j,i,k) = a(i,j,k)
 
   librettHandle plan;
-  auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0);
+  auto stream = TiledArray::deviceEnv::instance()->stream(0);
   librettResult status;
 
   std::vector<int> extent({A, B, C});
@@ -297,7 +297,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) {
   }
 
   librettHandle plan;
-  auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0);
+  auto stream = TiledArray::deviceEnv::instance()->stream(0);
   librettResult status;
 
   std::vector<int> extent({A, A});
@@ -347,7 +347,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) {
   }
 
   librettHandle plan;
-  auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0);
+  auto stream = TiledArray::deviceEnv::instance()->stream(0);
   librettResult status;
 
   std::vector<int> extent({B, A});
@@ -397,7 +397,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) {
   }
 
   librettHandle plan;
-  auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0);
+  auto stream = TiledArray::deviceEnv::instance()->stream(0);
   librettResult status;
 
   // b(k,i,j) = a(i,j,k)
diff --git a/tests/tensor_um.cpp b/tests/tensor_um.cpp
index 33efbfd7d4..d860b7c813 100644
--- a/tests/tensor_um.cpp
+++ b/tests/tensor_um.cpp
@@ -18,7 +18,7 @@
  *  Chong Peng on 9/19/18.
  */
 
-#include <TiledArray/cuda/btas_um_tensor.h>
+#include <TiledArray/device/btas_um_tensor.h>
 #include "global_fixture.h"
 #include "unit_test_config.h"