Merge branch 'master' into cplx_benchmarks_fix

codeplaysoftware · Jan 3, 2024 · 4f1a5ae · 4f1a5ae
2 parents a02ce2e + 2ea49db
commit 4f1a5ae
Show file tree

Hide file tree

Showing 9 changed files with 87 additions and 87 deletions.
diff --git a/include/operations/extension/transpose.h b/include/operations/extension/transpose.h
@@ -80,11 +80,6 @@ class Transpose {
   static constexpr const index_t inner_tile_size_ = wg_size / Tile_size;
   static constexpr const index_t inner_tile_count_ =
       Tile_size / inner_tile_size_;
-  // Minimum number of Tile-mutliple rows & columns to cover the matrices
-  index_t M_pad_;
-  index_t N_pad_;
-  // Total size of Tile-mutliple covering matrix
-  index_t size_pad_;
   // Batch size when using batched transpose
   index_t batch_size_;
   // Number of contiguous elements to be used in local memory to avoid bank
@@ -115,9 +110,6 @@ class Transpose {
         stride_a_(stride_a),
         stride_at_(stride_at),
         inc_at_(inc_at),
-        M_pad_(tile_count_m_ * Tile_size),
-        N_pad_(tile_count_n_ * Tile_size),
-        size_pad_(M_pad_ * N_pad_),
         batch_size_(batch_size) {}
 
   index_t get_size() const;
@@ -209,11 +201,6 @@ class TransposeAdd {
   static constexpr const index_t inner_tile_size_ = wg_size / Tile_size;
   static constexpr const index_t inner_tile_count_ =
       Tile_size / inner_tile_size_;
-  // Minimum number of Tile-mutliple rows & columns to cover the output matrix
-  index_t M_pad_;
-  index_t N_pad_;
-  // Total size of Tile-mutliple covering matrix
-  index_t size_pad_;
   // Batch size when using batched transpose
   index_t batch_size_;
   // Number of contiguous elements to be used in local memory to avoid bank
@@ -246,9 +233,6 @@ class TransposeAdd {
         tile_count_m_((M_ - 1) / Tile_size + 1),
         tile_count_n_((N_ - 1) / Tile_size + 1),
         tile_count_total_(tile_count_m_ * tile_count_n_),
-        M_pad_(tile_count_m_ * Tile_size),
-        N_pad_(tile_count_n_ * Tile_size),
-        size_pad_(M_pad_ * N_pad_),
         batch_size_(batch_size) {}
 
   index_t get_size() const;

diff --git a/src/operations/extension/transpose.hpp b/src/operations/extension/transpose.hpp
@@ -55,7 +55,7 @@ Transpose<in_place, Tile_size, wg_size, cl_size, local_memory, in_t, out_t,
           element_t>::get_size() const {
   // Smallest TileSize square-multiple containing input/output matrices times
   // batch_size
-  return (size_pad_ * batch_size_);
+  return (tile_count_total_ * Tile_size * Tile_size * batch_size_);
 }
 
 template <bool in_place, int Tile_size, int wg_size, int cl_size,
@@ -254,7 +254,7 @@ PORTBLAS_INLINE typename in1_t::index_t
 TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
              in2_t, out_t, element_t>::get_size() const {
   // Smallest TileSize square-multiple containing input/output matrices
-  return (size_pad_ * batch_size_);
+  return (tile_count_total_ * Tile_size * Tile_size * batch_size_);
 }
 
 template <bool both_trans, int Tile_size, int wg_size, int cl_size,
@@ -276,10 +276,10 @@ TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
  * @param in_a_idx [output] the input A global-memory index
  * @param in_b_idx [output] the input B global-memory index
  * @param out_idx [output] the output C global-memory index
- * @param i [output] the global row-index (A & B when both_trans -> [0,N_], B &
- *C otherwise -> [0,M_])
- * @param j [output] the global col-index (A & B when both_trans -> [0,M_], B &
- *C otherwise -> [0,N_])
+ * @param i [output] the global row-index (A & B when both_trans -> [0,N_], B
+ *& C otherwise -> [0,M_])
+ * @param j [output] the global col-index (A & B when both_trans -> [0,M_], B
+ *& C otherwise -> [0,N_])
  */
 template <bool both_trans, int Tile_size, int wg_size, int cl_size,
           bool local_memory, typename in1_t, typename in2_t, typename out_t,
@@ -461,7 +461,8 @@ TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
       // Compute & Copy sum/scaled input to local memory (before transpose)
       for (index_t l = 0; l < inner_tile_count_; l++) {
         if (j_block_start + jl + l * inner_tile_size_ < M_) {
-          // Compute & Copy sum/scaled input to local memory (before transpose)
+          // Compute & Copy sum/scaled input to local memory (before
+          // transpose)
           local[in_local_id +
                 l * (get_non_bank_conflict_line_size() + 1) *
                     (inner_tile_size_ / get_num_tiles_per_line())] =
@@ -490,7 +491,8 @@ TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
     if (j_block_start + il < N_) {
       for (index_t l = 0; l < inner_tile_count_; l++) {
         if (i_block_start + jl + l * inner_tile_size_ < M_) {
-          // Compute & Copy sum/scaled input to local memory (before transpose)
+          // Compute & Copy sum/scaled input to local memory (before
+          // transpose)
           local[in_local_id +
                 l * (get_non_bank_conflict_line_size() + 1) *
                     (inner_tile_size_ / get_num_tiles_per_line())] =
@@ -501,8 +503,8 @@ TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
 
     id.barrier(cl::sycl::access::fence_space::local_space);
 
-    // Transposed copy of previous output from local memory and scaled addition
-    // with 2nd non transposed matrix B
+    // Transposed copy of previous output from local memory and scaled
+    // addition with 2nd non transposed matrix B
     if (i_block_start + il < M_) {
       for (index_t l = 0; l < inner_tile_count_; l++) {
         if (j_block_start + jl + l * inner_tile_size_ < N_) {

diff --git a/test/unittest/extension/omatadd_batched_test.cpp b/test/unittest/extension/omatadd_batched_test.cpp
@@ -160,15 +160,15 @@ const auto combi =
                        ::testing::Values<char>('n', 't'),  // trans_b
                        ::testing::Values<index_t>(64, 129, 255),  // m
                        ::testing::Values<index_t>(64, 129, 255),  // n
-                       ::testing::Values<scalar_t>(2.5),            // alpha
-                       ::testing::Values<scalar_t>(3.5),            // beta
+                       ::testing::Values<scalar_t>(2.5),          // alpha
+                       ::testing::Values<scalar_t>(3.5),          // beta
                        ::testing::Values<index_t>(1, 2),          // lda_mul
                        ::testing::Values<index_t>(1, 2),          // ldb_mul
-                       ::testing::Values<index_t>(1, 2, 3),       // ldc_mul
+                       ::testing::Values<index_t>(3),             // ldc_mul
                        ::testing::Values<index_t>(1, 3),          // stride_a_m
                        ::testing::Values<index_t>(1, 3),          // stride_b_m
                        ::testing::Values<index_t>(1, 3),          // stride_c_m
-                       ::testing::Values<index_t>(2, 3));         // batch_size
+                       ::testing::Values<index_t>(3));            // batch_size
 #endif
 
 template <class T>

diff --git a/test/unittest/extension/omatadd_test.cpp b/test/unittest/extension/omatadd_test.cpp
@@ -27,8 +27,8 @@
 #include "extension_reference.hpp"
 
 template <typename scalar_t>
-using combination_t = std::tuple<std::string, char, char, index_t, index_t, scalar_t,
-                                 scalar_t, index_t, index_t, index_t>;
+using combination_t = std::tuple<std::string, char, char, index_t, index_t,
+                                 scalar_t, scalar_t, index_t, index_t, index_t>;
 
 template <typename scalar_t, helper::AllocType mem_alloc>
 void run_test(const combination_t<scalar_t> combi) {
@@ -37,8 +37,8 @@ void run_test(const combination_t<scalar_t> combi) {
   index_t m, n, ld_a_mul, ld_b_mul, ld_c_mul;
   scalar_t alpha, beta;
 
-  std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, ld_c_mul) =
-      combi;
+  std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul,
+           ld_c_mul) = combi;
 
   auto q = make_queue();
   blas::SB_Handle sb_handle(q);
@@ -70,12 +70,16 @@ void run_test(const combination_t<scalar_t> combi) {
   auto m_b_gpu = helper::allocate<mem_alloc, scalar_t>(size_m_b, q);
   auto m_c_gpu = helper::allocate<mem_alloc, scalar_t>(size_m_c, q);
 
-  auto copy_m_a = helper::copy_to_device<scalar_t>(q, A.data(), m_a_gpu, size_m_a);
-  auto copy_m_b = helper::copy_to_device<scalar_t>(q, B.data(), m_b_gpu, size_m_b);
-  auto copy_m_c = helper::copy_to_device<scalar_t>(q, C.data(), m_c_gpu, size_m_c);
+  auto copy_m_a =
+      helper::copy_to_device<scalar_t>(q, A.data(), m_a_gpu, size_m_a);
+  auto copy_m_b =
+      helper::copy_to_device<scalar_t>(q, B.data(), m_b_gpu, size_m_b);
+  auto copy_m_c =
+      helper::copy_to_device<scalar_t>(q, C.data(), m_c_gpu, size_m_c);
 
-  auto omatadd_event = blas::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha, m_a_gpu, lda, beta,
-                 m_b_gpu, ldb, m_c_gpu, ldc, {copy_m_a, copy_m_b, copy_m_c});
+  auto omatadd_event = blas::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha,
+                                      m_a_gpu, lda, beta, m_b_gpu, ldb, m_c_gpu,
+                                      ldc, {copy_m_a, copy_m_b, copy_m_c});
   sb_handle.wait(omatadd_event);
 
   auto event = blas::helper::copy_to_host<scalar_t>(
@@ -98,8 +102,8 @@ void run_test(const combination_t<scalar_t> combi) {
   index_t m, n, ld_a_mul, ld_b_mul, ld_c_mul;
   scalar_t alpha, beta;
 
-  std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, ld_c_mul) =
-      combi;
+  std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul,
+           ld_c_mul) = combi;
 
   if (alloc == "usm") {
 #ifdef SB_ENABLE_USM
@@ -127,9 +131,9 @@ const auto combi =
 #else
 template <typename scalar_t>
 const auto combi =
-    ::testing::Combine(::testing::Values("usm", "buf"),        // allocation type
-                       ::testing::Values<char>('n', 't'),         // trans_a
-                       ::testing::Values<char>('n', 't'),         // trans_b
+    ::testing::Combine(::testing::Values("usm", "buf"),    // allocation type
+                       ::testing::Values<char>('n', 't'),  // trans_a
+                       ::testing::Values<char>('n', 't'),  // trans_b
                        ::testing::Values<index_t>(64, 129, 255),  // m
                        ::testing::Values<index_t>(64, 129, 255),  // n
                        ::testing::Values<scalar_t>(0, 1, 2),      // alpha
@@ -146,8 +150,8 @@ static std::string generate_name(
   char trans_a, trans_b;
   index_t m, n, lda_mul, ldb_mul, ldc_mul;
   T alpha, beta;
-  BLAS_GENERATE_NAME(info.param, alloc, trans_a, trans_b, m, n, alpha, beta, lda_mul,
-                     ldb_mul, ldc_mul);
+  BLAS_GENERATE_NAME(info.param, alloc, trans_a, trans_b, m, n, alpha, beta,
+                     lda_mul, ldb_mul, ldc_mul);
 }
 
 BLAS_REGISTER_TEST_ALL(OmatAdd, combination_t, combi, generate_name);
diff --git a/test/unittest/extension/omatcopy2_test.cpp b/test/unittest/extension/omatcopy2_test.cpp
@@ -27,8 +27,8 @@
 #include "extension_reference.hpp"
 
 template <typename scalar_t>
-using combination_t = std::tuple<std::string, char, index_t, index_t, scalar_t, index_t,
-                                 index_t, index_t, index_t>;
+using combination_t = std::tuple<std::string, char, index_t, index_t, scalar_t,
+                                 index_t, index_t, index_t, index_t>;
 
 template <typename scalar_t, helper::AllocType mem_alloc>
 void run_test(const combination_t<scalar_t> combi) {
@@ -37,7 +37,8 @@ void run_test(const combination_t<scalar_t> combi) {
   index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m;
   scalar_t alpha;
 
-  std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = combi;
+  std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) =
+      combi;
 
   // Leading dimensions are computed as multiples of the minimum value specified
   // in the oneMKL documentation at :
@@ -77,8 +78,9 @@ void run_test(const combination_t<scalar_t> combi) {
   auto copy_out =
       helper::copy_to_device<scalar_t>(q, B.data(), matrix_out, m_b_size);
 
-  auto omatcopy2_event = blas::_omatcopy2(sb_handle, trans, m, n, alpha, matrix_in, ld_in, inc_in,
-                   matrix_out, ld_out, inc_out, {copy_in, copy_out});
+  auto omatcopy2_event =
+      blas::_omatcopy2(sb_handle, trans, m, n, alpha, matrix_in, ld_in, inc_in,
+                       matrix_out, ld_out, inc_out, {copy_in, copy_out});
 
   sb_handle.wait(omatcopy2_event);
 
@@ -101,7 +103,8 @@ void run_test(const combination_t<scalar_t> combi) {
   index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m;
   scalar_t alpha;
 
-  std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = combi;
+  std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) =
+      combi;
 
   if (alloc == "usm") {
 #ifdef SB_ENABLE_USM
@@ -129,8 +132,8 @@ const auto combi =
 #else
 template <typename scalar_t>
 const auto combi =
-    ::testing::Combine(::testing::Values("usm", "buf"),        // allocation type
-                       ::testing::Values<char>('n', 't'),         // trans
+    ::testing::Combine(::testing::Values("usm", "buf"),    // allocation type
+                       ::testing::Values<char>('n', 't'),  // trans
                        ::testing::Values<index_t>(64, 129, 255),  // m
                        ::testing::Values<index_t>(64, 129, 255),  // n
                        ::testing::Values<scalar_t>(0, 2),         // alpha
@@ -147,8 +150,8 @@ static std::string generate_name(
   char trans;
   index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m;
   T alpha;
-  BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out,
-                     ld_out_m);
+  BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, inc_in, ld_in_m,
+                     inc_out, ld_out_m);
 }
 
 BLAS_REGISTER_TEST_ALL(OmatCopy2, combination_t, combi, generate_name);
diff --git a/test/unittest/extension/omatcopy_batched_test.cpp b/test/unittest/extension/omatcopy_batched_test.cpp
@@ -121,7 +121,7 @@ const auto combi =
                        ::testing::Values<char>('n', 't'),              // trans
                        ::testing::Values<index_t>(1024, 4050, 16380),  // m
                        ::testing::Values<index_t>(1024, 4050, 16380),  // n
-                       ::testing::Values<scalar_t>(0, 1.05, -20.01),   // alpha
+                       ::testing::Values<scalar_t>(1.05, -20.01),      // alpha
                        ::testing::Values<index_t>(3, 5),     // ld_in_m
                        ::testing::Values<index_t>(3, 5),     // ld_out_m
                        ::testing::Values<index_t>(5, 10),    // stride_in_m
@@ -134,12 +134,12 @@ const auto combi =
                        ::testing::Values<char>('n', 't'),         // trans
                        ::testing::Values<index_t>(64, 129, 255),  // m
                        ::testing::Values<index_t>(64, 129, 255),  // n
-                       ::testing::Values<scalar_t>(0, 2.5),         // alpha
-                       ::testing::Values<index_t>(1, 2, 3),       // ld_in_m
-                       ::testing::Values<index_t>(1, 2, 3),       // ld_out_m
+                       ::testing::Values<scalar_t>(2.5),          // alpha
+                       ::testing::Values<index_t>(1, 3),          // ld_in_m
+                       ::testing::Values<index_t>(1, 3),          // ld_out_m
                        ::testing::Values<index_t>(1, 3),          // stride_in_m
-                       ::testing::Values<index_t>(1, 3),      // stride_out_m
-                       ::testing::Values<index_t>(1, 2, 5));  // batch_size
+                       ::testing::Values<index_t>(1, 3),   // stride_out_m
+                       ::testing::Values<index_t>(1, 5));  // batch_size
 #endif
 
 template <class T>

diff --git a/test/unittest/extension/omatcopy_test.cpp b/test/unittest/extension/omatcopy_test.cpp
@@ -69,8 +69,9 @@ void run_test(const combination_t<scalar_t> combi) {
   auto copy_out =
       helper::copy_to_device<scalar_t>(q, B.data(), matrix_out, size_b);
 
-  auto omatcopy_event = blas::_omatcopy(sb_handle, trans, m, n, alpha, matrix_in, ld_in, matrix_out,
-                  ld_out, {copy_in, copy_out});
+  auto omatcopy_event =
+      blas::_omatcopy(sb_handle, trans, m, n, alpha, matrix_in, ld_in,
+                      matrix_out, ld_out, {copy_in, copy_out});
 
   sb_handle.wait(omatcopy_event);
 
@@ -106,7 +107,6 @@ void run_test(const combination_t<scalar_t> combi) {
   }
 }
 
-
 #ifdef STRESS_TESTING
 template <typename scalar_t>
 const auto combi =
@@ -120,11 +120,11 @@ const auto combi =
 #else
 template <typename scalar_t>
 const auto combi =
-    ::testing::Combine(::testing::Values("usm", "buf"),        // allocation type
-                       ::testing::Values<char>('n', 't'),         // trans
+    ::testing::Combine(::testing::Values("usm", "buf"),    // allocation type
+                       ::testing::Values<char>('n', 't'),  // trans
                        ::testing::Values<index_t>(64, 129, 255),  // m
                        ::testing::Values<index_t>(64, 129, 255),  // n
-                       ::testing::Values<scalar_t>(0, 1, 2),   // alpha
+                       ::testing::Values<scalar_t>(0, 1, 2),      // alpha
                        ::testing::Values<index_t>(1, 2, 3),       // ld_in_m
                        ::testing::Values<index_t>(1, 2, 3));      // ld_out_m
 #endif