Skip to content

Commit

Permalink
Merge branch 'master' into cplx_benchmarks_fix
Browse files Browse the repository at this point in the history
  • Loading branch information
muhammad-tanvir-1211 authored Jan 3, 2024
2 parents a02ce2e + 2ea49db commit 4f1a5ae
Show file tree
Hide file tree
Showing 9 changed files with 87 additions and 87 deletions.
16 changes: 0 additions & 16 deletions include/operations/extension/transpose.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,6 @@ class Transpose {
static constexpr const index_t inner_tile_size_ = wg_size / Tile_size;
static constexpr const index_t inner_tile_count_ =
Tile_size / inner_tile_size_;
// Minimum number of Tile-multiple rows & columns to cover the matrices
index_t M_pad_;
index_t N_pad_;
// Total size of Tile-multiple covering matrix
index_t size_pad_;
// Batch size when using batched transpose
index_t batch_size_;
// Number of contiguous elements to be used in local memory to avoid bank
Expand Down Expand Up @@ -115,9 +110,6 @@ class Transpose {
stride_a_(stride_a),
stride_at_(stride_at),
inc_at_(inc_at),
M_pad_(tile_count_m_ * Tile_size),
N_pad_(tile_count_n_ * Tile_size),
size_pad_(M_pad_ * N_pad_),
batch_size_(batch_size) {}

index_t get_size() const;
Expand Down Expand Up @@ -209,11 +201,6 @@ class TransposeAdd {
static constexpr const index_t inner_tile_size_ = wg_size / Tile_size;
static constexpr const index_t inner_tile_count_ =
Tile_size / inner_tile_size_;
// Minimum number of Tile-multiple rows & columns to cover the output matrix
index_t M_pad_;
index_t N_pad_;
// Total size of Tile-multiple covering matrix
index_t size_pad_;
// Batch size when using batched transpose
index_t batch_size_;
// Number of contiguous elements to be used in local memory to avoid bank
Expand Down Expand Up @@ -246,9 +233,6 @@ class TransposeAdd {
tile_count_m_((M_ - 1) / Tile_size + 1),
tile_count_n_((N_ - 1) / Tile_size + 1),
tile_count_total_(tile_count_m_ * tile_count_n_),
M_pad_(tile_count_m_ * Tile_size),
N_pad_(tile_count_n_ * Tile_size),
size_pad_(M_pad_ * N_pad_),
batch_size_(batch_size) {}

index_t get_size() const;
Expand Down
22 changes: 12 additions & 10 deletions src/operations/extension/transpose.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ Transpose<in_place, Tile_size, wg_size, cl_size, local_memory, in_t, out_t,
element_t>::get_size() const {
// Smallest TileSize square-multiple containing input/output matrices times
// batch_size
return (size_pad_ * batch_size_);
return (tile_count_total_ * Tile_size * Tile_size * batch_size_);
}

template <bool in_place, int Tile_size, int wg_size, int cl_size,
Expand Down Expand Up @@ -254,7 +254,7 @@ PORTBLAS_INLINE typename in1_t::index_t
TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
in2_t, out_t, element_t>::get_size() const {
// Smallest TileSize square-multiple containing input/output matrices
return (size_pad_ * batch_size_);
return (tile_count_total_ * Tile_size * Tile_size * batch_size_);
}

template <bool both_trans, int Tile_size, int wg_size, int cl_size,
Expand All @@ -276,10 +276,10 @@ TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
* @param in_a_idx [output] the input A global-memory index
* @param in_b_idx [output] the input B global-memory index
* @param out_idx [output] the output C global-memory index
* @param i [output] the global row-index (A & B when both_trans -> [0,N_], B &
*C otherwise -> [0,M_])
* @param j [output] the global col-index (A & B when both_trans -> [0,M_], B &
*C otherwise -> [0,N_])
* @param i [output] the global row-index (A & B when both_trans -> [0,N_], B
*& C otherwise -> [0,M_])
* @param j [output] the global col-index (A & B when both_trans -> [0,M_], B
*& C otherwise -> [0,N_])
*/
template <bool both_trans, int Tile_size, int wg_size, int cl_size,
bool local_memory, typename in1_t, typename in2_t, typename out_t,
Expand Down Expand Up @@ -461,7 +461,8 @@ TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
// Compute & Copy sum/scaled input to local memory (before transpose)
for (index_t l = 0; l < inner_tile_count_; l++) {
if (j_block_start + jl + l * inner_tile_size_ < M_) {
// Compute & Copy sum/scaled input to local memory (before transpose)
// Compute & Copy sum/scaled input to local memory (before
// transpose)
local[in_local_id +
l * (get_non_bank_conflict_line_size() + 1) *
(inner_tile_size_ / get_num_tiles_per_line())] =
Expand Down Expand Up @@ -490,7 +491,8 @@ TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
if (j_block_start + il < N_) {
for (index_t l = 0; l < inner_tile_count_; l++) {
if (i_block_start + jl + l * inner_tile_size_ < M_) {
// Compute & Copy sum/scaled input to local memory (before transpose)
// Compute & Copy sum/scaled input to local memory (before
// transpose)
local[in_local_id +
l * (get_non_bank_conflict_line_size() + 1) *
(inner_tile_size_ / get_num_tiles_per_line())] =
Expand All @@ -501,8 +503,8 @@ TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,

id.barrier(cl::sycl::access::fence_space::local_space);

// Transposed copy of previous output from local memory and scaled addition
// with 2nd non transposed matrix B
// Transposed copy of previous output from local memory and scaled
// addition with 2nd non transposed matrix B
if (i_block_start + il < M_) {
for (index_t l = 0; l < inner_tile_count_; l++) {
if (j_block_start + jl + l * inner_tile_size_ < N_) {
Expand Down
8 changes: 4 additions & 4 deletions test/unittest/extension/omatadd_batched_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,15 +160,15 @@ const auto combi =
::testing::Values<char>('n', 't'), // trans_b
::testing::Values<index_t>(64, 129, 255), // m
::testing::Values<index_t>(64, 129, 255), // n
::testing::Values<scalar_t>(2.5), // alpha
::testing::Values<scalar_t>(3.5), // beta
::testing::Values<scalar_t>(2.5), // alpha
::testing::Values<scalar_t>(3.5), // beta
::testing::Values<index_t>(1, 2), // lda_mul
::testing::Values<index_t>(1, 2), // ldb_mul
::testing::Values<index_t>(1, 2, 3), // ldc_mul
::testing::Values<index_t>(3), // ldc_mul
::testing::Values<index_t>(1, 3), // stride_a_m
::testing::Values<index_t>(1, 3), // stride_b_m
::testing::Values<index_t>(1, 3), // stride_c_m
::testing::Values<index_t>(2, 3)); // batch_size
::testing::Values<index_t>(3)); // batch_size
#endif

template <class T>
Expand Down
36 changes: 20 additions & 16 deletions test/unittest/extension/omatadd_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
#include "extension_reference.hpp"

template <typename scalar_t>
using combination_t = std::tuple<std::string, char, char, index_t, index_t, scalar_t,
scalar_t, index_t, index_t, index_t>;
using combination_t = std::tuple<std::string, char, char, index_t, index_t,
scalar_t, scalar_t, index_t, index_t, index_t>;

template <typename scalar_t, helper::AllocType mem_alloc>
void run_test(const combination_t<scalar_t> combi) {
Expand All @@ -37,8 +37,8 @@ void run_test(const combination_t<scalar_t> combi) {
index_t m, n, ld_a_mul, ld_b_mul, ld_c_mul;
scalar_t alpha, beta;

std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, ld_c_mul) =
combi;
std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul,
ld_c_mul) = combi;

auto q = make_queue();
blas::SB_Handle sb_handle(q);
Expand Down Expand Up @@ -70,12 +70,16 @@ void run_test(const combination_t<scalar_t> combi) {
auto m_b_gpu = helper::allocate<mem_alloc, scalar_t>(size_m_b, q);
auto m_c_gpu = helper::allocate<mem_alloc, scalar_t>(size_m_c, q);

auto copy_m_a = helper::copy_to_device<scalar_t>(q, A.data(), m_a_gpu, size_m_a);
auto copy_m_b = helper::copy_to_device<scalar_t>(q, B.data(), m_b_gpu, size_m_b);
auto copy_m_c = helper::copy_to_device<scalar_t>(q, C.data(), m_c_gpu, size_m_c);
auto copy_m_a =
helper::copy_to_device<scalar_t>(q, A.data(), m_a_gpu, size_m_a);
auto copy_m_b =
helper::copy_to_device<scalar_t>(q, B.data(), m_b_gpu, size_m_b);
auto copy_m_c =
helper::copy_to_device<scalar_t>(q, C.data(), m_c_gpu, size_m_c);

auto omatadd_event = blas::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha, m_a_gpu, lda, beta,
m_b_gpu, ldb, m_c_gpu, ldc, {copy_m_a, copy_m_b, copy_m_c});
auto omatadd_event = blas::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha,
m_a_gpu, lda, beta, m_b_gpu, ldb, m_c_gpu,
ldc, {copy_m_a, copy_m_b, copy_m_c});
sb_handle.wait(omatadd_event);

auto event = blas::helper::copy_to_host<scalar_t>(
Expand All @@ -98,8 +102,8 @@ void run_test(const combination_t<scalar_t> combi) {
index_t m, n, ld_a_mul, ld_b_mul, ld_c_mul;
scalar_t alpha, beta;

std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, ld_c_mul) =
combi;
std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul,
ld_c_mul) = combi;

if (alloc == "usm") {
#ifdef SB_ENABLE_USM
Expand Down Expand Up @@ -127,9 +131,9 @@ const auto combi =
#else
template <typename scalar_t>
const auto combi =
::testing::Combine(::testing::Values("usm", "buf"), // allocation type
::testing::Values<char>('n', 't'), // trans_a
::testing::Values<char>('n', 't'), // trans_b
::testing::Combine(::testing::Values("usm", "buf"), // allocation type
::testing::Values<char>('n', 't'), // trans_a
::testing::Values<char>('n', 't'), // trans_b
::testing::Values<index_t>(64, 129, 255), // m
::testing::Values<index_t>(64, 129, 255), // n
::testing::Values<scalar_t>(0, 1, 2), // alpha
Expand All @@ -146,8 +150,8 @@ static std::string generate_name(
char trans_a, trans_b;
index_t m, n, lda_mul, ldb_mul, ldc_mul;
T alpha, beta;
BLAS_GENERATE_NAME(info.param, alloc, trans_a, trans_b, m, n, alpha, beta, lda_mul,
ldb_mul, ldc_mul);
BLAS_GENERATE_NAME(info.param, alloc, trans_a, trans_b, m, n, alpha, beta,
lda_mul, ldb_mul, ldc_mul);
}

BLAS_REGISTER_TEST_ALL(OmatAdd, combination_t, combi, generate_name);
23 changes: 13 additions & 10 deletions test/unittest/extension/omatcopy2_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
#include "extension_reference.hpp"

template <typename scalar_t>
using combination_t = std::tuple<std::string, char, index_t, index_t, scalar_t, index_t,
index_t, index_t, index_t>;
using combination_t = std::tuple<std::string, char, index_t, index_t, scalar_t,
index_t, index_t, index_t, index_t>;

template <typename scalar_t, helper::AllocType mem_alloc>
void run_test(const combination_t<scalar_t> combi) {
Expand All @@ -37,7 +37,8 @@ void run_test(const combination_t<scalar_t> combi) {
index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m;
scalar_t alpha;

std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = combi;
std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) =
combi;

// Leading dimensions are computed as multiples of the minimum value specified
// in the oneMKL documentation at :
Expand Down Expand Up @@ -77,8 +78,9 @@ void run_test(const combination_t<scalar_t> combi) {
auto copy_out =
helper::copy_to_device<scalar_t>(q, B.data(), matrix_out, m_b_size);

auto omatcopy2_event = blas::_omatcopy2(sb_handle, trans, m, n, alpha, matrix_in, ld_in, inc_in,
matrix_out, ld_out, inc_out, {copy_in, copy_out});
auto omatcopy2_event =
blas::_omatcopy2(sb_handle, trans, m, n, alpha, matrix_in, ld_in, inc_in,
matrix_out, ld_out, inc_out, {copy_in, copy_out});

sb_handle.wait(omatcopy2_event);

Expand All @@ -101,7 +103,8 @@ void run_test(const combination_t<scalar_t> combi) {
index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m;
scalar_t alpha;

std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = combi;
std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) =
combi;

if (alloc == "usm") {
#ifdef SB_ENABLE_USM
Expand Down Expand Up @@ -129,8 +132,8 @@ const auto combi =
#else
template <typename scalar_t>
const auto combi =
::testing::Combine(::testing::Values("usm", "buf"), // allocation type
::testing::Values<char>('n', 't'), // trans
::testing::Combine(::testing::Values("usm", "buf"), // allocation type
::testing::Values<char>('n', 't'), // trans
::testing::Values<index_t>(64, 129, 255), // m
::testing::Values<index_t>(64, 129, 255), // n
::testing::Values<scalar_t>(0, 2), // alpha
Expand All @@ -147,8 +150,8 @@ static std::string generate_name(
char trans;
index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m;
T alpha;
BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out,
ld_out_m);
BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, inc_in, ld_in_m,
inc_out, ld_out_m);
}

BLAS_REGISTER_TEST_ALL(OmatCopy2, combination_t, combi, generate_name);
12 changes: 6 additions & 6 deletions test/unittest/extension/omatcopy_batched_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ const auto combi =
::testing::Values<char>('n', 't'), // trans
::testing::Values<index_t>(1024, 4050, 16380), // m
::testing::Values<index_t>(1024, 4050, 16380), // n
::testing::Values<scalar_t>(0, 1.05, -20.01), // alpha
::testing::Values<scalar_t>(1.05, -20.01), // alpha
::testing::Values<index_t>(3, 5), // ld_in_m
::testing::Values<index_t>(3, 5), // ld_out_m
::testing::Values<index_t>(5, 10), // stride_in_m
Expand All @@ -134,12 +134,12 @@ const auto combi =
::testing::Values<char>('n', 't'), // trans
::testing::Values<index_t>(64, 129, 255), // m
::testing::Values<index_t>(64, 129, 255), // n
::testing::Values<scalar_t>(0, 2.5), // alpha
::testing::Values<index_t>(1, 2, 3), // ld_in_m
::testing::Values<index_t>(1, 2, 3), // ld_out_m
::testing::Values<scalar_t>(2.5), // alpha
::testing::Values<index_t>(1, 3), // ld_in_m
::testing::Values<index_t>(1, 3), // ld_out_m
::testing::Values<index_t>(1, 3), // stride_in_m
::testing::Values<index_t>(1, 3), // stride_out_m
::testing::Values<index_t>(1, 2, 5)); // batch_size
::testing::Values<index_t>(1, 3), // stride_out_m
::testing::Values<index_t>(1, 5)); // batch_size
#endif

template <class T>
Expand Down
12 changes: 6 additions & 6 deletions test/unittest/extension/omatcopy_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,9 @@ void run_test(const combination_t<scalar_t> combi) {
auto copy_out =
helper::copy_to_device<scalar_t>(q, B.data(), matrix_out, size_b);

auto omatcopy_event = blas::_omatcopy(sb_handle, trans, m, n, alpha, matrix_in, ld_in, matrix_out,
ld_out, {copy_in, copy_out});
auto omatcopy_event =
blas::_omatcopy(sb_handle, trans, m, n, alpha, matrix_in, ld_in,
matrix_out, ld_out, {copy_in, copy_out});

sb_handle.wait(omatcopy_event);

Expand Down Expand Up @@ -106,7 +107,6 @@ void run_test(const combination_t<scalar_t> combi) {
}
}


#ifdef STRESS_TESTING
template <typename scalar_t>
const auto combi =
Expand All @@ -120,11 +120,11 @@ const auto combi =
#else
template <typename scalar_t>
const auto combi =
::testing::Combine(::testing::Values("usm", "buf"), // allocation type
::testing::Values<char>('n', 't'), // trans
::testing::Combine(::testing::Values("usm", "buf"), // allocation type
::testing::Values<char>('n', 't'), // trans
::testing::Values<index_t>(64, 129, 255), // m
::testing::Values<index_t>(64, 129, 255), // n
::testing::Values<scalar_t>(0, 1, 2), // alpha
::testing::Values<scalar_t>(0, 1, 2), // alpha
::testing::Values<index_t>(1, 2, 3), // ld_in_m
::testing::Values<index_t>(1, 2, 3)); // ld_out_m
#endif
Expand Down
Loading

0 comments on commit 4f1a5ae

Please sign in to comment.