Skip to content

Commit

Permalink
Fixes for SPR/SPR2 BLAS 2 routines
Browse files Browse the repository at this point in the history
  • Loading branch information
pgorlani committed Jan 16, 2024
1 parent 89bc647 commit 46d393c
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 112 deletions.
93 changes: 28 additions & 65 deletions include/operations/blas2_trees.h
Original file line number Diff line number Diff line change
Expand Up @@ -580,93 +580,56 @@ GerCol<Single, Lower, Diag, Upper, lhs_t, rhs_1_t, rhs_2_t> make_ger_col(
lhs_, scalar_, rhs_1_, rhs_2_, nWG_row_, nWG_col_, local_memory_size_);
}

/**** SPR N COLS x (N + 1)/2 ROWS FOR PACKED MATRIX ****/
/* This class performs rank 1/2 update for symmetric packed matrices. For more
* details on matrix refer to the explanation here:
* https://spec.oneapi.io/versions/1.1-rev-1/elements/oneMKL/source/domains/matrix-storage.html#matrix-storage
/**
* @struct Spr
* @brief Tree node representing a rank 1/2 update for symmetric packed
* matrices, i.e.,
*
* Spr : lhs_ = alpha_ * rhs_1_ * rhs_2_' + lhs_
* Spr2: lhs_ = alpha_ * rhs_1_ * rhs_2_' + alpha_ * rhs_2_ * rhs_1_' + lhs_
*
*
* @tparam Single true for SPR, false for SPR2
* @tparam isUpper specifies whether the triangular input matrix is upper
* @param alpha_ scaling factor for vector multiplication
* @param N_ matrix size
* @param lhs_ input/output matrix
* @param rhs_1_ input vector
* @param rhs_2_ input vector
*/
template <bool Single, bool isUpper, typename lhs_t, typename rhs_1_t,
typename rhs_2_t>
struct Spr {
using value_t = typename rhs_1_t::value_t;
using index_t = typename rhs_1_t::index_t;

value_t alpha_;
index_t N_;
lhs_t lhs_;
rhs_1_t rhs_1_;
rhs_2_t rhs_2_;
value_t alpha_;
index_t N_, incX_1_, incX_2_;
// cl::sycl::sqrt(float) gives incorrect results when
// the operand becomes big. The sqrt_overflow_limit was
// identified empirically by testing the spr operator
// for matrix sizes up to 16384x16384 on the integrated
// Intel GPU. To make the experiment generic and to reduce
// the chances of failing tests on different hardware we opt
// for a more naive limit. (1048576 = 1024 * 1024)
static constexpr index_t sqrt_overflow_limit = 1048576;

Spr(lhs_t &_l, index_t N_, value_t _alpha, rhs_1_t &_r1, index_t _incX_1,
rhs_2_t &_r2, index_t _incX_2);

Spr(lhs_t &_l, index_t N_, value_t _alpha, rhs_1_t &_r1, rhs_2_t &_r2);
index_t get_size() const;
bool valid_thread(cl::sycl::nd_item<1> ndItem) const;
value_t eval(cl::sycl::nd_item<1> ndItem);
void bind(cl::sycl::handler &h);
void adjust_access_displacement();
// Row-Col index calculation for Upper Packed Matrix
template <bool Upper>
PORTBLAS_ALWAYS_INLINE static typename std::enable_if<Upper>::type
compute_row_col(const int64_t id, const index_t size, index_t &row,
index_t &col) {
int64_t internal = 1 + 8 * id;
float val = internal * 1.f;
float sqrt = 0.f;
float divisor = id >= sqrt_overflow_limit ? size * 1.f : 1.f;
val = internal / (divisor * divisor);
sqrt = cl::sycl::sqrt(val) * divisor;
col = static_cast<index_t>((-1 + sqrt) / 2);
row = id - col * (col + 1) / 2;
// adjust the row/col if out of bounds
if (row > col) {
int diff = row - col;
col += diff;
row -= col;
} else if (row < 0) {
col--;
row = id - col * (col + 1) / 2;
}
}

// Row-Col index calculation for Lower Packed Matrix
template <bool Upper>
PORTBLAS_ALWAYS_INLINE static typename std::enable_if<!Upper>::type
compute_row_col(const int64_t id, const index_t size, index_t &row,
index_t &col) {
index_t temp = 2 * size + 1;
int64_t internal = temp * temp - 8 * id;
float val = internal * 1.f;
float sqrt = 0.f;
float divisor = internal >= sqrt_overflow_limit ? 2.f * size : 1.f;
val = internal / (divisor * divisor);
sqrt = cl::sycl::sqrt(val) * divisor;
col = static_cast<index_t>((temp - sqrt) / 2);
row = id - (col * (temp - col)) / 2 + col;
// adjust row-col if out of bounds
if (row < 0 || col < 0 || row >= size || col >= size || row < col) {
index_t diff = id < size || row < col ? -1 : row >= size ? 1 : 0;
col += diff;
row = id - (col * (temp - col)) / 2 + col;
}
}
index_t int_sqrt(int64_t s);
void compute_row_col(const int64_t id, const index_t size, index_t &row,
index_t &col);
};

/*!
@brief Generator/factory for SPR/SPR2 trees.
*/
template <bool Single, bool isUpper, typename lhs_t, typename rhs_1_t,
typename rhs_2_t>
Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t> make_spr(
lhs_t &lhs_, typename rhs_1_t::index_t _N, typename lhs_t::value_t alpha_,
rhs_1_t &rhs_1_, typename rhs_1_t::index_t incX_1, rhs_2_t &rhs_2_,
typename rhs_1_t::index_t incX_2) {
rhs_1_t &rhs_1_, rhs_2_t &rhs_2_) {
return Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>(lhs_, _N, alpha_, rhs_1_,
incX_1, rhs_2_, incX_2);
rhs_2_);
}

} // namespace blas
Expand Down
8 changes: 4 additions & 4 deletions src/interface/blas2_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -988,12 +988,12 @@ typename sb_handle_t::event_t _spr_impl(
const index_t globalSize = localSize * nWGPerCol;

if (Upper) {
auto spr = make_spr<true, true>(mA, _N, _alpha, vx, _incx, vx, _incx);
auto spr = make_spr<true, true>(mA, _N, _alpha, vx, vx);
return ret = concatenate_vectors(
ret,
sb_handle.execute(spr, localSize, globalSize, _dependencies));
} else {
auto spr = make_spr<true, false>(mA, _N, _alpha, vx, _incx, vx, _incx);
auto spr = make_spr<true, false>(mA, _N, _alpha, vx, vx);
return ret = concatenate_vectors(
ret,
sb_handle.execute(spr, localSize, globalSize, _dependencies));
Expand Down Expand Up @@ -1048,12 +1048,12 @@ typename sb_handle_t::event_t _spr2_impl(
const index_t globalSize = localSize * nWGPerCol;

if (Upper) {
auto spr2 = make_spr<false, true>(mA, _N, _alpha, vx, _incx, vy, _incy);
auto spr2 = make_spr<false, true>(mA, _N, _alpha, vx, vy);
return ret = concatenate_vectors(
ret,
sb_handle.execute(spr2, localSize, globalSize, _dependencies));
} else {
auto spr2 = make_spr<false, false>(mA, _N, _alpha, vx, _incx, vy, _incy);
auto spr2 = make_spr<false, false>(mA, _N, _alpha, vx, vy);
return ret = concatenate_vectors(
ret,
sb_handle.execute(spr2, localSize, globalSize, _dependencies));
Expand Down
95 changes: 80 additions & 15 deletions src/operations/blas2/spr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,64 @@

namespace blas {

/**** SPR N COLS x (N + 1)/2 ROWS FOR PACKED MATRIX ****/

template <bool Single, bool isUpper, typename lhs_t, typename rhs_1_t,
typename rhs_2_t>
PORTBLAS_INLINE Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>::Spr(
lhs_t& _l, typename rhs_1_t::index_t _N, value_t _alpha, rhs_1_t& _r1,
typename rhs_1_t::index_t _incX_1, rhs_2_t& _r2,
typename rhs_1_t::index_t _incX_2)
: lhs_(_l),
N_(_N),
alpha_(_alpha),
rhs_1_(_r1),
incX_1_(_incX_1),
rhs_2_(_r2),
incX_2_(_incX_2) {}
rhs_2_t& _r2)
: lhs_(_l), N_(_N), alpha_(_alpha), rhs_1_(_r1), rhs_2_(_r2) {}

/*!
* @brief Compute the integer square root of an integer value by means of a
* fixed-point iteration method.
*/
template <bool Single, bool isUpper, typename lhs_t, typename rhs_1_t,
typename rhs_2_t>
PORTBLAS_ALWAYS_INLINE typename rhs_1_t::index_t
Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>::int_sqrt(int64_t val) {
using index_t = typename rhs_1_t::index_t;

if (val < 2) return val;

// Compute x0 as 2^(floor(log2(val)/2) + 1)
index_t p = 0;
int64_t tmp = val;
while (tmp) {
++p;
tmp >>= 1;
}
index_t x0 = 2 << (p / 2);
index_t x1 = (x0 + val / x0) / 2;

#pragma unroll 5
while (x1 < x0) {
x0 = x1;
x1 = (x0 + val / x0) / 2;
}
return x0;
}

/*!
* @brief Map a global work-item index to triangular matrix coordinates.
*/
template <bool Single, bool isUpper, typename lhs_t, typename rhs_1_t,
typename rhs_2_t>
PORTBLAS_ALWAYS_INLINE void
Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>::compute_row_col(
const int64_t id, const typename rhs_1_t::index_t size,
typename rhs_1_t::index_t& row, typename rhs_1_t::index_t& col) {
using index_t = typename rhs_1_t::index_t;
if constexpr (isUpper) {
const index_t i = (int_sqrt(8L * id + 1L) - 1) / 2;
col = i;
row = id - (i * (i + 1)) / 2;
} else {
const index_t rid = size * (size + 1) / 2 - id - 1;
const index_t i = (int_sqrt(8L * rid + 1L) - 1) / 2;
col = size - 1 - i;
row = size - 1 - (rid - i * (i + 1) / 2);
}
}

template <bool Single, bool isUpper, typename lhs_t, typename rhs_1_t,
typename rhs_2_t>
Expand All @@ -58,12 +101,34 @@ typename rhs_1_t::value_t Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>::eval(

index_t row = 0, col = 0;

if (global_idx < lhs_size) {
value_t lhs_val = lhs_.eval(global_idx);

Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>::compute_row_col<isUpper>(
if (!id) {
Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>::compute_row_col(
global_idx, N_, row, col);
}

row = sycl::group_broadcast(ndItem.get_group(), row);
col = sycl::group_broadcast(ndItem.get_group(), col);

if (global_idx < lhs_size) {
if constexpr (isUpper) {
if (id) {
row += id;
while (row > col) {
++col;
row -= col;
}
}
} else {
if (id) {
row += id;
while (row >= N_) {
++col;
row = row - N_ + col;
}
}
}

value_t lhs_val = lhs_.eval(global_idx);
value_t rhs_1_val = rhs_1_.eval(row);
value_t rhs_2_val = rhs_2_.eval(col);
if constexpr (!Single) {
Expand Down
23 changes: 10 additions & 13 deletions test/unittest/blas2/blas2_spr2_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,16 @@

template <typename scalar_t>
using combination_t =
std::tuple<std::string, char, char, index_t, scalar_t, index_t, index_t>;
std::tuple<std::string, char, index_t, scalar_t, index_t, index_t>;

template <typename scalar_t, helper::AllocType mem_alloc>
void run_test(const combination_t<scalar_t> combi) {
std::string alloc;
index_t n;
scalar_t alpha;
char layout, uplo;
char uplo;
index_t incX, incY;
std::tie(alloc, layout, uplo, n, alpha, incX, incY) = combi;
std::tie(alloc, uplo, n, alpha, incX, incY) = combi;

const size_t x_size = 1 + (n - 1) * incX;
const size_t y_size = 1 + (n - 1) * incY;
Expand All @@ -50,11 +50,10 @@ void run_test(const combination_t<scalar_t> combi) {
fill_random(vy_cpu);

// Output matrix
std::vector<scalar_t> a_mp(m_size, 7.0);
std::vector<scalar_t> a_cpu_mp(m_size, 7.0);
std::vector<scalar_t> a_mp(m_size);
fill_random(a_mp);
std::vector<scalar_t> a_cpu_mp = a_mp;

uplo = (uplo == 'u' && layout == 'c') || (uplo == 'l' && layout == 'r') ? 'u'
: 'l';
// SYSTEM SPR2
reference_blas::spr2<scalar_t>(&uplo, n, alpha, vx_cpu.data(), incX,
vy_cpu.data(), incY, a_cpu_mp.data());
Expand Down Expand Up @@ -97,9 +96,9 @@ void run_test(const combination_t<scalar_t> combi) {
std::string alloc;
index_t n;
scalar_t alpha;
char layout, uplo;
char uplo;
index_t incX, incY;
std::tie(alloc, layout, uplo, n, alpha, incX, incY) = combi;
std::tie(alloc, uplo, n, alpha, incX, incY) = combi;

if (alloc == "usm") {
#ifdef SB_ENABLE_USM
Expand All @@ -116,7 +115,6 @@ void run_test(const combination_t<scalar_t> combi) {
template <typename scalar_t>
const auto combi =
::testing::Combine(::testing::Values("usm", "buf"), // allocation type
::testing::Values('r', 'c'), // matrix layout
::testing::Values('u', 'l'), // UPLO
::testing::Values(1024, 2048, 4096, 8192, 16384), // n
::testing::Values<scalar_t>(0.0, 1.0, 1.5), // alpha
Expand All @@ -128,7 +126,6 @@ const auto combi =
template <typename scalar_t>
const auto combi =
::testing::Combine(::testing::Values("usm", "buf"), // allocation type
::testing::Values('r', 'c'), // matrix layout
::testing::Values('u', 'l'), // UPLO
::testing::Values(14, 63, 257, 1010), // n
::testing::Values<scalar_t>(1.0), // alpha
Expand All @@ -141,10 +138,10 @@ template <class T>
static std::string generate_name(
const ::testing::TestParamInfo<combination_t<T>>& info) {
std::string alloc;
char layout, uplo;
char uplo;
index_t n, incX, incY;
T alpha;
BLAS_GENERATE_NAME(info.param, alloc, layout, uplo, n, alpha, incX, incY);
BLAS_GENERATE_NAME(info.param, alloc, uplo, n, alpha, incX, incY);
}

BLAS_REGISTER_TEST_ALL(Spr2, combination_t, combi, generate_name);
Loading

0 comments on commit 46d393c

Please sign in to comment.