Skip to content

Commit

Permalink
[SYCL][COMPAT] Added two-way and four-way dot product accumulate (dp4…
Browse files Browse the repository at this point in the history
…a, dp2a) (intel#14032)

This PR adds the dp4a, d2pa_lo and dp2a_hi helpers to the math header in
SYCLcompat.
Created a test case, documentation updated.

---------

Signed-off-by: Alberto Cabrera <[email protected]>
  • Loading branch information
Alcpz authored Jun 10, 2024
1 parent 7708715 commit cf4bff1
Show file tree
Hide file tree
Showing 3 changed files with 439 additions and 0 deletions.
25 changes: 25 additions & 0 deletions sycl/doc/syclcompat/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1528,6 +1528,13 @@ without modulo overflow for vector types.
The functions `cmul`,`cdiv`,`cabs`, `cmul_add`, and `conj` define complex math
operations which accept `sycl::vec<T,2>` arguments representing complex values.

The `dp4a` function returns the 4-way 8-bit dot product accumulate for unsigned
and signed 32-bit integer values. The `dp2a_lo` and `dp2a_hi` functions return the
two-way 16-bit to 8-bit dot product using the second and first 16 bits of the
second operand, respectively. These three APIs return a single 32-bit value with
the accumulated result, which is unsigned if both operands are `uint32_t` and
signed otherwise.

```cpp
inline unsigned int funnelshift_l(unsigned int low, unsigned int high,
unsigned int shift);
Expand Down Expand Up @@ -1709,6 +1716,24 @@ inline sycl::marray<ValueT, 2> cmul_add(const sycl::marray<ValueT, 2> a,
template <typename T> sycl::vec<T, 2> conj(sycl::vec<T, 2> x);

template <typename ValueT> inline ValueT reverse_bits(ValueT a);


template <typename T1, typename T2>
using dot_product_acc_t =
std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
uint32_t, int32_t>;

template <typename T1, typename T2>
inline dot_product_acc_t<T1, T2> dp2a_lo(T1 a, T2 b,
dot_product_acc_t<T1, T2> c);

template <typename T1, typename T2>
inline dot_product_acc_t<T1, T2> dp2a_hi(T1 a, T2 b,
dot_product_acc_t<T1, T2> c);

template <typename T1, typename T2>
inline dot_product_acc_t<T1, T2> dp4a(T1 a, T2 b,
dot_product_acc_t<T1, T2> c);
```
`vectorized_binary` computes the `BinaryOperation` for two operands,
Expand Down
114 changes: 114 additions & 0 deletions sycl/include/syclcompat/math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,120 @@ inline unsigned vectorized_binary(unsigned a, unsigned b,
return v0;
}

template <typename T1, typename T2>
using dot_product_acc_t =
std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
uint32_t, int32_t>;

namespace detail {

template <typename T> sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val) {
return sycl::vec<T, 1>(val)
.template as<sycl::vec<
std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
.template convert<T>();
}

template <typename T> sycl::vec<T, 2> extract_and_sign_or_zero_extend2(T val) {
return sycl::vec<T, 1>(val)
.template as<sycl::vec<
std::conditional_t<std::is_signed_v<T>, int16_t, uint16_t>, 2>>()
.template convert<T>();
}

template <typename T>
constexpr bool is_int32_type =
std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t>;

} // namespace detail

/// Two-way dot product-accumulate. Calculate and return integer_vector2(
/// \param a) dot product integer_vector2(low16_bit( \param b)) + \param c
///
/// \tparam [in] T1 The type of first value.
/// \tparam [in] T2 The type of second value.
/// \param [in] a The first value.
/// \param [in] b The second value.
/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are
/// uint32_t else has type int32_t.
/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit
/// result.
template <typename T1, typename T2>
inline dot_product_acc_t<T1, T2> dp2a_lo(T1 a, T2 b,
dot_product_acc_t<T1, T2> c) {
static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
"[SYCLcompat] dp2a_lo expects 32-bit integers as operands.");
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
return __dp2a_lo(a, b, c);
#else
dot_product_acc_t<T1, T2> res = c;
auto va = detail::extract_and_sign_or_zero_extend2(a);
auto vb = detail::extract_and_sign_or_zero_extend4(b);
res += va[0] * vb[0];
res += va[1] * vb[1];
return res;
#endif
}

/// Two-way dot product-accumulate. Calculate and return integer_vector2(
/// \param a) dot product integer_vector2(high_16bit( \param b)) + \param c
///
/// \tparam [in] T1 The type of first value.
/// \tparam [in] T2 The type of second value.
/// \param [in] a The first value.
/// \param [in] b The second value.
/// \param [in] c The third value. uint32_t if both T1 and T1 are
/// uint32_t else has type int32_t.
/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit
/// result.
template <typename T1, typename T2>
inline dot_product_acc_t<T1, T2> dp2a_hi(T1 a, T2 b,
dot_product_acc_t<T1, T2> c) {
static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
"[SYCLcompat] dp2a_hi expects 32-bit integers as operands.");
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
return __dp2a_hi(a, b, c);
#else
dot_product_acc_t<T1, T2> res = c;
auto va = detail::extract_and_sign_or_zero_extend2(a);
auto vb = detail::extract_and_sign_or_zero_extend4(b);
res += va[0] * vb[2];
res += va[1] * vb[3];
return res;
#endif
}

/// Four-way byte dot product-accumulate. Calculate and return integer_vector4(
/// \param a) dot product integer_vector4( \param b) + \param c
///
/// \tparam [in] T1 The type of first value.
/// \tparam [in] T2 The type of second value.
/// \param [in] a The first value.
/// \param [in] b The second value.
/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are
/// uint32_t else has type int32_t.
/// \return Four-way byte dot product which is accumulated in 32-bit result.
template <typename T1, typename T2>
inline dot_product_acc_t<T1, T2> dp4a(T1 a, T2 b, dot_product_acc_t<T1, T2> c) {
static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
"[SYCLcompat] dp4a expects 32-bit integers as operands.");
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
return __dp4a(a, b, c);
#else
dot_product_acc_t<T1, T2> res = c;
auto va = detail::extract_and_sign_or_zero_extend4(a);
auto vb = detail::extract_and_sign_or_zero_extend4(b);
res += va[0] * vb[0];
res += va[1] * vb[1];
res += va[2] * vb[2];
res += va[3] * vb[3];
return res;
#endif
}

/// Extend \p a and \p b to 33 bit and add them.
/// \tparam [in] RetT The type of the return value
/// \tparam [in] AT The type of the first value
Expand Down
Loading

0 comments on commit cf4bff1

Please sign in to comment.