[SYCL][COMPAT] Added two-way and four-way dot product accumulate (dp4…

…a, dp2a) (intel#14032) This PR adds the dp4a, d2pa_lo and dp2a_hi helpers to the math header in SYCLcompat. Created a test case, documentation updated. --------- Signed-off-by: Alberto Cabrera <[email protected]>
OuadiElfarouki · Jun 10, 2024 · cf4bff1 · cf4bff1
1 parent 7708715
commit cf4bff1
Show file tree

Hide file tree

Showing 3 changed files with 439 additions and 0 deletions.
diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md
@@ -1528,6 +1528,13 @@ without modulo overflow for vector types.
 The functions `cmul`,`cdiv`,`cabs`, `cmul_add`, and `conj` define complex math
 operations which accept `sycl::vec<T,2>` arguments representing complex values.
 
+The `dp4a` function returns the 4-way 8-bit dot product accumulate for unsigned
+and signed 32-bit integer values. The `dp2a_lo` and `dp2a_hi` functions return the
+two-way 16-bit to 8-bit dot product using the second and first 16 bits of the
+second operand, respectively. These three APIs return a single 32-bit value with
+the accumulated result, which is unsigned if both operands are `uint32_t` and
+signed otherwise.
+
 ```cpp
 inline unsigned int funnelshift_l(unsigned int low, unsigned int high,
                                   unsigned int shift); 
@@ -1709,6 +1716,24 @@ inline sycl::marray<ValueT, 2> cmul_add(const sycl::marray<ValueT, 2> a,
 template <typename T> sycl::vec<T, 2> conj(sycl::vec<T, 2> x);
 
 template <typename ValueT> inline ValueT reverse_bits(ValueT a);
+
+
+template <typename T1, typename T2>
+using dot_product_acc_t =
+    std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
+                       uint32_t, int32_t>;
+
+template <typename T1, typename T2>
+inline dot_product_acc_t<T1, T2> dp2a_lo(T1 a, T2 b,
+                                         dot_product_acc_t<T1, T2> c);
+
+template <typename T1, typename T2>
+inline dot_product_acc_t<T1, T2> dp2a_hi(T1 a, T2 b,
+                                         dot_product_acc_t<T1, T2> c);
+
+template <typename T1, typename T2>
+inline dot_product_acc_t<T1, T2> dp4a(T1 a, T2 b,
+                                      dot_product_acc_t<T1, T2> c);
 ```
 
 `vectorized_binary` computes the `BinaryOperation` for two operands,

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
@@ -800,6 +800,120 @@ inline unsigned vectorized_binary(unsigned a, unsigned b,
   return v0;
 }
 
+template <typename T1, typename T2>
+using dot_product_acc_t =
+    std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
+                       uint32_t, int32_t>;
+
+namespace detail {
+
+template <typename T> sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val) {
+  return sycl::vec<T, 1>(val)
+      .template as<sycl::vec<
+          std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
+      .template convert<T>();
+}
+
+template <typename T> sycl::vec<T, 2> extract_and_sign_or_zero_extend2(T val) {
+  return sycl::vec<T, 1>(val)
+      .template as<sycl::vec<
+          std::conditional_t<std::is_signed_v<T>, int16_t, uint16_t>, 2>>()
+      .template convert<T>();
+}
+
+template <typename T>
+constexpr bool is_int32_type =
+    std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t>;
+
+} // namespace detail
+
+/// Two-way dot product-accumulate. Calculate and return integer_vector2(
+/// \param a) dot product integer_vector2(low16_bit( \param b)) + \param c
+///
+/// \tparam [in] T1 The type of first value.
+/// \tparam [in] T2 The type of second value.
+/// \param [in] a The first value.
+/// \param [in] b The second value.
+/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are
+/// uint32_t else has type int32_t.
+/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit
+/// result.
+template <typename T1, typename T2>
+inline dot_product_acc_t<T1, T2> dp2a_lo(T1 a, T2 b,
+                                         dot_product_acc_t<T1, T2> c) {
+  static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
+                "[SYCLcompat] dp2a_lo expects 32-bit integers as operands.");
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
+    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
+  return __dp2a_lo(a, b, c);
+#else
+  dot_product_acc_t<T1, T2> res = c;
+  auto va = detail::extract_and_sign_or_zero_extend2(a);
+  auto vb = detail::extract_and_sign_or_zero_extend4(b);
+  res += va[0] * vb[0];
+  res += va[1] * vb[1];
+  return res;
+#endif
+}
+
+/// Two-way dot product-accumulate. Calculate and return integer_vector2(
+/// \param a) dot product integer_vector2(high_16bit( \param b)) + \param c
+///
+/// \tparam [in] T1 The type of first value.
+/// \tparam [in] T2 The type of second value.
+/// \param [in] a The first value.
+/// \param [in] b The second value.
+/// \param [in] c The third value. uint32_t if both T1 and T1 are
+/// uint32_t else has type int32_t.
+/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit
+/// result.
+template <typename T1, typename T2>
+inline dot_product_acc_t<T1, T2> dp2a_hi(T1 a, T2 b,
+                                         dot_product_acc_t<T1, T2> c) {
+  static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
+                "[SYCLcompat] dp2a_hi expects 32-bit integers as operands.");
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
+    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
+  return __dp2a_hi(a, b, c);
+#else
+  dot_product_acc_t<T1, T2> res = c;
+  auto va = detail::extract_and_sign_or_zero_extend2(a);
+  auto vb = detail::extract_and_sign_or_zero_extend4(b);
+  res += va[0] * vb[2];
+  res += va[1] * vb[3];
+  return res;
+#endif
+}
+
+/// Four-way byte dot product-accumulate. Calculate and return integer_vector4(
+/// \param a) dot product integer_vector4( \param b)  + \param c
+///
+/// \tparam [in] T1 The type of first value.
+/// \tparam [in] T2 The type of second value.
+/// \param [in] a The first value.
+/// \param [in] b The second value.
+/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are
+/// uint32_t else has type int32_t.
+/// \return Four-way byte dot product which is accumulated in 32-bit result.
+template <typename T1, typename T2>
+inline dot_product_acc_t<T1, T2> dp4a(T1 a, T2 b, dot_product_acc_t<T1, T2> c) {
+  static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
+                "[SYCLcompat] dp4a expects 32-bit integers as operands.");
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
+    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  dot_product_acc_t<T1, T2> res = c;
+  auto va = detail::extract_and_sign_or_zero_extend4(a);
+  auto vb = detail::extract_and_sign_or_zero_extend4(b);
+  res += va[0] * vb[0];
+  res += va[1] * vb[1];
+  res += va[2] * vb[2];
+  res += va[3] * vb[3];
+  return res;
+#endif
+}
+
 /// Extend \p a and \p b to 33 bit and add them.
 /// \tparam [in] RetT The type of the return value
 /// \tparam [in] AT The type of the first value