Skip to content

Commit

Permalink
[SYCL][COMPAT] fixed byte-dot products to properly call cuda intrinsics (#14463)
Browse files Browse the repository at this point in the history

Fixes the Nvidia backend code path for int8 intrinsics in syclcompat.

* The preprocessor guard previously checked `__CUDA_ARCH__` instead of `__SYCL_CUDA_ARCH__`
* Updates the test to compile for the minimum supported architecture,
`sm_61`

Signed-off-by: Alberto Cabrera <[email protected]>
  • Loading branch information
Alcpz authored Jul 10, 2024
1 parent 93a0ec4 commit 510965a
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 7 deletions.
66 changes: 60 additions & 6 deletions sycl/include/syclcompat/math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1032,8 +1032,26 @@ inline dot_product_acc_t<T1, T2> dp2a_lo(T1 a, T2 b,
static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
"[SYCLcompat] dp2a_lo expects 32-bit integers as operands.");
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
return __dp2a_lo(a, b, c);
defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610
dot_product_acc_t<T1, T2> res;
if constexpr (std::is_signed_v<T1> && std::is_signed_v<T2>) {
asm volatile("dp2a.lo.s32.s32 %0, %1, %2, %3;"
: "=r"(res)
: "r"(a), "r"(b), "r"(c));
} else if constexpr (std::is_signed_v<T1> && std::is_unsigned_v<T2>) {
asm volatile("dp2a.lo.s32.u32 %0, %1, %2, %3;"
: "=r"(res)
: "r"(a), "r"(b), "r"(c));
} else if constexpr (std::is_unsigned_v<T1> && std::is_signed_v<T2>) {
asm volatile("dp2a.lo.u32.s32 %0, %1, %2, %3;"
: "=r"(res)
: "r"(a), "r"(b), "r"(c));
} else {
asm volatile("dp2a.lo.u32.u32 %0, %1, %2, %3;"
: "=r"(res)
: "r"(a), "r"(b), "r"(c));
}
return res;
#else
dot_product_acc_t<T1, T2> res = c;
auto va = detail::extract_and_sign_or_zero_extend2(a);
Expand Down Expand Up @@ -1061,8 +1079,26 @@ inline dot_product_acc_t<T1, T2> dp2a_hi(T1 a, T2 b,
static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
"[SYCLcompat] dp2a_hi expects 32-bit integers as operands.");
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
return __dp2a_hi(a, b, c);
defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610
dot_product_acc_t<T1, T2> res;
if constexpr (std::is_signed_v<T1> && std::is_signed_v<T2>) {
asm volatile("dp2a.hi.s32.s32 %0, %1, %2, %3;"
: "=r"(res)
: "r"(a), "r"(b), "r"(c));
} else if constexpr (std::is_signed_v<T1> && std::is_unsigned_v<T2>) {
asm volatile("dp2a.hi.s32.u32 %0, %1, %2, %3;"
: "=r"(res)
: "r"(a), "r"(b), "r"(c));
} else if constexpr (std::is_unsigned_v<T1> && std::is_signed_v<T2>) {
asm volatile("dp2a.hi.u32.s32 %0, %1, %2, %3;"
: "=r"(res)
: "r"(a), "r"(b), "r"(c));
} else {
asm volatile("dp2a.hi.u32.u32 %0, %1, %2, %3;"
: "=r"(res)
: "r"(a), "r"(b), "r"(c));
}
return res;
#else
dot_product_acc_t<T1, T2> res = c;
auto va = detail::extract_and_sign_or_zero_extend2(a);
Expand All @@ -1088,8 +1124,26 @@ inline dot_product_acc_t<T1, T2> dp4a(T1 a, T2 b, dot_product_acc_t<T1, T2> c) {
static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
"[SYCLcompat] dp4a expects 32-bit integers as operands.");
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
return __dp4a(a, b, c);
defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610
dot_product_acc_t<T1, T2> res;
if constexpr (std::is_signed_v<T1> && std::is_signed_v<T2>) {
asm volatile("dp4a.s32.s32 %0, %1, %2, %3;"
: "=r"(res)
: "r"(a), "r"(b), "r"(c));
} else if constexpr (std::is_signed_v<T1> && std::is_unsigned_v<T2>) {
asm volatile("dp4a.s32.u32 %0, %1, %2, %3;"
: "=r"(res)
: "r"(a), "r"(b), "r"(c));
} else if constexpr (std::is_unsigned_v<T1> && std::is_signed_v<T2>) {
asm volatile("dp4a.u32.s32 %0, %1, %2, %3;"
: "=r"(res)
: "r"(a), "r"(b), "r"(c));
} else {
asm volatile("dp4a.u32.u32 %0, %1, %2, %3;"
: "=r"(res)
: "r"(a), "r"(b), "r"(c));
}
return res;
#else
dot_product_acc_t<T1, T2> res = c;
auto va = detail::extract_and_sign_or_zero_extend4(a);
Expand Down
6 changes: 5 additions & 1 deletion sycl/test-e2e/syclcompat/math/math_byte_dot_product.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
//
// ===---------------------------------------------------------------------===//

// RUN: %clangxx -std=c++17 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
// RUN: %clangxx -std=c++17 -fsycl -fsycl-targets=%{sycl_triple} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_61 %} %s -o %t.out
// RUN: %{run} %t.out

#include <sycl/detail/core.hpp>
Expand All @@ -39,6 +39,10 @@ template <typename T, size_t N> constexpr size_t array_size(T (&)[N]) {
return N;
}

// TODO(syclcompat-lib-reviewers): Improve the tests to ensure that the
// intrinsics are actually used and that the implementation does not fall back
// to the library implementation on CUDA devices.

template <typename T1, typename T2> struct TestCaseStorage {
T1 a;
T2 b;
Expand Down

0 comments on commit 510965a

Please sign in to comment.