diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp index 2f9ac8c0f1b4c..785d95f6f2404 100644 --- a/sycl/include/syclcompat/math.hpp +++ b/sycl/include/syclcompat/math.hpp @@ -1032,8 +1032,26 @@ inline dot_product_acc_t dp2a_lo(T1 a, T2 b, static_assert(detail::is_int32_type && detail::is_int32_type, "[SYCLcompat] dp2a_lo expects 32-bit integers as operands."); #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ - defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610 - return __dp2a_lo(a, b, c); + defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610 + dot_product_acc_t res; + if constexpr (std::is_signed_v && std::is_signed_v) { + asm volatile("dp2a.lo.s32.s32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else if constexpr (std::is_signed_v && std::is_unsigned_v) { + asm volatile("dp2a.lo.s32.u32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else if constexpr (std::is_unsigned_v && std::is_signed_v) { + asm volatile("dp2a.lo.u32.s32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else { + asm volatile("dp2a.lo.u32.u32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } + return res; #else dot_product_acc_t res = c; auto va = detail::extract_and_sign_or_zero_extend2(a); @@ -1061,8 +1079,26 @@ inline dot_product_acc_t dp2a_hi(T1 a, T2 b, static_assert(detail::is_int32_type && detail::is_int32_type, "[SYCLcompat] dp2a_hi expects 32-bit integers as operands."); #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ - defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610 - return __dp2a_hi(a, b, c); + defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610 + dot_product_acc_t res; + if constexpr (std::is_signed_v && std::is_signed_v) { + asm volatile("dp2a.hi.s32.s32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else if constexpr (std::is_signed_v && std::is_unsigned_v) { + asm volatile("dp2a.hi.s32.u32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), 
"r"(c)); + } else if constexpr (std::is_unsigned_v && std::is_signed_v) { + asm volatile("dp2a.hi.u32.s32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else { + asm volatile("dp2a.hi.u32.u32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } + return res; #else dot_product_acc_t res = c; auto va = detail::extract_and_sign_or_zero_extend2(a); @@ -1088,8 +1124,26 @@ inline dot_product_acc_t dp4a(T1 a, T2 b, dot_product_acc_t c) { static_assert(detail::is_int32_type && detail::is_int32_type, "[SYCLcompat] dp4a expects 32-bit integers as operands."); #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ - defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610 - return __dp4a(a, b, c); + defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610 + dot_product_acc_t res; + if constexpr (std::is_signed_v && std::is_signed_v) { + asm volatile("dp4a.s32.s32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else if constexpr (std::is_signed_v && std::is_unsigned_v) { + asm volatile("dp4a.s32.u32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else if constexpr (std::is_unsigned_v && std::is_signed_v) { + asm volatile("dp4a.u32.s32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else { + asm volatile("dp4a.u32.u32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } + return res; #else dot_product_acc_t res = c; auto va = detail::extract_and_sign_or_zero_extend4(a); diff --git a/sycl/test-e2e/syclcompat/math/math_byte_dot_product.cpp b/sycl/test-e2e/syclcompat/math/math_byte_dot_product.cpp index 08da3b957bf94..79c9e4b13d1fc 100644 --- a/sycl/test-e2e/syclcompat/math/math_byte_dot_product.cpp +++ b/sycl/test-e2e/syclcompat/math/math_byte_dot_product.cpp @@ -29,7 +29,7 @@ // // ===---------------------------------------------------------------------===// -// RUN: %clangxx -std=c++17 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out +// RUN: %clangxx -std=c++17 -fsycl 
-fsycl-targets=%{sycl_triple} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_61 %} %s -o %t.out // RUN: %{run} %t.out #include @@ -39,6 +39,10 @@ template constexpr size_t array_size(T (&)[N]) { return N; } +// TODO(syclcompat-lib-reviewers): Improve the tests to ensure that the +// intrinsics are actually used and that the implementation does not fall back +// to the library implementation on CUDA devices. + template struct TestCaseStorage { T1 a; T2 b;