Skip to content

Commit

Permalink
initial commit
Browse files: browse the repository at this point in the history
  • Loading branch information
YuriPlyakhin committed Feb 18, 2025
1 parent bae7012 commit 3728aee
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 18 deletions.
27 changes: 24 additions & 3 deletions sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
#include <iostream>
#include <sycl/usm.hpp>

template <typename Tab, size_t K, layout B_layout> class mult;
template <typename Tab, size_t TM, size_t TN, size_t TK, layout B_layout>
class mult;

template <typename T1, typename T2, size_t M, size_t N, size_t K, size_t TM,
size_t TN, size_t TK, layout A_layout, layout B_layout>
Expand All @@ -18,11 +19,11 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
// Add one iteration for the out of bounds dpas instruction
size_t NDRangeM = M / TM + (((M % TM) != 0) ? 1 : 0);
size_t NDRangeN = N / TN + (((N % TN) != 0) ? 1 : 0);
size_t sg_size = get_sg_size<mult<T2, K, B_layout>>(q);
size_t sg_size = get_sg_size<mult<T2, TM, TN, TK, B_layout>>(q);
std::cout << "SG size: " << sg_size << " ";

q.submit([&](handler &cgh) {
cgh.parallel_for<mult<T2, K, B_layout>>(
cgh.parallel_for<mult<T2, TM, TN, TK, B_layout>>(
nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
[=](nd_item<2> spmd_item)
#ifdef SG_SZ
Expand Down Expand Up @@ -150,9 +151,29 @@ template <layout A_layout, layout B_layout> void test_all() {
test<bfloat16, float, /*MATRIX_M*/ 1024 + 20, /*MATRIX_N*/ 1024 + 20,
/*MATRIX_K*/ 1024 + 24, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, A_layout,
B_layout>();
test<bfloat16, float, 1024 + 20, 1024 + 20, 1024 + 24, 16, 16, 16, A_layout,
B_layout>();
test<bfloat16, float, 1024 + 20, 1024 + 20, 1024 + 24, 1, 64, 16, A_layout,
B_layout>();
test<bfloat16, float, 1024 + 20, 1024 + 20, 1024 + 24, 1, 64, 32, A_layout,
B_layout>();
test<bfloat16, float, 1024 + 20, 1024 + 20, 1024 + 24, 32, 64, 16, A_layout,
B_layout>();
test<bfloat16, float, 1024 + 20, 1024 + 20, 1024 + 24, 32, 64, 32, A_layout,
B_layout>();
std::cout << "half: ";
test<half, float, 1024 + 20, 1024 + 20, 1024 + 24, 8, 16, 16, A_layout,
B_layout>();
test<half, float, 1024 + 20, 1024 + 20, 1024 + 24, 16, 16, 16, A_layout,
B_layout>();
test<half, float, 1024 + 20, 1024 + 20, 1024 + 24, 1, 64, 16, A_layout,
B_layout>();
test<half, float, 1024 + 20, 1024 + 20, 1024 + 24, 1, 64, 32, A_layout,
B_layout>();
test<half, float, 1024 + 20, 1024 + 20, 1024 + 24, 32, 64, 16, A_layout,
B_layout>();
test<half, float, 1024 + 20, 1024 + 20, 1024 + 24, 32, 64, 32, A_layout,
B_layout>();
std::cout << "int8: ";
test<int8_t, int32_t, 1024, 1024 + 20, 1024 + 24, 8, 16, 32, A_layout,
B_layout>();
Expand Down
34 changes: 22 additions & 12 deletions sycl/test-e2e/Matrix/joint_matrix_16bit_colmajorA_colmajorB.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,10 @@

#include "common.hpp"

constexpr size_t TM = 8;
constexpr size_t TN = 16;
constexpr size_t TK = 16;
template <typename T, size_t TM, size_t TN, size_t TK> class imatrix;

template <typename T> class imatrix;

template <typename T1, typename T2, size_t M, size_t N, size_t K>
template <size_t TM, size_t TN, size_t TK, typename T1, typename T2, size_t M,
size_t N, size_t K>
void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
big_matrix<T2, K, N> &B) {
size_t NDRangeM = M / TM;
Expand All @@ -45,15 +42,15 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));

queue q;
size_t sg_size = get_sg_size<class imatrix<T2>>(q);
size_t sg_size = get_sg_size<class imatrix<T2, TM, TN, TK>>(q);
std::cout << "subgroup size " << sg_size << " ";

q.submit([&](handler &cgh) {
auto accC = bufC.get_access<access::mode::read_write>(cgh);
auto accA = bufA.template get_access<access::mode::read_write>(cgh);
auto accB = bufB.template get_access<access::mode::read_write>(cgh);

cgh.parallel_for<class imatrix<T2>>(
cgh.parallel_for<class imatrix<T2, TM, TN, TK>>(
nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
[=](nd_item<2> spmd_item)
#ifdef SG_SZ
Expand Down Expand Up @@ -100,10 +97,13 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
}).wait();
}

template <typename T> void test() {
template <typename T, size_t TM, size_t TN, size_t TK> void test() {
std::cout << TM << "x" << TN << "x" << TK << " ";

static constexpr size_t MATRIX_M = TM * 2;
static constexpr size_t MATRIX_N = TN * 2;
static constexpr size_t MATRIX_K = TK * 2;

T A[MATRIX_K][MATRIX_M];
T B[MATRIX_N][MATRIX_K];
float C[MATRIX_M][MATRIX_N];
Expand All @@ -120,7 +120,7 @@ template <typename T> void test() {
big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
big_matrix<T, MATRIX_M, MATRIX_K> MA((T *)&A);
big_matrix<T, MATRIX_K, MATRIX_N> MB((T *)&B);
matrix_multiply(MC, MA, MB);
matrix_multiply<TM, TN, TK>(MC, MA, MB);
matrix_multiply_ref((T *)A, (T *)B, (float *)D, MATRIX_M, MATRIX_N, MATRIX_K,
false, true, true);

Expand All @@ -138,13 +138,23 @@ int main() {
for (auto &combination : combinations) {
if (!bf16_run && combination.atype == matrix_type::bf16) {
std::cout << "bf16 ";
test<bfloat16>();
test<bfloat16, 8, 16, 16>();
test<bfloat16, 16, 16, 16>();
test<bfloat16, 1, 64, 16>();
test<bfloat16, 1, 64, 32>();
test<bfloat16, 32, 64, 16>();
test<bfloat16, 32, 64, 32>();
bf16_run = true;
}

if (!half_run && combination.atype == matrix_type::fp16) {
std::cout << "half ";
test<half>();
test<half, 8, 16, 16>();
test<half, 16, 16, 16>();
test<half, 1, 64, 16>();
test<half, 1, 64, 32>();
test<half, 32, 64, 16>();
test<half, 32, 64, 32>();
half_run = true;
}

Expand Down
22 changes: 19 additions & 3 deletions sycl/test-e2e/Matrix/joint_matrix_transposeAB.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,17 @@
#include "common.hpp"
#include <sycl/usm.hpp>

template <typename T, size_t TileRows, size_t TileCols> class MT;
template <typename T, size_t TileRows, size_t TileCols, use Use> class MT;

template <size_t TR, size_t TC, typename T, size_t NR, size_t NC, use Use>
void matrix_transpose(T *in, T *out, queue q) {
static_assert((NR % TR) == 0);
static_assert((NC % TC) == 0);
size_t sg_size = get_sg_size<class MT<T, TR, TC>>(q);
size_t sg_size = get_sg_size<class MT<T, TR, TC, Use>>(q);
std::cout << "SG size " << sg_size << " ";

q.submit([&](handler &cgh) {
cgh.parallel_for<class MT<T, TR, TC>>(
cgh.parallel_for<class MT<T, TR, TC, Use>>(
nd_range<2>({NR / TR, NC / TC * sg_size}, {1, 1 * sg_size}),
[=](nd_item<2> spmd_item)
#ifdef SG_SZ
Expand Down Expand Up @@ -110,15 +110,31 @@ int main() {
for (auto &combination : combinations) {
if (!bf16_run && combination.atype == matrix_type::bf16) {
std::cout << "bf16:\n";
test<bfloat16, 1, 16, use::a>();
test<bfloat16, 1, 32, use::a>();
test<bfloat16, 8, 16, use::a>();
test<bfloat16, 16, 16, use::a>();
test<bfloat16, 32, 16, use::a>();
test<bfloat16, 32, 32, use::a>();

test<bfloat16, 16, 16, use::b>();
test<bfloat16, 16, 64, use::b>();
test<bfloat16, 32, 64, use::b>();
bf16_run = true;
}

if (!half_run && combination.atype == matrix_type::fp16) {
std::cout << "half:\n";
test<half, 1, 16, use::a>();
test<half, 1, 32, use::a>();
test<half, 8, 16, use::a>();
test<half, 16, 16, use::a>();
test<half, 32, 16, use::a>();
test<half, 32, 32, use::a>();

test<half, 16, 16, use::b>();
test<half, 16, 64, use::b>();
test<half, 32, 64, use::b>();
half_run = true;
}

Expand Down

0 comments on commit 3728aee

Please sign in to comment.