diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0b0143e64..e0d418c57 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -212,6 +212,7 @@ export(EXPORT portblas
 
 option(BLAS_ENABLE_TESTING "Whether to enable testing" ON)
 option(ENABLE_EXPRESSION_TESTS "Whether to build expression tree fusion tests" OFF)
+option(ENABLE_JOINTMATRIX_TESTS "Whether to build joint_matrix GEMM tests" OFF)
 if (INSTALL_HEADER_ONLY AND BLAS_ENABLE_TESTING)
   message(STATUS "Tests are disabled when installing portBLAS in header only mode")
   set(BLAS_ENABLE_TESTING OFF)
diff --git a/README.md b/README.md
index 5186b98d0..fb210470d 100644
--- a/README.md
+++ b/README.md
@@ -458,6 +458,7 @@ Some of the supported options are:
 | `CMAKE_INSTALL_PREFIX` | path | Specify the install location, used when invoking `ninja install` |
 | `BUILD_SHARED_LIBS` | `ON`/`OFF` | Build as shared library (`ON` by default) |
 | `ENABLE_EXPRESSION_TESTS` | `ON`/`OFF` | Build additional tests that use the header-only framework (e.g to test expression trees); `OFF` by default |
+| `ENABLE_JOINTMATRIX_TESTS` | `ON`/`OFF` | Build additional tests that use joint_matrix extension; `OFF` by default |
 | `BLAS_VERIFY_BENCHMARK` | `ON`/`OFF` | Verify the results of the benchmarks instead of only measuring the performance. See the documentation of the benchmarks for more details. `ON` by default |
 | `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` |  Determines whether to enable the scratchpad memory pool for benchmark execution. `OFF` by default |
 | `BLAS_ENABLE_CONST_INPUT` | `ON`/`OFF` | Determines whether to enable kernel instantiation with const input buffer (`ON` by default) |
diff --git a/benchmark/portblas/blas3/trsm.cpp b/benchmark/portblas/blas3/trsm.cpp
index 0afd170ec..8d28cec4c 100644
--- a/benchmark/portblas/blas3/trsm.cpp
+++ b/benchmark/portblas/blas3/trsm.cpp
@@ -97,7 +97,13 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char side,
   }
 
   std::ostringstream err_stream;
-  if (!utils::compare_vectors(b_temp, x_ref, err_stream, "")) {
+  const char* en_joint_matrix = std::getenv("SB_ENABLE_JOINT_MATRIX");
+  if (!utils::compare_vectors(b_temp, x_ref, err_stream, "",
+                              (en_joint_matrix != NULL) &&
+                                      (std::is_same<scalar_t, float>::value) &&
+                                      (*en_joint_matrix == '1')
+                                  ? 2
+                                  : 1)) {
     const std::string& err_str = err_stream.str();
     state.SkipWithError(err_str.c_str());
     *success = false;
@@ -181,8 +187,8 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
     };
     benchmark::RegisterBenchmark(
         blas_benchmark::utils::get_name<benchmark_op, scalar_t>(
-            side, uplo, trans, diag, m, n,
-            mem_type).c_str(),
+            side, uplo, trans, diag, m, n, mem_type)
+            .c_str(),
         BM_lambda, sb_handle_ptr, side, uplo, trans, diag, m, n, alpha, success)
         ->UseRealTime();
   }
@@ -193,7 +199,8 @@ void register_benchmark(blas_benchmark::Args& args,
                         blas::SB_Handle* sb_handle_ptr, bool* success) {
   auto trsm_params = blas_benchmark::utils::get_trsm_params<scalar_t>(args);
   register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
-      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, trsm_params);
+      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER,
+      trsm_params);
 #ifdef SB_ENABLE_USM
   register_benchmark<scalar_t, blas::helper::AllocType::usm>(
       sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, trsm_params);
@@ -201,8 +208,8 @@ void register_benchmark(blas_benchmark::Args& args,
 }
 
 namespace blas_benchmark {
-void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr,
-                      bool* success) {
+void create_benchmark(blas_benchmark::Args& args,
+                      blas::SB_Handle* sb_handle_ptr, bool* success) {
   BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success);
 }
 }  // namespace blas_benchmark
diff --git a/common/include/common/float_comparison.hpp b/common/include/common/float_comparison.hpp
index 3cfc0885e..5c634279e 100644
--- a/common/include/common/float_comparison.hpp
+++ b/common/include/common/float_comparison.hpp
@@ -115,17 +115,20 @@ scalar_t clamp_to_limits(scalar_t v) {
  * Indicates the tolerated margin for relative differences
  */
 template <typename scalar_t>
-inline scalar_t getRelativeErrorMargin() {
+inline scalar_t getRelativeErrorMargin(const int32_t margin_multiplier = 1) {
   /* Measured empirically with gemm. The dimensions of the matrices (even k)
    * don't seem to have an impact on the observed relative differences
    * In the cases where the relative error is relevant (non close to zero),
    * relative differences of up to 0.002 were observed for float
    */
-  return static_cast<scalar_t>(0.005);
+  scalar_t margin = 0.005;
+  // increase error margin for mixed precision calculation
+  // for trsm operator.
+  return margin * margin_multiplier;
 }
 
 template <>
-inline double getRelativeErrorMargin<double>() {
+inline double getRelativeErrorMargin<double>(const int32_t) {
   /* Measured empirically with gemm. The dimensions of the matrices (even k)
    * don't seem to have an impact on the observed relative differences
    * In the cases where the relative error is relevant (non close to zero),
@@ -135,7 +138,7 @@ inline double getRelativeErrorMargin<double>() {
 }
 
 template <>
-inline cl::sycl::half getRelativeErrorMargin<cl::sycl::half>() {
+inline cl::sycl::half getRelativeErrorMargin<cl::sycl::half>(const int32_t) {
   // Measured empirically with gemm
   return 0.05f;
 }
@@ -145,16 +148,19 @@ inline cl::sycl::half getRelativeErrorMargin<cl::sycl::half>() {
  * scalars are close to 0)
  */
 template <typename scalar_t>
-inline scalar_t getAbsoluteErrorMargin() {
+inline scalar_t getAbsoluteErrorMargin(const int32_t margin_multiplier = 1) {
   /* Measured empirically with gemm.
    * In the cases where the relative error is irrelevant (close to zero),
    * absolute differences of up to 0.0006 were observed for float
    */
-  return 0.001f;
+  scalar_t margin = 0.001f;
+  // increase error margin for mixed precision calculation
+  // for trsm operator.
+  return margin * margin_multiplier;
 }
 
 template <>
-inline double getAbsoluteErrorMargin<double>() {
+inline double getAbsoluteErrorMargin<double>(const int32_t) {
   /* Measured empirically with gemm.
    * In the cases where the relative error is irrelevant (close to zero),
    * absolute differences of up to 10^-12 were observed for double
@@ -163,7 +169,7 @@ inline double getAbsoluteErrorMargin<double>() {
 }
 
 template <>
-inline cl::sycl::half getAbsoluteErrorMargin<cl::sycl::half>() {
+inline cl::sycl::half getAbsoluteErrorMargin<cl::sycl::half>(const int32_t) {
   // Measured empirically with gemm.
   return 1.0f;
 }
@@ -172,7 +178,8 @@ inline cl::sycl::half getAbsoluteErrorMargin<cl::sycl::half>() {
  * Compare two scalars and returns false if the difference is not acceptable.
  */
 template <typename scalar_t, typename epsilon_t = scalar_t>
-inline bool almost_equal(scalar_t const& scalar1, scalar_t const& scalar2) {
+inline bool almost_equal(scalar_t const& scalar1, scalar_t const& scalar2,
+                         const int32_t margin_multiplier = 1) {
   // Shortcut, also handles case where both are zero
   if (scalar1 == scalar2) {
     return true;
@@ -187,12 +194,14 @@ inline bool almost_equal(scalar_t const& scalar1, scalar_t const& scalar2) {
 
   // Close to zero, the relative error doesn't work, use absolute error
   if (scalar1 == scalar_t{0} || scalar2 == scalar_t{0} ||
-      absolute_diff < getAbsoluteErrorMargin<epsilon_t>()) {
-    return (absolute_diff < getAbsoluteErrorMargin<epsilon_t>());
+      absolute_diff < getAbsoluteErrorMargin<epsilon_t>(margin_multiplier)) {
+    return (absolute_diff <
+            getAbsoluteErrorMargin<epsilon_t>(margin_multiplier));
   }
   // Use relative error
   const auto absolute_sum = utils::abs(scalar1) + utils::abs(scalar2);
-  return (absolute_diff / absolute_sum) < getRelativeErrorMargin<epsilon_t>();
+  return (absolute_diff / absolute_sum) <
+         getRelativeErrorMargin<epsilon_t>(margin_multiplier);
 }
 
 /**
@@ -206,7 +215,8 @@ template <typename scalar_t, typename epsilon_t = scalar_t>
 inline bool compare_vectors(std::vector<scalar_t> const& vec,
                             std::vector<scalar_t> const& ref,
                             std::ostream& err_stream = std::cerr,
-                            std::string end_line = "\n") {
+                            std::string end_line = "\n",
+                            const int32_t margin_multiplier = 1) {
   if (vec.size() != ref.size()) {
     err_stream << "Error: tried to compare vectors of different sizes"
                << std::endl;
@@ -214,7 +224,7 @@ inline bool compare_vectors(std::vector<scalar_t> const& vec,
   }
 
   for (int i = 0; i < vec.size(); ++i) {
-    if (!almost_equal<scalar_t, epsilon_t>(vec[i], ref[i])) {
+    if (!almost_equal<scalar_t, epsilon_t>(vec[i], ref[i], margin_multiplier)) {
       err_stream << "Value mismatch at index " << i << ": " << vec[i]
                  << "; expected " << ref[i] << end_line;
       return false;
diff --git a/src/operations/blas3/gemm_load_store_joint_matrix.hpp b/src/operations/blas3/gemm_load_store_joint_matrix.hpp
index 81eb0625e..876817158 100644
--- a/src/operations/blas3/gemm_load_store_joint_matrix.hpp
+++ b/src/operations/blas3/gemm_load_store_joint_matrix.hpp
@@ -57,18 +57,16 @@ struct PacketizeJointMatrix {
 
   /*! @brief Performs a coalesced non-vectorized load when the current block is
    * not internal.
-   * @tparam trans Whether the source matrix is transposed or not.
    * @tparam internal True if the current block is internal and no bounds
    * checking is required.
-   * @tparam ld The leading dimension of the destination memory.
    */
 
-  template <bool trans, bool internal, int ld, typename SrcPointerType,
-            typename DestPointerType, typename EdgePredicate>
+  template <bool internal, typename SrcPointerType, typename DestPointerType,
+            typename EdgePredicate>
   static PORTBLAS_INLINE typename std::enable_if<!internal>::type load(
       const bool in_range, SrcPointerType src, DestPointerType dest,
       EdgePredicate) {
-    value_t val = in_range ? *(src) : value_t{0};
+    value_t val = in_range ? *src : value_t{0};
     using address_t = cl::sycl::access::address_space;
     if constexpr (std::is_same<cl::sycl::multi_ptr<cl::sycl::half,
                                                    address_t::local_space>,
@@ -79,93 +77,96 @@ struct PacketizeJointMatrix {
                                           cl::sycl::ext::oneapi::bfloat16,
                                           address_t::local_space>,
                                       DestPointerType>::value) {
-      using dtype = cl::sycl::ext::oneapi::bfloat16;
-      *dest = static_cast<dtype>(val);
+      using namespace cl::sycl::ext::oneapi;
+      *dest = bfloat16(val);
     } else {
       using namespace cl::sycl::ext::oneapi::experimental::matrix;
       *dest = round_to_tf32(val);
     }
   }
+
   /*! @brief Performs a vectorised load using sycl::vec::load when the current
    * block is internal. In the case where k < the
    * number of elements being loaded then edge loads will be element wise with
    * additional bounds checking.
-   * @tparam trans Whether the source matrix is transposed or not.
    * @tparam internal True if the current block is internal and no bounds
    * checking is required.
-   * @tparam ld The leading dimension of the destination memory. */
-  template <bool trans, bool internal, index_t ld, typename SrcPointerType,
-            typename DestPointerType, typename EdgePredicate>
+   */
+  template <bool internal, typename SrcPointerType, typename DestPointerType,
+            typename EdgePredicate>
   static PORTBLAS_INLINE typename std::enable_if<internal>::type load(
       const bool in_range, SrcPointerType src, DestPointerType dest,
       EdgePredicate edge_in_range) {
     PacketType packet{};
 
+    using address_t = cl::sycl::access::address_space;
     if (in_range) {
-      using address_t = cl::sycl::access::address_space;
       packet.template load<address_t::global_space>(
           0, cl::sycl::multi_ptr<const value_t, address_t::global_space>(src));
+      store(packet, dest);
     } else {
+      // avoid writing to variable, instead directly write to
+      // shared local memory to avoid race condition experienced
+      // with release compiler.
 #pragma unroll
-      for (index_t i = 0; i < packet_size; i++) {
-        reinterpret_cast<value_t *>(&packet)[i] =
-            edge_in_range(i) ? *(src + i) : value_t{0};
-      }
-    }
-    store<trans, ld>(packet, dest);
-  }
-  /*! @brief Store a vector packet into local memory when the source is
-   * transposed. This will untranspose the elements individually when storing so
-   * the data in local memory is always consistent.
-   * @tparam trans Whether the source matrix is transposed or not.
-   * @tparam ld The leading dimension of the destination memory.*/
-  template <bool trans, index_t ld, typename DestPointerType>
-  static PORTBLAS_INLINE typename std::enable_if<trans>::type store(
-      PacketType &packet, DestPointerType dest) {
-    using address_t = cl::sycl::access::address_space;
-#pragma unroll
-    for (index_t i = 0; i < packet_size; i++) {
-      value_t val = reinterpret_cast<value_t *>(&packet)[i];
-      if constexpr (std::is_same<cl::sycl::multi_ptr<cl::sycl::half,
-                                                     address_t::local_space>,
-                                 DestPointerType>::value) {
-        using dtype = cl::sycl::half;
-        *(dest + ld * i) = static_cast<dtype>(val);
-      } else if constexpr (std::is_same<cl::sycl::multi_ptr<
-                                            cl::sycl::ext::oneapi::bfloat16,
-                                            address_t::local_space>,
-                                        DestPointerType>::value) {
-        using dtype = cl::sycl::ext::oneapi::bfloat16;
-        *(dest + ld * i) = static_cast<dtype>(val);
-      } else {
-        using namespace cl::sycl::ext::oneapi::experimental::matrix;
-        *(dest + ld * i) = round_to_tf32(val);
+      for (index_t i = 0; i < packet_size; i++, dest++, src++) {
+        if constexpr (std::is_same<cl::sycl::multi_ptr<cl::sycl::half,
+                                                       address_t::local_space>,
+                                   DestPointerType>::value) {
+          using dtype = cl::sycl::half;
+          *dest = static_cast<dtype>(edge_in_range(i) ? *src : 0);
+        } else if constexpr (std::is_same<cl::sycl::multi_ptr<
+                                              cl::sycl::ext::oneapi::bfloat16,
+                                              address_t::local_space>,
+                                          DestPointerType>::value) {
+          using namespace cl::sycl::ext::oneapi;
+          *dest = bfloat16(edge_in_range(i) ? *src : 0.f);
+        } else {
+          using namespace cl::sycl::ext::oneapi::experimental::matrix;
+          *dest = edge_in_range(i) ? round_to_tf32(*src) : 0.f;
+        }
       }
     }
   }
 
-  /*! @brief Store a vector packet into local memory when the source is not
-   * transposed. This will use sycl::vec::store function.
-   * @tparam trans Whether the source matrix is transposed or not.
-   * @tparam ld The leading dimension of the destination memory.*/
-  template <bool trans, int ld, typename DestPointerType>
-  static PORTBLAS_INLINE typename std::enable_if<!trans>::type store(
-      PacketType &packet, DestPointerType dest) {
+  /*! @brief Store a vector packet into local memory. This will use
+   *  sycl::vec::store function.
+   */
+  template <typename DestPointerType>
+  static PORTBLAS_INLINE void store(PacketType &packet, DestPointerType dest) {
     using address_t = cl::sycl::access::address_space;
     if constexpr (std::is_same<cl::sycl::multi_ptr<cl::sycl::half,
                                                    address_t::local_space>,
                                DestPointerType>::value) {
       using dtype = cl::sycl::half;
-      *dest = static_cast<dtype>(packet[0]);
+      cl::sycl::vec<dtype, vector_size> new_vec{};
+      for (index_t i = 0; i < packet_size; i++) {
+        reinterpret_cast<dtype *>(&new_vec)[i] =
+            static_cast<dtype>(reinterpret_cast<value_t *>(&packet)[i]);
+      }
+      new_vec.template store<address_t::local_space>(
+          0, cl::sycl::multi_ptr<dtype, address_t::local_space>(dest));
     } else if constexpr (std::is_same<cl::sycl::multi_ptr<
                                           cl::sycl::ext::oneapi::bfloat16,
                                           address_t::local_space>,
                                       DestPointerType>::value) {
-      using dtype = cl::sycl::ext::oneapi::bfloat16;
-      *dest = static_cast<dtype>(packet[0]);
+      // sycl::vec doesn't accept bfloat16 as a valid input type
+      // so we need to write the packet elements individually to
+      // the shared memory.
+      using namespace cl::sycl::ext::oneapi;
+      for (index_t i = 0; i < packet_size; i++, dest++) {
+        *dest = bfloat16(reinterpret_cast<value_t *>(&packet)[i]);
+      }
     } else {
       using namespace cl::sycl::ext::oneapi::experimental::matrix;
-      *dest = round_to_tf32(packet[0]);
+      using dtype = float;
+      cl::sycl::vec<dtype, vector_size> new_vec;
+      for (index_t i = 0; i < packet_size; i++) {
+        reinterpret_cast<dtype *>(&new_vec)[i] =
+            round_to_tf32(reinterpret_cast<value_t *>(&packet)[i]);
+      }
+      new_vec.template store<address_t::local_space>(
+          0, cl::sycl::multi_ptr<dtype, address_t::local_space>(dest));
     }
   }
 };
diff --git a/src/operations/blas3/gemm_local_joint_matrix.hpp b/src/operations/blas3/gemm_local_joint_matrix.hpp
index 664eed416..440229fef 100644
--- a/src/operations/blas3/gemm_local_joint_matrix.hpp
+++ b/src/operations/blas3/gemm_local_joint_matrix.hpp
@@ -83,7 +83,6 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
   using value_t = element_t;
   using index_t = typename std::make_signed<typename input_t::index_t>::type;
   using packetize_t = PacketizeJointMatrix<VectorSize, value_t, index_t>;
-  using vector_t = typename packetize_t::PacketType;
   using address_t = cl::sycl::access::address_space;
 
   // enable easier access to tile dimensions
@@ -156,20 +155,31 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
   static_assert(std::is_same<value_t, float>::value,
                 "This code is only supported for float data type.");
 
+  static_assert(VectorSize == 1,
+                "Vectorization not supported for joint_matrix.");
+
+  static_assert(
+      (frags_per_sg > 1 && jm_row_frags == num_sub_groups) ||
+          (frags_per_sg == 1 && num_jm_frags == num_sub_groups),
+      "Joint Matrix Row Fragments needs to map 1:1 with total sub_groups.");
+
   //! @brief leading dimension of block of A in local
   static constexpr index_t ldsa =
-      tile_type::joint_matrix_M +
-      nbc_a * tile_type::joint_matrix_K / sizeof(float) * 2;
+      (trans_a ? cl_elems : block_rows) +
+      nbc_a * tile_type::joint_matrix_M / sizeof(float) * 2;
   //! @brief leading dimension of block of B in local
   static constexpr index_t ldsb =
-      tile_type::joint_matrix_K +
+      (trans_b ? block_cols : cl_elems) +
       nbc_b * tile_type::joint_matrix_K / sizeof(float) * 2;
-  //! @brief size (in elements) of local (local) memory required by each
+  //! @brief leading dimension of block of output C in local
+  static constexpr index_t ldsc = block_rows + (nbc_a | nbc_b) *
+                                                   tile_type::joint_matrix_M /
+                                                   sizeof(float) * 2;
+  //! @brief size (in elements) of local memory required by each
   //         work group
   static constexpr index_t local_memory_size =
-      (double_buffer + 1) *
-      (ldsa * cl_elems * (block_rows / tile_type::joint_matrix_M) +
-       ldsb * block_cols * (cl_elems / tile_type::joint_matrix_K));
+      (double_buffer + 1) * (ldsa * (trans_a ? block_rows : cl_elems) +
+                             ldsb * (trans_b ? cl_elems : block_cols));
 
   input_t a_;
   input_t b_;
@@ -182,8 +192,8 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
   index_t stridec_;
 
   PORTBLAS_INLINE Gemm(input_t A, input_t B, output_t C, element_t alpha,
-                        element_t beta, index_t batch_size, index_t stride_a,
-                        index_t stride_b, index_t stride_c)
+                       element_t beta, index_t batch_size, index_t stride_a,
+                       index_t stride_b, index_t stride_c)
       : a_(A),
         b_(B),
         c_(C),
@@ -262,7 +272,7 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
    */
   template <typename local_memory_t>
   PORTBLAS_INLINE void eval(local_memory_t scratch_acc,
-                             const cl::sycl::nd_item<1> &id) noexcept {
+                            const cl::sycl::nd_item<1> &id) noexcept {
     index_t m = a_.get_size_row();
     index_t n = b_.get_size_col();
     index_t k = a_.get_size_col();
@@ -270,10 +280,12 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
     const index_t lda = a_.getSizeL();
     const index_t ldb = b_.getSizeL();
     const index_t ldc = c_.getSizeL();
+
+    const index_t wg_id = static_cast<index_t>(id.get_group(0));
     // The batch index that each workgroup should start working with
     const index_t x_groups = (get_wg_x_cluster() - 1) / jm_row_frags + 1;
     const index_t y_groups = (get_wg_y_cluster() - 1) / jm_col_frags + 1;
-    const index_t wg_batch_id = id.get_group(0) / (x_groups * y_groups);
+    const index_t wg_batch_id = wg_id / (x_groups * y_groups);
     // This will disable all workgroups that dont have any batch to work on
     if (wg_batch_id >= batch_size_) {
       return;
@@ -283,20 +295,12 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
     auto scratch = scratch_acc.localAcc.get_pointer();
 
     // The number of work-group required to executed each batch efficiently
-    const index_t wg_id_x = id.get_group(0) % x_groups;
-    const index_t wg_id_y = (id.get_group(0) / x_groups) % y_groups;
+    const index_t wg_id_x = wg_id % x_groups;
+    const index_t wg_id_y = (wg_id / x_groups) % y_groups;
 
-    const index_t a_size = trans_a ? m * lda : k * lda;
-    const index_t b_size = trans_b ? ldb * k : n * ldb;
-    const index_t c_size = ldc * n;
-
-    using address_t = cl::sycl::access::address_space;
-    auto ptr_A = cl::sycl::multi_ptr<const element_t, address_t::global_space>
-        (a_.get_pointer()) + (wg_batch_id * stridea_);
-    auto ptr_B = cl::sycl::multi_ptr<const element_t, address_t::global_space>
-        (b_.get_pointer()) + (wg_batch_id * strideb_);
-    auto ptr_C = cl::sycl::multi_ptr<element_t, address_t::global_space>
-        (c_.get_pointer()) + (wg_batch_id * stridec_);
+    auto ptr_A = a_.get_pointer() + wg_batch_id * stridea_;
+    auto ptr_B = b_.get_pointer() + wg_batch_id * strideb_;
+    auto ptr_C = c_.get_pointer() + wg_batch_id * stridec_;
 
     auto sg = id.get_sub_group();
     const index_t sg_id = sg.get_group_linear_id();
@@ -307,9 +311,7 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
     const bool out_of_range = (wg_row >= m || wg_col >= n);
     const bool internal = m - wg_row >= block_rows && n - wg_col >= block_cols;
 
-    ptr_C +=
-        (wg_row + (sg_id % jm_row_frags) * tile_type::joint_matrix_M) +
-        (wg_col + (sg_id / jm_row_frags) * tile_type::joint_matrix_N) * ldc;
+    ptr_C += (wg_row + wg_col * ldc);
 
     const index_t mc = m - wg_row;
     const index_t nc = n - wg_col;
@@ -323,8 +325,7 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
     const index_t it_mod_cl = item_id % cl_elems;
     const index_t it_div_cl = item_id / cl_elems;
 
-    ptr_B += (trans_b ? it_div_bcols * ldb +
-                            (wg_col + it_mod_bcols)
+    ptr_B += (trans_b ? it_div_bcols * ldb + (wg_col + it_mod_bcols)
                       : it_mod_cl + (wg_col + it_div_cl) * ldb);
 
     n = n - wg_col - (trans_b ? it_mod_bcols : it_div_cl);
@@ -333,27 +334,24 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
 
     m = m - wg_row - (trans_a ? it_div_cl : it_mod_brows);
 
-    const index_t s1_offset =
-        (trans_b ? it_div_bcols + it_mod_bcols * ldsb
-                 : it_mod_cl % tile_type::joint_matrix_K +
-                       (it_mod_cl / tile_type::joint_matrix_K) *
-                           (ldsb * block_cols) +
-                       it_div_cl *ldsb);
+    const index_t s1_offset = (trans_b ? it_div_bcols * ldsb + it_mod_bcols
+                                       : it_mod_cl + it_div_cl * ldsb);
 
     const index_t s2_offset =
-        (sg_id / jm_row_frags) * tile_type::joint_matrix_N * ldsb;
+        (sg_id / jm_row_frags) * (trans_b ? tile_type::joint_matrix_N
+                                          : tile_type::joint_matrix_N * ldsb);
 
-    const index_t ofs = (double_buffer + 1) * ldsb * block_cols *
-                        (cl_elems / tile_type::joint_matrix_K);
+    const index_t ofs =
+        (double_buffer + 1) * ldsb * (trans_b ? cl_elems : block_cols);
 
     const index_t s3_offset =
-        ofs + (trans_a ? it_div_cl + (it_mod_cl)*ldsa
-                       : it_mod_brows % tile_type::joint_matrix_M +
-                             (it_mod_brows / tile_type::joint_matrix_M) *
-                                 (ldsa * cl_elems) +
-                             it_div_brows * ldsa);
+        ofs + (trans_a ? it_div_cl * ldsa + it_mod_cl
+                       : it_mod_brows + it_div_brows * ldsa);
 
-    const index_t s4_offset = ofs + (sg_id % jm_row_frags) * (ldsa * cl_elems);
+    const index_t s4_offset =
+        ofs + (sg_id % jm_row_frags) * (trans_a
+                                            ? tile_type::joint_matrix_M * ldsa
+                                            : tile_type::joint_matrix_M);
 
     if constexpr (std::is_same<typename tile_type::jmInpType,
                                cl::sycl::ext::oneapi::experimental::matrix::
@@ -365,14 +363,14 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
 
       if (internal) {
         compute_panel_gemm<double_buffer, false, false>(
-            id, item_id, m, n, k, mc, nc, a_size, b_size, c_size, ptr_A, lda,
-            ptr_B, ldb, ptr_C, ldc, scratch, s1, s2, s3, s4, out_of_range,
-            batch_stride, wg_batch_id, batch_size_);
+            id, item_id, m, n, k, mc, nc, ptr_A, lda, ptr_B, ldb, ptr_C, ldc,
+            scratch, s1, s2, s3, s4, out_of_range, batch_stride, wg_batch_id,
+            batch_size_);
       } else {
         compute_panel_gemm<double_buffer, true, true>(
-            id, item_id, m, n, k, mc, nc, a_size, b_size, c_size, ptr_A, lda,
-            ptr_B, ldb, ptr_C, ldc, scratch, s1, s2, s3, s4, out_of_range,
-            batch_stride, wg_batch_id, batch_size_);
+            id, item_id, m, n, k, mc, nc, ptr_A, lda, ptr_B, ldb, ptr_C, ldc,
+            scratch, s1, s2, s3, s4, out_of_range, batch_stride, wg_batch_id,
+            batch_size_);
       }
     } else {
       auto input_scratch = *reinterpret_cast<cl::sycl::multi_ptr<
@@ -384,14 +382,14 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
       auto s4 = input_scratch + s4_offset;
       if (internal) {
         compute_panel_gemm<double_buffer, false, false>(
-            id, item_id, m, n, k, mc, nc, a_size, b_size, c_size, ptr_A, lda,
-            ptr_B, ldb, ptr_C, ldc, scratch, s1, s2, s3, s4, out_of_range,
-            batch_stride, wg_batch_id, batch_size_);
+            id, item_id, m, n, k, mc, nc, ptr_A, lda, ptr_B, ldb, ptr_C, ldc,
+            scratch, s1, s2, s3, s4, out_of_range, batch_stride, wg_batch_id,
+            batch_size_);
       } else {
         compute_panel_gemm<double_buffer, true, true>(
-            id, item_id, m, n, k, mc, nc, a_size, b_size, c_size, ptr_A, lda,
-            ptr_B, ldb, ptr_C, ldc, scratch, s1, s2, s3, s4, out_of_range,
-            batch_stride, wg_batch_id, batch_size_);
+            id, item_id, m, n, k, mc, nc, ptr_A, lda, ptr_B, ldb, ptr_C, ldc,
+            scratch, s1, s2, s3, s4, out_of_range, batch_stride, wg_batch_id,
+            batch_size_);
       }
     }
   }
@@ -430,8 +428,7 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
   PORTBLAS_INLINE void compute_panel_gemm(
       const cl::sycl::nd_item<1> &id, const index_t &item_id, const index_t &m,
       const index_t &n, const index_t &orig_k, const index_t &mc,
-      const index_t &nc, const index_t &a_size, const index_t &b_size,
-      const index_t &c_size, InputPointerType orig_A, const index_t &lda,
+      const index_t &nc, InputPointerType orig_A, const index_t &lda,
       InputPointerType orig_B, const index_t &ldb, OutputPointerType orig_C,
       const index_t &ldc, OutputScratchPointerType s0,
       InputScratchPointerType s1, InputScratchPointerType s2,
@@ -454,16 +451,15 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
         extract_input_blocks<check_m_limit, check_n_limit, false>(
             item_id, m, n, k, A, lda, B, ldb, s1, s3, out_of_range);
         id.barrier(cl::sycl::access::fence_space::local_space);
-        compute_block_gemm<check_m_limit, check_n_limit>(id, s2, s4, reg_res);
+        compute_block_gemm(id, s2, s4, reg_res);
         A += cl_elems * (trans_a ? 1 : lda);
         B += cl_elems * (trans_b ? ldb : 1);
 
-        sync_smem<double_buffer,
-                  ldsb * block_cols *(cl_elems / tile_type::joint_matrix_K),
-                  ldsb * block_cols *(cl_elems / tile_type::joint_matrix_K),
-                  ldsa * cl_elems *(block_rows / tile_type::joint_matrix_M),
-                  ldsa * cl_elems *(block_rows / tile_type::joint_matrix_M)>(
-            id, ofs, s1, s2, s3, s4);
+        sync_smem<double_buffer, ldsb *(trans_b ? cl_elems : block_cols),
+                  ldsb *(trans_b ? cl_elems : block_cols),
+                  ldsa *(trans_a ? block_rows : cl_elems),
+                  ldsa *(trans_a ? block_rows : cl_elems)>(id, ofs, s1, s2, s3,
+                                                           s4);
         k -= cl_elems;
       }
 
@@ -471,14 +467,13 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
         extract_input_blocks<check_m_limit, check_n_limit, true>(
             item_id, m, n, k, A, lda, B, ldb, s1, s3, out_of_range);
         id.barrier(cl::sycl::access::fence_space::local_space);
-        compute_block_gemm<check_m_limit, check_n_limit>(id, s2, s4, reg_res);
-
-        sync_smem<double_buffer,
-                  ldsb * block_cols *(cl_elems / tile_type::joint_matrix_K),
-                  ldsb * block_cols *(cl_elems / tile_type::joint_matrix_K),
-                  ldsa * cl_elems *(block_rows / tile_type::joint_matrix_M),
-                  ldsa * cl_elems *(block_rows / tile_type::joint_matrix_M)>(
-            id, ofs, s1, s2, s3, s4);
+        compute_block_gemm(id, s2, s4, reg_res);
+
+        sync_smem<double_buffer, ldsb *(trans_b ? cl_elems : block_cols),
+                  ldsb *(trans_b ? cl_elems : block_cols),
+                  ldsa *(trans_a ? block_rows : cl_elems),
+                  ldsa *(trans_a ? block_rows : cl_elems)>(id, ofs, s1, s2, s3,
+                                                           s4);
       }
 
       // store the output
@@ -506,8 +501,6 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
    * @tparam CType the type for joint_matrix fragment to store output
    * @param mc the computed boundary limit of m in matrix C
    * @param nc the computed boundary limit of n in matrix C
-   * @param alpha  scaling factor of AB
-   * @param beta  scaling factor of C
    * @param C  pointer to the first element of C
    * @param ldc  leading dimension of C
    * @param reg_res  joint_matrix fragment array containing the partial result
@@ -517,11 +510,11 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
   template <bool check_m_limit, bool check_n_limit, typename OutputPointerType,
             typename ScratchPointerType, typename CType>
   PORTBLAS_INLINE void store_output_block(cl::sycl::nd_item<1> id, index_t mc,
-                                           index_t nc, OutputPointerType C,
-                                           ScratchPointerType scratch,
-                                           index_t ldc,
-                                           CType (&reg_res)[frags_per_sg],
-                                           const bool out_of_range) noexcept {
+                                          index_t nc, OutputPointerType C,
+                                          ScratchPointerType scratch,
+                                          index_t ldc,
+                                          CType (&reg_res)[frags_per_sg],
+                                          const bool out_of_range) noexcept {
     if (out_of_range) {
       return;
     }
@@ -533,126 +526,134 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
     Cfloat_Type float_out;
     auto sg = id.get_sub_group();
 
+    const index_t item_id = static_cast<index_t>(id.get_local_linear_id());
     const index_t sg_id = static_cast<index_t>(sg.get_group_linear_id());
-    const index_t sg_range = static_cast<index_t>(sg.get_group_linear_range());
-    const index_t sg_item_id = static_cast<index_t>(sg.get_local_linear_id());
-
-    const index_t sg_mc =
-        mc - (sg_id % jm_row_frags) * tile_type::joint_matrix_M;
-    const index_t sg_nc =
-        nc - (sg_id / jm_row_frags) * tile_type::joint_matrix_N;
-    const bool jm_store_feasible = (mc < block_rows || nc < block_cols)
-                                       ? (sg_mc >= tile_type::joint_matrix_M &&
-                                          sg_nc >= tile_type::joint_matrix_N)
-                                             ? true
-                                             : false
-                                       : true;
-
-    if (jm_store_feasible) {
-      const index_t loop_limit =
-          (tile_type::joint_matrix_M * tile_type::joint_matrix_N) / sg_size;
 
-      if constexpr (is_beta_zero) {
-        for (index_t frag = 0; frag < frags_per_sg; frag++) {
-          for (index_t i = 0; i < loop_limit; i++) {
-            element_t data_left =
-                static_cast<element_t>(get_wi_data(sg, reg_res[frag])[i]);
-            get_wi_data(sg, float_out)[i] = alpha_ * data_left;
-          }
-
-          joint_matrix_store(sg, float_out, C, ldc, layout::col_major);
-
-          C += (tile_type::joint_matrix_N * ldc);
-        }
+    const index_t output_local_store_offset =
+        (sg_id % jm_row_frags) * tile_type::joint_matrix_M +
+        (sg_id / jm_row_frags) * ldsc * tile_type::joint_matrix_N;
+    const index_t output_local_load_offset =
+        item_id % block_rows + (item_id / block_rows) * ldsc;
 
-      } else {
-        for (index_t frag = 0; frag < frags_per_sg; frag++) {
-          joint_matrix_load(sg, float_out, C, ldc, layout::col_major);
-
-          for (index_t i = 0; i < loop_limit; i++) {
-            element_t data_left =
-                static_cast<element_t>(get_wi_data(sg, reg_res[frag])[i]);
-            element_t data_right = get_wi_data(sg, float_out)[i];
-            get_wi_data(sg, float_out)[i] =
-                beta_ * data_right + alpha_ * data_left;
-          }
+    const index_t it_mod_brows = item_id % block_rows;
+    const index_t it_div_brows = item_id / block_rows;
 
-          joint_matrix_store(sg, float_out, C, ldc, layout::col_major);
+    C += (it_mod_brows + it_div_brows * ldc);
 
-          C += (tile_type::joint_matrix_N * ldc);
-        }
-      }
-      return;
-    } else if (sg_mc <= 0 || sg_nc <= 0) {
-      return;
-    }
-
-    id.barrier(cl::sycl::access::fence_space::local_space);
+    const index_t output_global_outer_offset = ldc * tile_type::joint_matrix_N;
+    constexpr index_t nc_conditional =
+        frags_per_sg > 1 ? tile_type::joint_matrix_N : block_cols;
 
-    scratch += sg_id * tile_type::joint_matrix_M;
-    const index_t sg_store_ld = sg_range * tile_type::joint_matrix_M;
-    const index_t loop_limit =
-        sg_nc >= tile_type::joint_matrix_N ? tile_type::joint_matrix_N : sg_nc;
+#pragma unroll
+    for (index_t frag = 0; frag < frags_per_sg; frag++,
+                 C += output_global_outer_offset,
+                 nc -= tile_type::joint_matrix_N) {
+      const index_t rows_per_iter =
+          nc < nc_conditional ? 1 : wg_size / block_rows;
+      const index_t output_global_inner_offset = ldc * rows_per_iter;
+      const index_t output_local_inner_offset = ldsc * rows_per_iter;
 
-    for (index_t frag = 0; frag < frags_per_sg; frag++, C += ldc * loop_limit) {
       auto new_C = C;
-      auto new_scratch = scratch;
+      auto new_scratch = scratch + output_local_load_offset;
 
-      for (index_t i = 0;
-           i <
-           (tile_type::joint_matrix_M * tile_type::joint_matrix_N) / sg_size;
-           i++) {
+#if __INTEL_LLVM_COMPILER && __INTEL_LLVM_COMPILER <= 20240002
+      constexpr index_t conv_loop_limit =
+          (tile_type::joint_matrix_M * tile_type::joint_matrix_N) / sg_size;
+      for (index_t i = 0; i < conv_loop_limit; i++) {
         get_wi_data(sg, float_out)[i] =
             static_cast<element_t>(get_wi_data(sg, reg_res[frag])[i]);
       }
+#else
+      joint_matrix_copy(sg, reg_res[frag], float_out);
+#endif
+      joint_matrix_apply(sg, float_out, [=](element_t &x) { x *= alpha_; });
+
+      id.barrier(cl::sycl::access::fence_space::local_space);
 
-      joint_matrix_store(sg, float_out, new_scratch, sg_store_ld,
-                         layout::col_major);
+      joint_matrix_store(sg, float_out, scratch + output_local_store_offset,
+                         ldsc, layout::col_major);
 
       id.barrier(cl::sycl::access::fence_space::local_space);
 
-      new_C += sg_item_id;
-      new_scratch += sg_item_id;
-
-      if (sg_mc < tile_type::joint_matrix_M &&
-          sg_nc < tile_type::joint_matrix_N) {
-        if (sg_item_id < sg_mc) {
-          for (index_t i = 0; i < loop_limit; i++) {
-            if constexpr (is_beta_zero) {
-              element_t data_right = *(new_scratch + i * sg_store_ld);
-              *(new_C + i * ldc) = alpha_ * data_right;
-            } else {
-              element_t data_left = *(new_C + i * ldc);
-              element_t data_right = *(new_scratch + i * sg_store_ld);
-              *(new_C + i * ldc) = alpha_ * data_right + beta_ * data_left;
+      if constexpr (check_m_limit && check_n_limit) {
+        if (mc >= block_rows && nc >= nc_conditional) {
+          const index_t loop_limit = nc_conditional / rows_per_iter;
+#pragma unroll
+          for (int i = 0; i < loop_limit; i++,
+                   new_C += output_global_inner_offset,
+                   new_scratch += output_local_inner_offset) {
+            if constexpr (is_beta_zero)
+              *new_C = *new_scratch;
+            else {
+              auto val = *new_C;
+              *new_C = *new_scratch + beta_ * val;
             }
           }
+          continue;
         }
-      } else if (sg_mc < tile_type::joint_matrix_M) {
-        if (sg_item_id < sg_mc) {
-          for (index_t i = 0; i < loop_limit; i++) {
-            if constexpr (is_beta_zero) {
-              element_t data_right = *(new_scratch + i * sg_store_ld);
-              *(new_C + i * ldc) = alpha_ * data_right;
-            } else {
-              element_t data_left = *(new_C + i * ldc);
-              element_t data_right = *(new_scratch + i * sg_store_ld);
-              *(new_C + i * ldc) = alpha_ * data_right + beta_ * data_left;
+        if (mc < block_rows && nc < nc_conditional) {
+          if (item_id < mc) {
+            const index_t loop_limit = nc;
+#pragma unroll
+            for (int i = 0; i < loop_limit; i++,
+                     new_C += output_global_inner_offset,
+                     new_scratch += output_local_inner_offset) {
+              if constexpr (is_beta_zero)
+                *new_C = *new_scratch;
+              else {
+                auto val = *new_C;
+                *new_C = *new_scratch + beta_ * val;
+              }
             }
           }
+          continue;
         }
-      } else {
-        if (sg_item_id < tile_type::joint_matrix_M) {
-          for (index_t i = 0; i < loop_limit; i++) {
-            if constexpr (is_beta_zero) {
-              element_t data_right = *(new_scratch + i * sg_store_ld);
-              *(new_C + i * ldc) = alpha_ * data_right;
-            } else {
-              element_t data_left = *(new_C + i * ldc);
-              element_t data_right = *(new_scratch + i * sg_store_ld);
-              *(new_C + i * ldc) = alpha_ * data_right + beta_ * data_left;
+        if (mc < block_rows) {
+          if (it_mod_brows < mc) {
+            const index_t loop_limit = nc_conditional / rows_per_iter;
+#pragma unroll
+            for (int i = 0; i < loop_limit; i++,
+                     new_C += output_global_inner_offset,
+                     new_scratch += output_local_inner_offset) {
+              if constexpr (is_beta_zero)
+                *new_C = *new_scratch;
+              else {
+                auto val = *new_C;
+                *new_C = *new_scratch + beta_ * val;
+              }
+            }
+          }
+          continue;
+        }
+        if (nc < nc_conditional) {
+          if (item_id < block_rows) {
+            const index_t loop_limit = nc;
+#pragma unroll
+            for (int i = 0; i < loop_limit; i++,
+                     new_C += output_global_inner_offset,
+                     new_scratch += output_local_inner_offset) {
+              if constexpr (is_beta_zero)
+                *new_C = *new_scratch;
+              else {
+                auto val = *new_C;
+                *new_C = *new_scratch + beta_ * val;
+              }
             }
           }
+          continue;
+        }
+      } else {
+        const index_t loop_limit = nc_conditional / rows_per_iter;
+#pragma unroll
+        for (int i = 0; i < loop_limit; i++,
+                 new_C += output_global_inner_offset,
+                 new_scratch += output_local_inner_offset) {
+          if constexpr (is_beta_zero)
+            *new_C = *new_scratch;
+          else {
+            auto val = *new_C;
+            *new_C = *new_scratch + beta_ * val;
+          }
         }
       }
     }
@@ -688,8 +689,7 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
   }
 
   /*!
-   * @brief Extract a block of a matrix from global to shared memory, and
-   *        optionally transpose it on the fly.
+   * @brief Extract a block of a matrix from global to shared memory.
    *
    * This is a collective operation on all items in a work group.
    *
@@ -727,24 +727,21 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
       index_t item_id, InputPointerType ptr, index_t ld,
       ScratchPointerType scratch, RowPredicate in_row, ColPredicate in_col) {
     constexpr index_t bs = rows * cols;
-    constexpr index_t multiplier = internal ? packetize_t::packet_size : 1;
+    constexpr index_t loop_iterations = (bs - 1) / wg_size + 1;
 #pragma unroll
-    for (index_t i = 0; i < (bs - 1) / (wg_size * multiplier) + 1; ++i) {
-      if (!do_check<((bs % (wg_size * multiplier)) != 0)>(
-              item_id + i * (wg_size * multiplier) < bs))
+    for (index_t i = 0; i < loop_iterations; ++i) {
+      if (!do_check<((bs % wg_size) != 0)>(item_id + i * wg_size < bs))
         continue;
-      const index_t col_ofs = i * ((wg_size * multiplier) / rows);
+      const index_t col_ofs = i * (wg_size / rows);
       const bool in_range =
-          do_check<check_row_limit>(
-              in_row(((item_id * multiplier) % rows), multiplier - 1)) &&
-          do_check<check_col_limit>(
-              in_col((item_id * multiplier / rows), col_ofs));
+          do_check<check_row_limit>(in_row((item_id % rows), 0)) &&
+          do_check<check_col_limit>(in_col((item_id / rows), col_ofs));
 
-      packetize_t::template load<trans, internal, lds>(
+      packetize_t::template load<internal>(
           in_range, ptr + col_ofs * ld, scratch + col_ofs * lds,
           [&](const index_t &ofs) {
-            return in_row((item_id * multiplier) % rows, ofs) &&
-                   in_col((item_id * multiplier) / rows, col_ofs);
+            return in_row(item_id % rows, ofs) &&
+                   in_col(item_id / rows, col_ofs);
           });
     }
   }
@@ -756,29 +753,21 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
       index_t item_id, InputPointerType ptr, index_t ld,
       ScratchPointerType scratch, RowPredicate in_row, ColPredicate in_col) {
     constexpr index_t bs = rows * cols;
-    constexpr index_t multiplier = internal ? packetize_t::packet_size : 1;
-    constexpr index_t loop_iterations = (bs - 1) / (wg_size * multiplier) + 1;
-    constexpr index_t divisor =
-        rows > cols ? block_rows == 128 ? 2 : 4 : loop_iterations / 2;
+    constexpr index_t loop_iterations = (bs - 1) / wg_size + 1;
 #pragma unroll
     for (index_t i = 0; i < loop_iterations; ++i) {
-      if (!do_check<((bs % (wg_size * multiplier)) != 0)>(
-              item_id + i * (wg_size * multiplier) < bs))
+      if (!do_check<((bs % wg_size) != 0)>(item_id + i * wg_size < bs))
         continue;
-      const index_t local_row_ofs =
-          (i % divisor) * ((wg_size * multiplier) / cols) +
-          i / divisor * lds * cols;
-      const index_t row_ofs = i * ((wg_size * multiplier) / cols);
-      const bool in_range = do_check<check_row_limit>(in_row(
-                                (item_id * multiplier) / cols, row_ofs)) &&
-                            do_check<check_col_limit>(in_col(
-                                (item_id * multiplier) % cols, multiplier - 1));
-
-      packetize_t::template load<trans, internal, lds>(
-          in_range, ptr + row_ofs * ld, scratch + local_row_ofs,
+      const index_t row_ofs = i * (wg_size / cols);
+      const bool in_range =
+          do_check<check_row_limit>(in_row(item_id / cols, row_ofs)) &&
+          do_check<check_col_limit>(in_col(item_id % cols, 0));
+
+      packetize_t::template load<internal>(
+          in_range, ptr + row_ofs * ld, scratch + row_ofs * lds,
           [&](const index_t &ofs) PORTBLAS_ALWAYS_INLINE {
-            return in_col((item_id * multiplier) % cols, ofs) &&
-                   in_row((item_id * multiplier) / cols, row_ofs);
+            return in_col(item_id % cols, ofs) &&
+                   in_row(item_id / cols, row_ofs);
           });
     }
   }
@@ -787,49 +776,59 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
    * @brief Compute a small matrix-matrix product `reg_res += A*B`.
    *
    * @tparam InputPointerType  pointer type for A and B
+   * @tparam CType  joint_matrix accumulator fragment type
    *
-   * @param B  pointer to matrix A with proper item-dependent offset,
+   * @param s4  pointer to matrix A with proper item-dependent offset,
    *           see GemmFactory::run() for details
-   * @param A  pointer to matrix B with proper item-dependent offset,
+   * @param s2  pointer to matrix B with proper item-dependent offset,
    *           see GemmFactory::run() for details
    */
-  template <bool check_m_limit, bool check_n_limit, typename InputPointerType,
-            typename CType>
+  template <typename InputPointerType, typename CType>
   PORTBLAS_INLINE void compute_block_gemm(
       const cl::sycl::nd_item<1> &id, InputPointerType s2, InputPointerType s4,
       CType (&reg_res)[frags_per_sg]) noexcept {
     using namespace cl::sycl::ext::oneapi::experimental::matrix;
+    constexpr layout pattern_a =
+        trans_a ? layout::row_major : layout::col_major;
+    constexpr layout pattern_b =
+        trans_b ? layout::row_major : layout::col_major;
     using AType =
         joint_matrix<cl::sycl::sub_group, typename tile_type::jmInpType, use::a,
                      tile_type::joint_matrix_M, tile_type::joint_matrix_K,
-                     layout::col_major>;
+                     pattern_a>;
     using BType =
         joint_matrix<cl::sycl::sub_group, typename tile_type::jmInpType, use::b,
                      tile_type::joint_matrix_K, tile_type::joint_matrix_N,
-                     layout::col_major>;
-
-    AType inA;
-    BType inB;
+                     pattern_b>;
 
     const index_t strideA = ldsa;
     const index_t strideB = ldsb;
 
     auto sg = id.get_sub_group();
+    constexpr index_t loop_limit = cl_elems / tile_type::joint_matrix_K;
 
 #pragma unroll
-    for (index_t frag = 0; frag < frags_per_sg; frag++) {
-      auto new_B = s2 + frag * tile_type::joint_matrix_N * ldsb;
-      auto new_A = s4;
+    for (index_t i = 0; i < loop_limit; i++) {
+      auto new_B = s2;
+      AType inA;
 
-      for (index_t i = 0; i < cl_elems / tile_type::joint_matrix_K; i++) {
-        joint_matrix_load(sg, inA, new_A, strideA);  // M
-        joint_matrix_load(sg, inB, new_B, strideB);  // N
+      joint_matrix_load(sg, inA, s4, strideA);  // M
 
+      for (index_t frag = 0; frag < frags_per_sg; frag++) {
+        BType inB;
+        joint_matrix_load(sg, inB, new_B, strideB);  // N
+#if __INTEL_LLVM_COMPILER && __INTEL_LLVM_COMPILER <= 20240002
         reg_res[frag] = joint_matrix_mad(sg, inA, inB, reg_res[frag]);
-
-        new_A += ldsa * tile_type::joint_matrix_K;
-        new_B += ldsb * block_cols;
+#else
+        joint_matrix_mad(sg, reg_res[frag], inA, inB, reg_res[frag]);
+#endif
+        new_B += (trans_b ? tile_type::joint_matrix_N
+                          : tile_type::joint_matrix_N * ldsb);
       }
+      s4 += (trans_a ? tile_type::joint_matrix_K
+                     : tile_type::joint_matrix_K * strideA);
+      s2 += (trans_b ? tile_type::joint_matrix_K * strideB
+                     : tile_type::joint_matrix_K);
     }
   }
 
@@ -852,7 +851,7 @@ class Gemm<input_t, output_t, DoubleBuffer, NbcA, NbcB, ClSize, TileType,
   template <bool db, index_t o, index_t... os, typename P, typename... Ps>
   static PORTBLAS_INLINE typename std::enable_if<db>::type sync_smem(
       const cl::sycl::nd_item<1> &id, index_t &ofs_sign, P &s,
-      Ps &... ss) noexcept {
+      Ps &...ss) noexcept {
     s += ofs_sign * o;
     sync_smem<db, os...>(id, ofs_sign, ss...);
   }
diff --git a/test/blas_test.hpp b/test/blas_test.hpp
index 5d0193518..6c31a4db0 100644
--- a/test/blas_test.hpp
+++ b/test/blas_test.hpp
@@ -226,6 +226,35 @@ static inline void fill_trsm_matrix(std::vector<scalar_t> &A, size_t k,
   }
 }
 
+/**
+ * @brief Set to zero the last n bits of a float.
+ * @tparam T value type.
+ * @param val input/output float value.
+ * @param nbits number of last bit set to zero. It is set by default to 13 since
+ * this is the difference of the number of bits of the mantissa between floats
+ * (23) and FP16 / NVIDIA TF32 (10). For bfloat16, this value needs to be set to
+ * 16 to get correct result.
+ */
+template <typename T>
+void set_to_zero_last_nbits(T &val, int32_t nbits = 13) {
+  static_assert(sizeof(T) <= 64);
+  using integer_t =
+      std::conditional_t<sizeof(T) == 64, int64_t,
+                          int32_t>;
+  integer_t *int_pntr = reinterpret_cast<integer_t *>(&val);
+}
+
+/**
+ * @brief Set to zero the last n bits of floats contained in a vector.
+ * @tparam T value type.
+ * @param val input/output float vector.
+ * @param nbits number of last bit set to zero.
+ */
+template <typename T>
+void set_to_zero_last_nbits(std::vector<T> &vec, int32_t nbits = 13) {
+  for (T &val : vec) set_to_zero_last_nbits(val, nbits);
+}
+
 /**
  * @brief Helper class for dumping arguments to a stream, in a format compatible
  * with google test test names.
diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt
index a871040d4..7b398a2a7 100644
--- a/test/unittest/CMakeLists.txt
+++ b/test/unittest/CMakeLists.txt
@@ -152,3 +152,21 @@ foreach(blas_test ${SYCL_UNITTEST_SRCS})
       COMPONENT tests
   )
 endforeach()
+
+if(${ENABLE_JOINTMATRIX_TESTS})
+  if (${DPCPP_SYCL_TARGET} STREQUAL "nvptx64-nvidia-cuda")
+    string(FIND ${DPCPP_SYCL_ARCH} "_" start_idx)
+    if(start_idx)
+      MATH(EXPR start_idx "${start_idx} + 1")
+      string(SUBSTRING ${DPCPP_SYCL_ARCH} ${start_idx} "2" sm_val)
+    endif()
+
+    if (${start_idx} AND ${sm_val} GREATER_EQUAL "80")
+      add_subdirectory(joint_matrix)
+    else()
+      message(FATAL_ERROR "Joint Matrix Tests only supported for NVIDIA GPUs with sm_80 arch and above.")
+    endif()
+  else()
+    message(FATAL_ERROR "Joint Matrix Tests only supported for NVIDIA GPUs.")
+  endif()
+endif()
diff --git a/test/unittest/blas3/blas3_gemm_common.hpp b/test/unittest/blas3/blas3_gemm_common.hpp
index 5308d70e5..3f06d58ec 100644
--- a/test/unittest/blas3/blas3_gemm_common.hpp
+++ b/test/unittest/blas3/blas3_gemm_common.hpp
@@ -135,6 +135,17 @@ inline void verify_gemm(const gemm_arguments_t<scalar_t> arguments) {
   fill_random(a_m);
   fill_random(b_m);
   fill_random(c_m_gpu);
+
+  const char* en_joint_matrix = std::getenv("SB_ENABLE_JOINT_MATRIX");
+  if (en_joint_matrix != NULL && std::is_same<scalar_t, float>::value &&
+      *en_joint_matrix == '1') {
+    set_to_zero_last_nbits(a_m);
+    set_to_zero_last_nbits(b_m);
+    set_to_zero_last_nbits(c_m_gpu);
+    set_to_zero_last_nbits(alpha);
+    set_to_zero_last_nbits(beta);
+  }
+
   std::vector<scalar_t> c_m_cpu = c_m_gpu;
 
   // Use system blas to create a reference output
@@ -302,6 +313,17 @@ inline void verify_gemm(
   fill_random(a_m);
   fill_random(b_m);
   fill_random(c_m_gpu);
+
+  const char* en_joint_matrix = std::getenv("SB_ENABLE_JOINT_MATRIX");
+  if (en_joint_matrix != NULL && std::is_same<scalar_t, float>::value &&
+      *en_joint_matrix == '1') {
+    set_to_zero_last_nbits(a_m);
+    set_to_zero_last_nbits(b_m);
+    set_to_zero_last_nbits(c_m_gpu);
+    set_to_zero_last_nbits(alpha);
+    set_to_zero_last_nbits(beta);
+  }
+
   std::vector<scalar_t> c_m_cpu = c_m_gpu;
 
   // Use system blas to create a reference output
diff --git a/test/unittest/blas3/blas3_trsm_test.cpp b/test/unittest/blas3/blas3_trsm_test.cpp
index 793bd7ee5..6f44dab93 100644
--- a/test/unittest/blas3/blas3_trsm_test.cpp
+++ b/test/unittest/blas3/blas3_trsm_test.cpp
@@ -62,6 +62,14 @@ void run_test(const combination_t<scalar_t> combi) {
                    static_cast<scalar_t>(unusedValue));
   fill_random(B);
 
+  const char* en_joint_matrix = std::getenv("SB_ENABLE_JOINT_MATRIX");
+  if (en_joint_matrix != NULL && std::is_same<scalar_t, float>::value &&
+      *en_joint_matrix == '1') {
+    set_to_zero_last_nbits(A);
+    set_to_zero_last_nbits(B);
+    set_to_zero_last_nbits(alpha);
+  }
+
   // Create a copy of B to calculate the reference outputs
   cpu_B = B;
   reference_blas::trsm(&side, &uplo, &trans, &diag, m, n,
@@ -84,7 +92,12 @@ void run_test(const combination_t<scalar_t> combi) {
       blas::helper::copy_to_host<scalar_t>(q, b_gpu, B.data(), B.size());
   sb_handle.wait(event);
 
-  bool isAlmostEqual = utils::compare_vectors(cpu_B, B);
+  bool isAlmostEqual = utils::compare_vectors(
+      cpu_B, B, std::cerr, "\n",
+      (en_joint_matrix != NULL) && (std::is_same<scalar_t, float>::value) &&
+              (*en_joint_matrix == '1')
+          ? 3
+          : 1);
 
   ASSERT_TRUE(isAlmostEqual);
 
diff --git a/test/unittest/joint_matrix/CMakeLists.txt b/test/unittest/joint_matrix/CMakeLists.txt
new file mode 100644
index 000000000..efba7944c
--- /dev/null
+++ b/test/unittest/joint_matrix/CMakeLists.txt
@@ -0,0 +1,74 @@
+#/***************************************************************************
+# *
+# *  @license
+# *  Copyright (C) Codeplay Software Limited
+# *  Licensed under the Apache License, Version 2.0 (the "License");
+# *  you may not use this file except in compliance with the License.
+# *  You may obtain a copy of the License at
+# *
+# *      http://www.apache.org/licenses/LICENSE-2.0
+# *
+# *  For your convenience, a copy of the License has been included in this
+# *  repository.
+# *
+# *  Unless required by applicable law or agreed to in writing, software
+# *  distributed under the License is distributed on an "AS IS" BASIS,
+# *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# *  See the License for the specific language governing permissions and
+# *  limitations under the License.
+# *
+# *  portBLAS: BLAS implementation using SYCL
+# *
+# *  @filename CMakeLists.txt
+# *
+# **************************************************************************/
+
+set(PORTBLAS_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../include)
+set(PORTBLAS_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../src)
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules)
+list(APPEND CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+include(ConfigurePORTBLAS)
+include(SYCL)
+find_package(PORTBLAS REQUIRED)
+
+set(PORTBLAS_JOINTMATRIX_TEST ${CMAKE_CURRENT_SOURCE_DIR})
+
+include_directories(${PORTBLAS_TEST} ${BLAS_INCLUDE_DIRS})
+
+# compiling tests
+set(SYCL_UNITTEST_SRCS
+  ${PORTBLAS_JOINTMATRIX_TEST}/half_half_16_16_16.cpp
+  ${PORTBLAS_JOINTMATRIX_TEST}/half_half_32_8_16.cpp
+  ${PORTBLAS_JOINTMATRIX_TEST}/half_half_8_32_16.cpp
+  ${PORTBLAS_JOINTMATRIX_TEST}/half_float_16_16_16.cpp
+  ${PORTBLAS_JOINTMATRIX_TEST}/half_float_32_8_16.cpp
+  ${PORTBLAS_JOINTMATRIX_TEST}/half_float_8_32_16.cpp
+  ${PORTBLAS_JOINTMATRIX_TEST}/bfloat16_float_16_16_16.cpp
+  ${PORTBLAS_JOINTMATRIX_TEST}/bfloat16_float_32_8_16.cpp
+  ${PORTBLAS_JOINTMATRIX_TEST}/bfloat16_float_8_32_16.cpp
+  ${PORTBLAS_JOINTMATRIX_TEST}/tf32_float_16_16_8.cpp
+)
+
+foreach(blas_test ${SYCL_UNITTEST_SRCS})
+  get_filename_component(test_exec ${blas_test} NAME_WE)
+  add_executable(joint_matrix_${test_exec}_test ../main.cpp ${blas_test})
+  target_compile_definitions(joint_matrix_${test_exec}_test PRIVATE -DBLAS_INDEX_T=${BLAS_TEST_INDEX_TYPE})
+  target_link_libraries(joint_matrix_${test_exec}_test PRIVATE gtest_main Clara::Clara blas::blas PORTBLAS::PORTBLAS)
+  target_include_directories(joint_matrix_${test_exec}_test PRIVATE ${SYCL_INCLUDE_DIRS})
+  target_include_directories(joint_matrix_${test_exec}_test PRIVATE ${CBLAS_INCLUDE} ${PORTBLAS_COMMON_INCLUDE_DIR})
+  target_compile_options(joint_matrix_${test_exec}_test PRIVATE ${DPCPP_FLAGS})
+  target_link_options(joint_matrix_${test_exec}_test PRIVATE ${DPCPP_FLAGS})
+
+  if(TEST_DEVICE)
+    add_test(NAME joint_matrix_${test_exec}_test COMMAND ${CMAKE_CURRENT_BINARY_DIR}/joint_matrix_${test_exec}_test --device ${TEST_DEVICE} --gtest_output=xml:output/)
+  else()
+    add_test(NAME joint_matrix_${test_exec}_test COMMAND ${CMAKE_CURRENT_BINARY_DIR}/joint_matrix_${test_exec}_test --gtest_output=xml:output/)
+  endif()
+  message(STATUS "Created google test joint_matrix_${test_exec}_test")
+  install(TARGETS joint_matrix_${test_exec}_test
+    RUNTIME
+      DESTINATION ${CMAKE_INSTALL_BINDIR}
+      COMPONENT tests
+  )
+endforeach()
diff --git a/test/unittest/joint_matrix/bfloat16_float_16_16_16.cpp b/test/unittest/joint_matrix/bfloat16_float_16_16_16.cpp
new file mode 100644
index 000000000..ba4d35f7b
--- /dev/null
+++ b/test/unittest/joint_matrix/bfloat16_float_16_16_16.cpp
@@ -0,0 +1,99 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  portBLAS: BLAS implementation using SYCL
+ *
+ *  @filename bfloat16_float_16_16_16.cpp
+ *
+ **************************************************************************/
+
+#include "blas_test.hpp"
+#include "joint_matrix_common.hpp"
+
+template <typename scalar_t>
+const auto SmallMatricesBfloat16Floatm16n16k16 = ::testing::Combine(
+    ::testing::Values("bfloat16"),                 // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(16),                         // jm_m
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values(16),                         // jm_k
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(11, 16, 32, 63),             // m
+    ::testing::Values(11, 16, 32, 63),             // n
+    ::testing::Values(17, 33, 64),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, SmallMatricesBfloat16Floatm16n16k16);
+
+template <typename scalar_t>
+const auto MediumMatricesBfloat16Floatm16n16k16 = ::testing::Combine(
+    ::testing::Values("bfloat16"),                 // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(16),                         // jm_m
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values(16),                         // jm_k
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(65, 127, 234, 511),          // m
+    ::testing::Values(65, 127, 234, 511),          // n
+    ::testing::Values(65, 127),                    // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, MediumMatricesBfloat16Floatm16n16k16);
+
+template <typename scalar_t>
+const auto LargeMatricesBfloat16Floatm16n16k16 = ::testing::Combine(
+    ::testing::Values("bfloat16"),                 // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(16),                         // jm_m
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values(16),                         // jm_k
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(1024, 1535, 2024),           // m
+    ::testing::Values(1024, 1535, 2024),           // n
+    ::testing::Values(1536, 2049),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, LargeMatricesBfloat16Floatm16n16k16);
diff --git a/test/unittest/joint_matrix/bfloat16_float_32_8_16.cpp b/test/unittest/joint_matrix/bfloat16_float_32_8_16.cpp
new file mode 100644
index 000000000..49cc187ff
--- /dev/null
+++ b/test/unittest/joint_matrix/bfloat16_float_32_8_16.cpp
@@ -0,0 +1,99 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  portBLAS: BLAS implementation using SYCL
+ *
+ *  @filename bfloat16_float_32_8_16.cpp
+ *
+ **************************************************************************/
+
+#include "blas_test.hpp"
+#include "joint_matrix_common.hpp"
+
+template <typename scalar_t>
+const auto SmallMatricesBfloat16Floatm32n8k16 = ::testing::Combine(
+    ::testing::Values("bfloat16"),                 // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(32),                         // jm_m
+    ::testing::Values(8),                          // jm_n
+    ::testing::Values(16),                         // jm_k
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(11, 16, 32, 63),             // m
+    ::testing::Values(11, 16, 32, 63),             // n
+    ::testing::Values(17, 33, 64),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, SmallMatricesBfloat16Floatm32n8k16);
+
+template <typename scalar_t>
+const auto MediumMatricesBfloat16Floatm32n8k16 = ::testing::Combine(
+    ::testing::Values("bfloat16"),                 // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(32),                         // jm_m
+    ::testing::Values(8),                          // jm_n
+    ::testing::Values(16),                         // jm_k
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(65, 127, 234, 511),          // m
+    ::testing::Values(65, 127, 234, 511),          // n
+    ::testing::Values(65, 127),                    // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, MediumMatricesBfloat16Floatm32n8k16);
+
+template <typename scalar_t>
+const auto LargeMatricesBfloat16Floatm32n8k16 = ::testing::Combine(
+    ::testing::Values("bfloat16"),                 // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(32),                         // jm_m
+    ::testing::Values(8),                          // jm_n
+    ::testing::Values(16),                         // jm_k
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(1024, 1535, 2024),           // m
+    ::testing::Values(1024, 1535, 2024),           // n
+    ::testing::Values(1536, 2049),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, LargeMatricesBfloat16Floatm32n8k16);
diff --git a/test/unittest/joint_matrix/bfloat16_float_8_32_16.cpp b/test/unittest/joint_matrix/bfloat16_float_8_32_16.cpp
new file mode 100644
index 000000000..6583fd450
--- /dev/null
+++ b/test/unittest/joint_matrix/bfloat16_float_8_32_16.cpp
@@ -0,0 +1,99 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  portBLAS: BLAS implementation using SYCL
+ *
+ *  @filename bfloat16_float_8_32_16.cpp
+ *
+ **************************************************************************/
+
+#include "blas_test.hpp"
+#include "joint_matrix_common.hpp"
+
+template <typename scalar_t>
+const auto SmallMatricesBfloat16Floatm8n32k16 = ::testing::Combine(
+    ::testing::Values("bfloat16"),                 // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(8),                          // jm_m
+    ::testing::Values(32),                         // jm_n
+    ::testing::Values(16),                         // jm_k
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(11, 16, 32, 63),             // m
+    ::testing::Values(11, 16, 32, 63),             // n
+    ::testing::Values(17, 33, 64),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, SmallMatricesBfloat16Floatm8n32k16);
+
+template <typename scalar_t>
+const auto MediumMatricesBfloat16Floatm8n32k16 = ::testing::Combine(
+    ::testing::Values("bfloat16"),                 // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(8),                          // jm_m
+    ::testing::Values(32),                         // jm_n
+    ::testing::Values(16),                         // jm_k
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(65, 127, 234, 511),          // m
+    ::testing::Values(65, 127, 234, 511),          // n
+    ::testing::Values(65, 127),                    // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, MediumMatricesBfloat16Floatm8n32k16);
+
+template <typename scalar_t>
+const auto LargeMatricesBfloat16Floatm8n32k16 = ::testing::Combine(
+    ::testing::Values("bfloat16"),                 // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(8),                          // jm_m
+    ::testing::Values(32),                         // jm_n
+    ::testing::Values(16),                         // jm_k
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(1024, 1535, 2024),           // m
+    ::testing::Values(1024, 1535, 2024),           // n
+    ::testing::Values(1536, 2049),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, LargeMatricesBfloat16Floatm8n32k16);
diff --git a/test/unittest/joint_matrix/cmake/FindPORTBLAS.cmake b/test/unittest/joint_matrix/cmake/FindPORTBLAS.cmake
new file mode 100644
index 000000000..f102b0247
--- /dev/null
+++ b/test/unittest/joint_matrix/cmake/FindPORTBLAS.cmake
@@ -0,0 +1,65 @@
+#/***************************************************************************
+# *
+# *  @license
+# *  Copyright (C) Codeplay Software Limited
+# *  Licensed under the Apache License, Version 2.0 (the "License");
+# *  you may not use this file except in compliance with the License.
+# *  You may obtain a copy of the License at
+# *
+# *      http://www.apache.org/licenses/LICENSE-2.0
+# *
+# *  For your convenience, a copy of the License has been included in this
+# *  repository.
+# *
+# *  Unless required by applicable law or agreed to in writing, software
+# *  distributed under the License is distributed on an "AS IS" BASIS,
+# *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# *  See the License for the specific language governing permissions and
+# *  limitations under the License.
+# *
+# *  portBLAS: BLAS implementation using SYCL
+# *
+# *  @filename FindPORTBLAS.cmake
+# *
+# **************************************************************************/
+
+find_path(PORTBLAS_INCLUDE_DIR
+  NAMES portblas.h
+  PATH_SUFFIXES include
+  HINTS ${PORTBLAS_DIR}
+  DOC "The PORTBLAS include directory"
+)
+
+find_path(PORTBLAS_SRC_DIR
+  NAMES portblas.hpp
+  PATH_SUFFIXES src
+  HINTS ${PORTBLAS_DIR}
+  DOC "The PORTBLAS source directory"
+)
+
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(PORTBLAS
+  FOUND_VAR PORTBLAS_FOUND
+  REQUIRED_VARS PORTBLAS_INCLUDE_DIR
+                PORTBLAS_SRC_DIR
+)
+
+mark_as_advanced(PORTBLAS_FOUND
+                 PORTBLAS_SRC_DIR
+                 PORTBLAS_INCLUDE_DIR
+)
+
+if(PORTBLAS_FOUND)
+  set(PORTBLAS_INCLUDE_DIRS
+    ${PORTBLAS_INCLUDE_DIR}
+    ${PORTBLAS_SRC_DIR}
+  )
+endif()
+
+if(PORTBLAS_FOUND AND NOT TARGET PORTBLAS::PORTBLAS)
+  add_library(PORTBLAS::PORTBLAS INTERFACE IMPORTED)
+  set_target_properties(PORTBLAS::PORTBLAS PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${PORTBLAS_INCLUDE_DIRS}"
+  )
+endif()
diff --git a/test/unittest/joint_matrix/half_float_16_16_16.cpp b/test/unittest/joint_matrix/half_float_16_16_16.cpp
new file mode 100644
index 000000000..88b3bac8b
--- /dev/null
+++ b/test/unittest/joint_matrix/half_float_16_16_16.cpp
@@ -0,0 +1,99 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  portBLAS: BLAS implementation using SYCL
+ *
+ *  @filename half_float_16_16_16.cpp
+ *
+ **************************************************************************/
+
+#include "blas_test.hpp"
+#include "joint_matrix_common.hpp"
+
+template <typename scalar_t>
+const auto SmallMatricesHalfFloat161616 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(16),                         // jm_m
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(11, 16, 32, 63),             // m
+    ::testing::Values(11, 16, 32, 63),             // n
+    ::testing::Values(17, 33, 64),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, SmallMatricesHalfFloat161616);
+
+template <typename scalar_t>
+const auto MediumMatricesHalfFloat161616 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(16),                         // jm_m
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(65, 127, 234, 511),          // m
+    ::testing::Values(65, 127, 234, 511),          // n
+    ::testing::Values(65, 127),                    // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, MediumMatricesHalfFloat161616);
+
+template <typename scalar_t>
+const auto LargeMatricesHalfFloat161616 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(16),                         // jm_m
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(1024, 1535, 2024),           // m
+    ::testing::Values(1024, 1535, 2024),           // n
+    ::testing::Values(1536, 2049),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, LargeMatricesHalfFloat161616);
diff --git a/test/unittest/joint_matrix/half_float_32_8_16.cpp b/test/unittest/joint_matrix/half_float_32_8_16.cpp
new file mode 100644
index 000000000..b370e67f2
--- /dev/null
+++ b/test/unittest/joint_matrix/half_float_32_8_16.cpp
@@ -0,0 +1,99 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  portBLAS: BLAS implementation using SYCL
+ *
+ *  @filename half_float_32_8_16.cpp
+ *
+ **************************************************************************/
+
+#include "blas_test.hpp"
+#include "joint_matrix_common.hpp"
+
+template <typename scalar_t>
+const auto SmallMatricesHalfFloatm32n8k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(32),                         // jm_m
+    ::testing::Values(8),                          // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(11, 16, 32, 63),             // m
+    ::testing::Values(11, 16, 32, 63),             // n
+    ::testing::Values(17, 33, 64),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, SmallMatricesHalfFloatm32n8k16);
+
+template <typename scalar_t>
+const auto MediumMatricesHalfFloatm32n8k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(32),                         // jm_m
+    ::testing::Values(8),                          // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(65, 127, 234, 511),          // m
+    ::testing::Values(65, 127, 234, 511),          // n
+    ::testing::Values(65, 127),                    // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, MediumMatricesHalfFloatm32n8k16);
+
+template <typename scalar_t>
+const auto LargeMatricesHalfFloatm32n8k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(32),                         // jm_m
+    ::testing::Values(8),                          // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(1024, 1535, 2024),           // m
+    ::testing::Values(1024, 1535, 2024),           // n
+    ::testing::Values(1536, 2049),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, LargeMatricesHalfFloatm32n8k16);
diff --git a/test/unittest/joint_matrix/half_float_8_32_16.cpp b/test/unittest/joint_matrix/half_float_8_32_16.cpp
new file mode 100644
index 000000000..9853332ae
--- /dev/null
+++ b/test/unittest/joint_matrix/half_float_8_32_16.cpp
@@ -0,0 +1,99 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  portBLAS: BLAS implementation using SYCL
+ *
+ *  @filename half_float_8_32_16.cpp
+ *
+ **************************************************************************/
+
+#include "blas_test.hpp"
+#include "joint_matrix_common.hpp"
+
+template <typename scalar_t>
+const auto SmallMatricesHalfFloatm8n32k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(8),                          // jm_m
+    ::testing::Values(32),                         // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(11, 16, 32, 63),             // m
+    ::testing::Values(11, 16, 32, 63),             // n
+    ::testing::Values(17, 33, 64),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, SmallMatricesHalfFloatm8n32k16);
+
+template <typename scalar_t>
+const auto MediumMatricesHalfFloatm8n32k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(8),                          // jm_m
+    ::testing::Values(32),                         // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(65, 127, 234, 511),          // m
+    ::testing::Values(65, 127, 234, 511),          // n
+    ::testing::Values(65, 127),                    // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, MediumMatricesHalfFloatm8n32k16);
+
+template <typename scalar_t>
+const auto LargeMatricesHalfFloatm8n32k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(8),                          // jm_m
+    ::testing::Values(32),                         // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(1024, 1535, 2024),           // m
+    ::testing::Values(1024, 1535, 2024),           // n
+    ::testing::Values(1536, 2049),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, LargeMatricesHalfFloatm8n32k16);
diff --git a/test/unittest/joint_matrix/half_half_16_16_16.cpp b/test/unittest/joint_matrix/half_half_16_16_16.cpp
new file mode 100644
index 000000000..58241fca0
--- /dev/null
+++ b/test/unittest/joint_matrix/half_half_16_16_16.cpp
@@ -0,0 +1,99 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  portBLAS: BLAS implementation using SYCL
+ *
+ *  @filename half_half_16_16_16.cpp
+ *
+ **************************************************************************/
+
+#include "blas_test.hpp"
+#include "joint_matrix_common.hpp"
+
+template <typename scalar_t>
+const auto SmallMatricesHalfHalfm16n16k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("half"),                     // output type
+    ::testing::Values(16),                         // jm_m
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(11, 16, 32, 63),             // m
+    ::testing::Values(11, 16, 32, 63),             // n
+    ::testing::Values(17, 33, 64),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, SmallMatricesHalfHalfm16n16k16);
+
+template <typename scalar_t>
+const auto MediumMatricesHalfHalfm16n16k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("half"),                     // output type
+    ::testing::Values(16),                         // jm_m
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(65, 127, 234, 511),          // m
+    ::testing::Values(65, 127, 234, 511),          // n
+    ::testing::Values(65, 127),                    // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, MediumMatricesHalfHalfm16n16k16);
+
+template <typename scalar_t>
+const auto LargeMatricesHalfHalfm16n16k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("half"),                     // output type
+    ::testing::Values(16),                         // jm_m
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(1024, 1535, 2024),           // m
+    ::testing::Values(1024, 1535, 2024),           // n
+    ::testing::Values(1536, 2049),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, LargeMatricesHalfHalfm16n16k16);
diff --git a/test/unittest/joint_matrix/half_half_32_8_16.cpp b/test/unittest/joint_matrix/half_half_32_8_16.cpp
new file mode 100644
index 000000000..e220b884b
--- /dev/null
+++ b/test/unittest/joint_matrix/half_half_32_8_16.cpp
@@ -0,0 +1,99 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  portBLAS: BLAS implementation using SYCL
+ *
+ *  @filename half_half_32_8_16.cpp
+ *
+ **************************************************************************/
+
+#include "blas_test.hpp"
+#include "joint_matrix_common.hpp"
+
+template <typename scalar_t>
+const auto SmallMatricesHalfHalfm32n8k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("half"),                     // output type
+    ::testing::Values(32),                         // jm_m
+    ::testing::Values(8),                          // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(11, 16, 32, 63),             // m
+    ::testing::Values(11, 16, 32, 63),             // n
+    ::testing::Values(17, 33, 64),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, SmallMatricesHalfHalfm32n8k16);
+
+template <typename scalar_t>
+const auto MediumMatricesHalfHalfm32n8k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("half"),                     // output type
+    ::testing::Values(32),                         // jm_m
+    ::testing::Values(8),                          // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(65, 127, 234, 511),          // m
+    ::testing::Values(65, 127, 234, 511),          // n
+    ::testing::Values(65, 127),                    // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, MediumMatricesHalfHalfm32n8k16);
+
+template <typename scalar_t>
+const auto LargeMatricesHalfHalfm32n8k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("half"),                     // output type
+    ::testing::Values(32),                         // jm_m
+    ::testing::Values(8),                          // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(1024, 1535, 2024),           // m
+    ::testing::Values(1024, 1535, 2024),           // n
+    ::testing::Values(1536, 2049),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, LargeMatricesHalfHalfm32n8k16);
diff --git a/test/unittest/joint_matrix/half_half_8_32_16.cpp b/test/unittest/joint_matrix/half_half_8_32_16.cpp
new file mode 100644
index 000000000..94a4ebeaf
--- /dev/null
+++ b/test/unittest/joint_matrix/half_half_8_32_16.cpp
@@ -0,0 +1,99 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  portBLAS: BLAS implementation using SYCL
+ *
+ *  @filename half_half_8_32_16.cpp
+ *
+ **************************************************************************/
+
+#include "blas_test.hpp"
+#include "joint_matrix_common.hpp"
+
+template <typename scalar_t>
+const auto SmallMatricesHalfHalfm8n32k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("half"),                     // output type
+    ::testing::Values(8),                          // jm_m
+    ::testing::Values(32),                         // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(11, 16, 32, 63),             // m
+    ::testing::Values(11, 16, 32, 63),             // n
+    ::testing::Values(17, 33, 64),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, SmallMatricesHalfHalfm8n32k16);
+
+template <typename scalar_t>
+const auto MediumMatricesHalfHalfm8n32k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("half"),                     // output type
+    ::testing::Values(8),                          // jm_m
+    ::testing::Values(32),                         // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(65, 127, 234, 511),          // m
+    ::testing::Values(65, 127, 234, 511),          // n
+    ::testing::Values(65, 127),                    // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, MediumMatricesHalfHalfm8n32k16);
+
+template <typename scalar_t>
+const auto LargeMatricesHalfHalfm8n32k16 = ::testing::Combine(
+    ::testing::Values("half"),                     // input type
+    ::testing::Values("half"),                     // output type
+    ::testing::Values(8),                          // jm_m
+    ::testing::Values(32),                         // jm_n
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(1024, 1535, 2024),           // m
+    ::testing::Values(1024, 1535, 2024),           // n
+    ::testing::Values(1536, 2049),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, LargeMatricesHalfHalfm8n32k16);
diff --git a/test/unittest/joint_matrix/joint_matrix_common.hpp b/test/unittest/joint_matrix/joint_matrix_common.hpp
new file mode 100644
index 000000000..c18366a83
--- /dev/null
+++ b/test/unittest/joint_matrix/joint_matrix_common.hpp
@@ -0,0 +1,268 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  portBLAS: BLAS implementation using SYCL
+ *
+ *  @filename joint_matrix_common.hpp
+ *
+ **************************************************************************/
+
+#include "launch_gemm.hpp"
+
+template <typename T>
+using joint_matrix_arguments_t =
+    std::tuple<std::string, std::string, index_t, index_t, index_t, std::string,
+               index_t, index_t, index_t, index_t, index_t, char, char, T, T,
+               index_t, index_t, index_t, gemm_batch_type_t>;
+
+template <typename scalar_t, helper::AllocType mem_alloc>
+inline void verify_gemm(const joint_matrix_arguments_t<scalar_t> arguments) {
+  std::string jm_inType, jm_outType;
+  index_t jm_m, jm_n, jm_k;
+  std::string alloc;
+  index_t offset;
+  index_t batch;
+  index_t m;
+  index_t n;
+  index_t k;
+  char transa;
+  char transb;
+  scalar_t alpha;
+  scalar_t beta;
+  index_t lda_mul;
+  index_t ldb_mul;
+  index_t ldc_mul;
+  gemm_batch_type_t batch_type;
+  std::tie(jm_inType, jm_outType, jm_m, jm_n, jm_k, alloc, offset, batch, m, n,
+           k, transa, transb, alpha, beta, lda_mul, ldb_mul, ldc_mul,
+           batch_type) = arguments;
+
+  assert(batch_type == gemm_batch_type_t::strided);
+
+  const char ta_str[2] = {transa, '\0'};
+  const char tb_str[2] = {transb, '\0'};
+
+  auto q = make_queue();
+  blas::SB_Handle sb_handle(q);
+
+  const index_t lda = ((transa != 'n') ? k : m) * lda_mul;
+  const index_t ldb = ((transb != 'n') ? n : k) * ldb_mul;
+  const index_t ldc = m * ldc_mul;
+
+  const index_t size_a = m * k * lda_mul;
+  const index_t size_b = k * n * ldb_mul;
+  const index_t size_c = m * n * ldc_mul;
+
+  const index_t buffer_size_a = batch * size_a + offset;
+  const index_t buffer_size_b = batch * size_b + offset;
+  const index_t buffer_size_c = batch * size_c + offset;
+
+  std::vector<scalar_t> a_m(buffer_size_a);
+  std::vector<scalar_t> b_m(buffer_size_b);
+  std::vector<scalar_t> c_m_gpu(buffer_size_c);
+
+  if (jm_outType == "half") {
+    // initialize the vectors with positive values
+    // to avoid test failures for half precision
+    // accumulation
+    fill_random_with_range(a_m, scalar_t{1}, scalar_t{2});
+    fill_random_with_range(b_m, scalar_t{1}, scalar_t{2});
+    fill_random_with_range(c_m_gpu, scalar_t{1}, scalar_t{2});
+  } else {
+    fill_random(a_m);
+    fill_random(b_m);
+    fill_random(c_m_gpu);
+  }
+
+  index_t nbits = 13;
+  if (jm_inType == "bfloat16") {
+    nbits = 16;
+  }
+  set_to_zero_last_nbits(a_m, nbits);
+  set_to_zero_last_nbits(b_m, nbits);
+  set_to_zero_last_nbits(c_m_gpu, nbits);
+  set_to_zero_last_nbits(alpha, nbits);
+  set_to_zero_last_nbits(beta, nbits);
+
+  std::vector<scalar_t> c_m_cpu = c_m_gpu;
+
+  // Use system blas to create a reference output
+  for (int i = 0; i < batch; ++i) {
+    reference_blas::gemm(ta_str, tb_str, m, n, k, alpha,
+                         a_m.data() + i * size_a + offset, lda,
+                         b_m.data() + i * size_b + offset, ldb, beta,
+                         c_m_cpu.data() + i * size_c + offset, ldc);
+  }
+
+  auto m_a_gpu = blas::helper::allocate<mem_alloc, scalar_t>(buffer_size_a, q);
+  auto m_b_gpu = blas::helper::allocate<mem_alloc, scalar_t>(buffer_size_b, q);
+  auto m_c_gpu = blas::helper::allocate<mem_alloc, scalar_t>(buffer_size_c, q);
+
+  auto copy_a =
+      blas::helper::copy_to_device(q, a_m.data(), m_a_gpu, buffer_size_a);
+  auto copy_b =
+      blas::helper::copy_to_device(q, b_m.data(), m_b_gpu, buffer_size_b);
+  auto copy_c =
+      blas::helper::copy_to_device(q, c_m_gpu.data(), m_c_gpu, buffer_size_c);
+
+  // portBLAS GEMM implementation
+  typename blas::SB_Handle::event_t gemm_event;
+  if (jm_inType == "half" && jm_outType == "float") {
+    if (jm_m == 16 && jm_n == 16) {
+      gemm_event = launch_gemm_with_beta<16, 16, 16, cl::sycl::half, float>(
+          sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda,
+          size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset, ldc,
+          size_c, batch, batch_type, {copy_a, copy_b, copy_c});
+    } else if (jm_m == 32 && jm_n == 8) {
+      gemm_event = launch_gemm_with_beta<32, 8, 16, cl::sycl::half, float>(
+          sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda,
+          size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset, ldc,
+          size_c, batch, batch_type, {copy_a, copy_b, copy_c});
+    } else if (jm_n == 32 && jm_m == 8) {
+      gemm_event = launch_gemm_with_beta<8, 32, 16, cl::sycl::half, float>(
+          sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda,
+          size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset, ldc,
+          size_c, batch, batch_type, {copy_a, copy_b, copy_c});
+    }
+  } else if (jm_inType == "half" && jm_outType == "half") {
+    if (jm_m == 16 && jm_n == 16) {
+      gemm_event =
+          launch_gemm_with_beta<16, 16, 16, cl::sycl::half, cl::sycl::half>(
+              sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda,
+              size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset,
+              ldc, size_c, batch, batch_type, {copy_a, copy_b, copy_c});
+    } else if (jm_m == 32 && jm_n == 8) {
+      gemm_event =
+          launch_gemm_with_beta<32, 8, 16, cl::sycl::half, cl::sycl::half>(
+              sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda,
+              size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset,
+              ldc, size_c, batch, batch_type, {copy_a, copy_b, copy_c});
+    } else if (jm_n == 32 && jm_m == 8) {
+      gemm_event =
+          launch_gemm_with_beta<8, 32, 16, cl::sycl::half, cl::sycl::half>(
+              sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda,
+              size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset,
+              ldc, size_c, batch, batch_type, {copy_a, copy_b, copy_c});
+    }
+  } else if (jm_inType == "bfloat16" && jm_outType == "float") {
+    if (jm_m == 16 && jm_n == 16) {
+      gemm_event =
+          launch_gemm_with_beta<16, 16, 16, sycl::ext::oneapi::bfloat16, float>(
+              sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda,
+              size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset,
+              ldc, size_c, batch, batch_type, {copy_a, copy_b, copy_c});
+    } else if (jm_m == 32 && jm_n == 8) {
+      gemm_event =
+          launch_gemm_with_beta<32, 8, 16, sycl::ext::oneapi::bfloat16, float>(
+              sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda,
+              size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset,
+              ldc, size_c, batch, batch_type, {copy_a, copy_b, copy_c});
+    } else if (jm_n == 32 && jm_m == 8) {
+      gemm_event =
+          launch_gemm_with_beta<8, 32, 16, sycl::ext::oneapi::bfloat16, float>(
+              sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda,
+              size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset,
+              ldc, size_c, batch, batch_type, {copy_a, copy_b, copy_c});
+    }
+  } else if (jm_inType == "tf32" && jm_outType == "float") {
+    using namespace sycl::ext::oneapi::experimental::matrix;
+    gemm_event = launch_gemm_with_beta<16, 16, 8, precision::tf32, float>(
+        sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda,
+        size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset, ldc,
+        size_c, batch, batch_type, {copy_a, copy_b, copy_c});
+  }
+  sb_handle.wait(gemm_event);
+
+  auto event =
+      blas::helper::copy_to_host(q, m_c_gpu, c_m_gpu.data(), buffer_size_c);
+  sb_handle.wait(event);
+
+  const bool isAlmostEqual = utils::compare_vectors(
+      c_m_gpu, c_m_cpu, std::cerr, "\n", jm_outType == "half" ? 3 : 1);
+  ASSERT_TRUE(isAlmostEqual);
+
+  helper::deallocate<mem_alloc>(m_a_gpu, q);
+  helper::deallocate<mem_alloc>(m_b_gpu, q);
+  helper::deallocate<mem_alloc>(m_c_gpu, q);
+}
+
+template <typename scalar_t>
+inline void verify_gemm(const joint_matrix_arguments_t<scalar_t> arguments) {
+  std::string jm_inType, jm_OutType;
+  index_t jm_m, jm_n, jm_k;
+  std::string alloc;
+  index_t offset;
+  index_t batch;
+  index_t m;
+  index_t n;
+  index_t k;
+  char transa;
+  char transb;
+  scalar_t alpha;
+  scalar_t beta;
+  index_t lda_mul;
+  index_t ldb_mul;
+  index_t ldc_mul;
+  gemm_batch_type_t batch_type;
+  std::tie(jm_inType, jm_OutType, jm_m, jm_n, jm_k, alloc, offset, batch, m, n,
+           k, transa, transb, alpha, beta, lda_mul, ldb_mul, ldc_mul,
+           batch_type) = arguments;
+
+  if (alloc == "usm") {
+#ifdef SB_ENABLE_USM
+    verify_gemm<scalar_t, helper::AllocType::usm>(arguments);
+#else
+    GTEST_SKIP();
+#endif
+  } else {
+    verify_gemm<scalar_t, helper::AllocType::buffer>(arguments);
+  }
+}
+
+template <>
+inline void dump_arg<gemm_batch_type_t>(std::ostream& ss,
+                                        gemm_batch_type_t batch_type) {
+  ss << (int)batch_type;
+}
+
+template <class T>
+static std::string generate_name(
+    const ::testing::TestParamInfo<joint_matrix_arguments_t<T>>& info) {
+  std::string jm_inType, jm_OutType;
+  int jm_m, jm_n, jm_k;
+  std::string alloc;
+  int offset, batch, m, n, k, ldaMul, ldbMul, ldcMul;
+  char transa, transb;
+  T alpha, beta;
+  gemm_batch_type_t batchType;
+  BLAS_GENERATE_NAME(info.param, jm_inType, jm_OutType, jm_m, jm_n, jm_k, alloc,
+                     offset, batch, m, n, k, transa, transb, alpha, beta,
+                     ldaMul, ldbMul, ldcMul, batchType);
+}
+
+/** Registers Joint Matrix test for all supported data types (only float for
+ * now)
+ * @param test_suite Name of the test suite
+ * @param combination Combinations object
+ * @see BLAS_REGISTER_TEST_CUSTOM_NAME
+ */
+#define GENERATE_JOINTMATRIX_TEST(test_suite, combination)                    \
+  BLAS_REGISTER_TEST_FLOAT_CUSTOM_NAME(test_suite, test_suite##combination,   \
+                                       verify_gemm, joint_matrix_arguments_t, \
+                                       combination, generate_name);
diff --git a/test/unittest/joint_matrix/launch_gemm.hpp b/test/unittest/joint_matrix/launch_gemm.hpp
new file mode 100644
index 000000000..afab6a0b3
--- /dev/null
+++ b/test/unittest/joint_matrix/launch_gemm.hpp
@@ -0,0 +1,247 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  portBLAS: BLAS implementation using SYCL
+ *
+ *  @filename launch_gemm.hpp
+ *
+ **************************************************************************/
+
+#include "blas_test.hpp"
+#include "interface/gemm_launcher.hpp"
+#include "portblas.hpp"
+#include <utility>
+
+template <bool _t_a, bool _t_b, bool s_a, bool s_b, int jm_M, int jm_N,
+          int jm_K, typename inp_jmT, typename out_jmT, bool is_beta_zero,
+          typename sb_handle_t, typename index_t, typename element_t,
+          typename container_0_t, typename container_1_t,
+          typename container_2_t>
+PORTBLAS_ALWAYS_INLINE
+    typename std::enable_if<jm_M == 16, typename sb_handle_t::event_t>::type
+    launch_gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
+                element_t _alpha, container_0_t _a, index_t _lda,
+                index_t _stridea, container_1_t _b, index_t _ldb,
+                index_t _strideb, element_t _beta, container_2_t _c,
+                index_t _ldc, index_t _stridec, index_t batch_size,
+                gemm_batch_type_t batch_type,
+                const typename sb_handle_t::event_t& _dependencies) {
+  if (_M > 1024 && _N > 1024) {
+    return blas::Gemm_Launcher<
+        container_0_t, container_1_t, container_2_t, 256, false, true, true,
+        128,
+        Tile<8, 8, 16, 16, 16, 2, 1, 1, 1, 1, jm_M, jm_N, jm_K, inp_jmT,
+             out_jmT>,
+        _t_a, _t_b, s_a, s_b, static_cast<int>(gemm_memory_t::local),
+        static_cast<int>(gemm_algorithm_t::standard),
+        static_cast<int>(gemm_vectorization_t::none), is_beta_zero, 1,
+        static_cast<int>(gemm_batch_type_t::strided),
+        true>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda,
+                                     _stridea, _b, _ldb, _strideb, _beta, _c,
+                                     _ldc, _stridec, batch_size, _dependencies);
+  } else if (_M > 64 && _N > 64) {
+    return blas::Gemm_Launcher<
+        container_0_t, container_1_t, container_2_t, 128, false, true, true,
+        128,
+        Tile<4, 8, 16, 8, 16, 2, 1, 1, 1, 1, jm_M, jm_N, jm_K, inp_jmT,
+             out_jmT>,
+        _t_a, _t_b, s_a, s_b, static_cast<int>(gemm_memory_t::local),
+        static_cast<int>(gemm_algorithm_t::standard),
+        static_cast<int>(gemm_vectorization_t::none), is_beta_zero, 1,
+        static_cast<int>(gemm_batch_type_t::strided),
+        true>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda,
+                                     _stridea, _b, _ldb, _strideb, _beta, _c,
+                                     _ldc, _stridec, batch_size, _dependencies);
+
+  } else {
+    return blas::Gemm_Launcher<
+        container_0_t, container_1_t, container_2_t, 128, false, true, true,
+        128,
+        Tile<2, 4, 16, 8, 16, 2, 1, 1, 1, 1, jm_M, jm_N, jm_K, inp_jmT,
+             out_jmT>,
+        _t_a, _t_b, s_a, s_b, static_cast<int>(gemm_memory_t::local),
+        static_cast<int>(gemm_algorithm_t::standard),
+        static_cast<int>(gemm_vectorization_t::none), is_beta_zero, 1,
+        static_cast<int>(gemm_batch_type_t::strided),
+        true>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda,
+                                     _stridea, _b, _ldb, _strideb, _beta, _c,
+                                     _ldc, _stridec, batch_size, _dependencies);
+  }
+}
+
+template <bool _t_a, bool _t_b, bool s_a, bool s_b, int jm_M, int jm_N,
+          int jm_K, typename inp_jmT, typename out_jmT, bool is_beta_zero,
+          typename sb_handle_t, typename index_t, typename element_t,
+          typename container_0_t, typename container_1_t,
+          typename container_2_t>
+PORTBLAS_ALWAYS_INLINE
+    typename std::enable_if<jm_M == 32, typename sb_handle_t::event_t>::type
+    launch_gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
+                element_t _alpha, container_0_t _a, index_t _lda,
+                index_t _stridea, container_1_t _b, index_t _ldb,
+                index_t _strideb, element_t _beta, container_2_t _c,
+                index_t _ldc, index_t _stridec, index_t batch_size,
+                gemm_batch_type_t batch_type,
+                const typename sb_handle_t::event_t& _dependencies) {
+  if (_M > 1024 && _N > 1024) {
+    return blas::Gemm_Launcher<
+        container_0_t, container_1_t, container_2_t, 128, false, true, true,
+        128,
+        Tile<8, 16, 16, 8, 16, 2, 1, 1, 1, 1, jm_M, jm_N, jm_K, inp_jmT,
+             out_jmT>,
+        _t_a, _t_b, s_a, s_b, static_cast<int>(gemm_memory_t::local),
+        static_cast<int>(gemm_algorithm_t::standard),
+        static_cast<int>(gemm_vectorization_t::none), is_beta_zero, 1,
+        static_cast<int>(gemm_batch_type_t::strided),
+        true>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda,
+                                     _stridea, _b, _ldb, _strideb, _beta, _c,
+                                     _ldc, _stridec, batch_size, _dependencies);
+  } else if (_M > 64 && _N > 64) {
+    return blas::Gemm_Launcher<
+        container_0_t, container_1_t, container_2_t, 64, false, true, true, 128,
+        Tile<8, 8, 8, 8, 16, 2, 1, 1, 1, 1, jm_M, jm_N, jm_K, inp_jmT, out_jmT>,
+        _t_a, _t_b, s_a, s_b, static_cast<int>(gemm_memory_t::local),
+        static_cast<int>(gemm_algorithm_t::standard),
+        static_cast<int>(gemm_vectorization_t::none), is_beta_zero, 1,
+        static_cast<int>(gemm_batch_type_t::strided),
+        true>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda,
+                                     _stridea, _b, _ldb, _strideb, _beta, _c,
+                                     _ldc, _stridec, batch_size, _dependencies);
+
+  } else {
+    return blas::Gemm_Launcher<
+        container_0_t, container_1_t, container_2_t, 128, false, true, true,
+        128,
+        Tile<2, 4, 16, 8, 16, 2, 1, 1, 1, 1, jm_M, jm_N, jm_K, inp_jmT,
+             out_jmT>,
+        _t_a, _t_b, s_a, s_b, static_cast<int>(gemm_memory_t::local),
+        static_cast<int>(gemm_algorithm_t::standard),
+        static_cast<int>(gemm_vectorization_t::none), is_beta_zero, 1,
+        static_cast<int>(gemm_batch_type_t::strided),
+        true>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda,
+                                     _stridea, _b, _ldb, _strideb, _beta, _c,
+                                     _ldc, _stridec, batch_size, _dependencies);
+  }
+}
+
+template <bool _t_a, bool _t_b, bool s_a, bool s_b, int jm_M, int jm_N,
+          int jm_K, typename inp_jmT, typename out_jmT, bool is_beta_zero,
+          typename sb_handle_t, typename index_t, typename element_t,
+          typename container_0_t, typename container_1_t,
+          typename container_2_t>
+PORTBLAS_ALWAYS_INLINE
+    typename std::enable_if<jm_M == 8, typename sb_handle_t::event_t>::type
+    launch_gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
+                element_t _alpha, container_0_t _a, index_t _lda,
+                index_t _stridea, container_1_t _b, index_t _ldb,
+                index_t _strideb, element_t _beta, container_2_t _c,
+                index_t _ldc, index_t _stridec, index_t batch_size,
+                gemm_batch_type_t batch_type,
+                const typename sb_handle_t::event_t& _dependencies) {
+  if (_M > 1024 && _N > 1024) {
+    return blas::Gemm_Launcher<
+        container_0_t, container_1_t, container_2_t, 256, false, true, true,
+        128,
+        Tile<4, 4, 16, 16, 16, 2, 1, 1, 1, 1, jm_M, jm_N, jm_K, inp_jmT,
+             out_jmT>,
+        _t_a, _t_b, s_a, s_b, static_cast<int>(gemm_memory_t::local),
+        static_cast<int>(gemm_algorithm_t::standard),
+        static_cast<int>(gemm_vectorization_t::none), is_beta_zero, 1,
+        static_cast<int>(gemm_batch_type_t::strided),
+        true>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda,
+                                     _stridea, _b, _ldb, _strideb, _beta, _c,
+                                     _ldc, _stridec, batch_size, _dependencies);
+  } else {
+    return blas::Gemm_Launcher<
+        container_0_t, container_1_t, container_2_t, 128, false, true, true,
+        128,
+        Tile<2, 4, 16, 8, 16, 2, 1, 1, 1, 1, jm_M, jm_N, jm_K, inp_jmT,
+             out_jmT>,
+        _t_a, _t_b, s_a, s_b, static_cast<int>(gemm_memory_t::local),
+        static_cast<int>(gemm_algorithm_t::standard),
+        static_cast<int>(gemm_vectorization_t::none), is_beta_zero, 1,
+        static_cast<int>(gemm_batch_type_t::strided),
+        true>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda,
+                                     _stridea, _b, _ldb, _strideb, _beta, _c,
+                                     _ldc, _stridec, batch_size, _dependencies);
+  }
+}
+
+template <int jm_M, int jm_N, int jm_K, typename inp_jmT, typename out_jmT,
+          bool is_beta_zero, typename sb_handle_t, typename container_0_t,
+          typename container_1_t, typename container_2_t, typename element_t,
+          typename index_t>
+PORTBLAS_ALWAYS_INLINE typename sb_handle_t::event_t launch_gemm_with_transpose(
+    sb_handle_t& sb_handle, char _trans_a, char _trans_b, index_t _M,
+    index_t _N, index_t _K, element_t _alpha, container_0_t _a, index_t _lda,
+    index_t _stridea, container_1_t _b, index_t _ldb, index_t _strideb,
+    element_t _beta, container_2_t _c, index_t _ldc, index_t _stridec,
+    index_t batch_size, gemm_batch_type_t batch_type,
+    const typename sb_handle_t::event_t& _dependencies) {
+  typename sb_handle_t::event_t gemm_event;
+  if (_trans_a == 't' && _trans_b == 't') {
+    gemm_event = launch_gemm<true, true, false, false, jm_M, jm_N, jm_K,
+                             inp_jmT, out_jmT, is_beta_zero>(
+        sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb,
+        _beta, _c, _ldc, _stridec, batch_size, batch_type, _dependencies);
+  } else if (_trans_a == 'n' && _trans_b == 'n') {
+    gemm_event = launch_gemm<false, false, false, false, jm_M, jm_N, jm_K,
+                             inp_jmT, out_jmT, is_beta_zero>(
+        sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb,
+        _beta, _c, _ldc, _stridec, batch_size, batch_type, _dependencies);
+  } else if (_trans_a == 't' && _trans_b == 'n') {
+    gemm_event = launch_gemm<true, false, false, false, jm_M, jm_N, jm_K,
+                             inp_jmT, out_jmT, is_beta_zero>(
+        sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb,
+        _beta, _c, _ldc, _stridec, batch_size, batch_type, _dependencies);
+  } else if (_trans_a == 'n' && _trans_b == 't') {
+    gemm_event = launch_gemm<false, true, false, false, jm_M, jm_N, jm_K,
+                             inp_jmT, out_jmT, is_beta_zero>(
+        sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb,
+        _beta, _c, _ldc, _stridec, batch_size, batch_type, _dependencies);
+  }
+  return gemm_event;
+}
+
+template <int jm_M, int jm_N, int jm_K, typename inp_jmT, typename out_jmT,
+          typename sb_handle_t, typename container_0_t, typename container_1_t,
+          typename container_2_t, typename element_t, typename index_t>
+PORTBLAS_ALWAYS_INLINE typename sb_handle_t::event_t launch_gemm_with_beta(
+    sb_handle_t& sb_handle, char _trans_a, char _trans_b, index_t _M,
+    index_t _N, index_t _K, element_t _alpha, container_0_t _a, index_t _lda,
+    index_t _stridea, container_1_t _b, index_t _ldb, index_t _strideb,
+    element_t _beta, container_2_t _c, index_t _ldc, index_t _stridec,
+    index_t batch_size, gemm_batch_type_t batch_type,
+    const typename sb_handle_t::event_t& _dependencies) {
+  typename sb_handle_t::event_t gemm_event;
+  if (_beta == (element_t)0) {
+    gemm_event =
+        launch_gemm_with_transpose<jm_M, jm_N, jm_K, inp_jmT, out_jmT, true>(
+            sb_handle, _trans_a, _trans_b, _M, _N, _K, _alpha, _a, _lda,
+            _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, batch_size,
+            batch_type, _dependencies);
+  } else {
+    gemm_event =
+        launch_gemm_with_transpose<jm_M, jm_N, jm_K, inp_jmT, out_jmT, false>(
+            sb_handle, _trans_a, _trans_b, _M, _N, _K, _alpha, _a, _lda,
+            _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, batch_size,
+            batch_type, _dependencies);
+  }
+  return gemm_event;
+}
diff --git a/test/unittest/joint_matrix/tf32_float_16_16_8.cpp b/test/unittest/joint_matrix/tf32_float_16_16_8.cpp
new file mode 100644
index 000000000..f7facadc3
--- /dev/null
+++ b/test/unittest/joint_matrix/tf32_float_16_16_8.cpp
@@ -0,0 +1,99 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  portBLAS: BLAS implementation using SYCL
+ *
+ *  @filename tf32_float_16_16_8.cpp
+ *
+ **************************************************************************/
+
+#include "blas_test.hpp"
+#include "joint_matrix_common.hpp"
+
+template <typename scalar_t>
+const auto SmallMatricesTF32Floatm16n16k8 = ::testing::Combine(
+    ::testing::Values("tf32"),                     // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(16),                         // jm_m
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values(8),                          // jm_k
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(11, 16, 32, 63),             // m
+    ::testing::Values(11, 16, 32, 63),             // n
+    ::testing::Values(17, 33, 64),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, SmallMatricesTF32Floatm16n16k8);
+
+template <typename scalar_t>
+const auto MediumMatricesTF32Floatm16n16k8 = ::testing::Combine(
+    ::testing::Values("tf32"),                     // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(16),                         // jm_m
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values(8),                          // jm_k
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(65, 127, 234, 511),          // m
+    ::testing::Values(65, 127, 234, 511),          // n
+    ::testing::Values(65, 127),                    // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, MediumMatricesTF32Floatm16n16k8);
+
+template <typename scalar_t>
+const auto LargeMatricesTF32Floatm16n16k8 = ::testing::Combine(
+    ::testing::Values("tf32"),                     // input type
+    ::testing::Values("float"),                    // output type
+    ::testing::Values(16),                         // jm_m
+    ::testing::Values(16),                         // jm_n
+    ::testing::Values(8),                          // jm_k
+    ::testing::Values("usm", "buf"),               // allocation type
+    ::testing::Values(33),                         // offset
+    ::testing::Values(1),                          // batch
+    ::testing::Values(1024, 1535, 2024),           // m
+    ::testing::Values(1024, 1535, 2024),           // n
+    ::testing::Values(1536, 2049),                 // k
+    ::testing::Values('n', 't'),                   // transa
+    ::testing::Values('n', 't'),                   // transb
+    ::testing::Values<scalar_t>(1.5),              // alpha
+    ::testing::Values<scalar_t>(0, 1.5),           // beta
+    ::testing::Values(1),                          // lda_mul
+    ::testing::Values(1),                          // ldb_mul
+    ::testing::Values(1),                          // ldc_mul
+    ::testing::Values(gemm_batch_type_t::strided)  // batch_type
+);
+GENERATE_JOINTMATRIX_TEST(JointMatrix, LargeMatricesTF32Floatm16n16k8);