diff --git a/src/blas/backends/cublas/cublas_batch.cpp b/src/blas/backends/cublas/cublas_batch.cpp index 031c11a2f..e882f8ee7 100644 --- a/src/blas/backends/cublas/cublas_batch.cpp +++ b/src/blas/backends/cublas/cublas_batch.cpp @@ -29,152 +29,170 @@ namespace column_major { // Buffer APIs -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, - int64_t batch_size) { - throw unimplemented("blas", "copy_batch", "for column_major layout"); +void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, + int64_t incx, int64_t stridex, sycl::buffer &y, + int64_t incy, int64_t stridey, int64_t batch_size) { + throw unimplemented("blas", "copy_batch", "for column_major layout"); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, - int64_t batch_size) { - throw unimplemented("blas", "copy_batch", "for column_major layout"); +void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, + int64_t incx, int64_t stridex, sycl::buffer &y, + int64_t incy, int64_t stridey, int64_t batch_size) { + throw unimplemented("blas", "copy_batch", "for column_major layout"); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue &queue, int64_t n, + sycl::buffer, 1> &x, int64_t incx, + int64_t stridex, sycl::buffer, 1> &y, int64_t incy, int64_t stridey, int64_t batch_size) { - throw unimplemented("blas", "copy_batch", "for column_major layout"); + throw unimplemented("blas", "copy_batch", "for column_major layout"); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue &queue, int64_t n, + sycl::buffer, 1> &x, int64_t incx, + int64_t stridex, sycl::buffer, 1> &y, int64_t incy, int64_t stridey, int64_t batch_size) { - throw unimplemented("blas", "copy_batch", "for column_major layout"); + throw unimplemented("blas", "copy_batch", "for column_major layout"); } -void axpy_batch(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue &queue, int64_t n, float alpha, + sycl::buffer &x, int64_t incx, int64_t stridex, + sycl::buffer &y, int64_t incy, int64_t stridey, int64_t batch_size) { - throw unimplemented("blas", "axpy_batch", "for column_major layout"); + throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -void axpy_batch(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, - int64_t stridey, int64_t batch_size) { - throw unimplemented("blas", "axpy_batch", "for column_major layout"); +void axpy_batch(sycl::queue &queue, int64_t n, double alpha, + sycl::buffer &x, int64_t incx, int64_t stridex, + sycl::buffer &y, int64_t incy, int64_t stridey, + int64_t batch_size) { + throw unimplemented("blas", "axpy_batch", "for column_major layout"); } void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, - int64_t batch_size) { - throw unimplemented("blas", "axpy_batch", "for column_major layout"); + sycl::buffer, 1> &x, int64_t incx, + int64_t stridex, sycl::buffer, 1> &y, + int64_t incy, int64_t stridey, int64_t batch_size) { + throw unimplemented("blas", "axpy_batch", "for column_major layout"); } void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, - int64_t batch_size) { - throw unimplemented("blas", "axpy_batch", "for column_major layout"); + sycl::buffer, 1> &x, int64_t incx, + int64_t stridex, sycl::buffer, 1> &y, + int64_t incy, int64_t stridey, int64_t batch_size) { + throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, sycl::buffer &x, - int64_t incx, int64_t stride_x, float beta, sycl::buffer &y, int64_t incy, - int64_t stride_y, int64_t batch_size) { - throw unimplemented("blas", "gemv_batch", "for column_major layout"); +void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, + float alpha, sycl::buffer &a, int64_t lda, + int64_t stride_a, sycl::buffer &x, int64_t incx, + int64_t stride_x, float beta, sycl::buffer &y, + int64_t incy, int64_t stride_y, int64_t batch_size) { + throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &x, int64_t incx, int64_t stride_x, double beta, - sycl::buffer &y, int64_t incy, int64_t stride_y, int64_t batch_size) { - throw unimplemented("blas", "gemv_batch", "for column_major layout"); +void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, + double alpha, sycl::buffer &a, int64_t lda, + int64_t stride_a, sycl::buffer &x, int64_t incx, + int64_t stride_x, double beta, sycl::buffer &y, + int64_t incy, int64_t stride_y, int64_t batch_size) { + throw unimplemented("blas", "gemv_batch", "for column_major layout"); } void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &x, int64_t incx, - int64_t stride_x, std::complex beta, sycl::buffer, 1> &y, - int64_t incy, int64_t stride_y, int64_t batch_size) { - throw unimplemented("blas", "gemv_batch", "for column_major layout"); + std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &x, + int64_t incx, int64_t stride_x, std::complex beta, + sycl::buffer, 1> &y, int64_t incy, + int64_t stride_y, int64_t batch_size) { + throw unimplemented("blas", "gemv_batch", "for column_major layout"); } void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &x, int64_t incx, - int64_t stride_x, std::complex beta, - sycl::buffer, 1> &y, int64_t incy, int64_t stride_y, - int64_t batch_size) { - throw unimplemented("blas", "gemv_batch", "for column_major layout"); + std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &x, + int64_t incx, int64_t stride_x, std::complex beta, + sycl::buffer, 1> &y, int64_t incy, + int64_t stride_y, int64_t batch_size) { + throw unimplemented("blas", "gemv_batch", "for column_major layout"); } void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stride_a, sycl::buffer &x, - int64_t incx, int64_t stride_x, sycl::buffer &c, int64_t ldc, - int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "dgmm_batch", "for column_major layout"); + sycl::buffer &a, int64_t lda, int64_t stride_a, + sycl::buffer &x, int64_t incx, int64_t stride_x, + sycl::buffer &c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, sycl::buffer &a, int64_t lda, int64_t stride_a, sycl::buffer &x, int64_t incx, int64_t stride_x, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "dgmm_batch", "for column_major layout"); + sycl::buffer &c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &x, int64_t incx, int64_t stride_x, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { - throw unimplemented("blas", "dgmm_batch", "for column_major layout"); + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &x, + int64_t incx, int64_t stride_x, + sycl::buffer, 1> &c, int64_t ldc, + int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &x, int64_t incx, int64_t stride_x, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { - throw unimplemented("blas", "dgmm_batch", "for column_major layout"); + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &x, + int64_t incx, int64_t stride_x, + sycl::buffer, 1> &c, int64_t ldc, + int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } template -inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, int64_t k, Ts alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - Ts beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { - using cuTypeA = typename CudaEquivalentType::Type; - using cuTypeB = typename CudaEquivalentType::Type; - using cuTypeC = typename CudaEquivalentType::Type; - using cuTypeS = typename CudaEquivalentType::Type; - overflow_check(m, n, k, lda, ldb, ldc, stride_a, stride_b, stride_c, batch_size); - - cublasGemmAlgo_t cublas_gemm_algo = CUBLAS_GEMM_DEFAULT; - queue.submit([&](sycl::handler &cgh) { - if (!verify_support(queue, sycl::aspect::fp16)) { - throw oneapi::mkl::unimplemented( - "blas", "sycl::half", "half is not supported by the device or the sycl compiler"); - } - auto a_acc = a.template get_access(cgh); - auto b_acc = b.template get_access(cgh); - auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { - auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); - cublasStatus_t err; +inline void gemm_batch_impl(sycl::queue &queue, transpose transa, + transpose transb, int64_t m, int64_t n, int64_t k, + Ts alpha, sycl::buffer &a, int64_t lda, + int64_t stride_a, sycl::buffer &b, + int64_t ldb, int64_t stride_b, Ts beta, + sycl::buffer &c, int64_t ldc, + int64_t stride_c, int64_t batch_size) { + using cuTypeA = typename CudaEquivalentType::Type; + using cuTypeB = typename CudaEquivalentType::Type; + using cuTypeC = typename CudaEquivalentType::Type; + using cuTypeS = typename CudaEquivalentType::Type; + overflow_check(m, n, k, lda, ldb, ldc, stride_a, stride_b, stride_c, + batch_size); + + cublasGemmAlgo_t cublas_gemm_algo = CUBLAS_GEMM_DEFAULT; + queue.submit([&](sycl::handler &cgh) { + if (!verify_support(queue, + sycl::aspect::fp16)) { + throw oneapi::mkl::unimplemented( + "blas", "sycl::half", + "half is not supported by the device or the sycl compiler"); + } + auto a_acc = a.template get_access(cgh); + auto b_acc = b.template get_access(cgh); + auto c_acc = c.template get_access(cgh); + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + auto handle = sc.get_handle(queue); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); + cublasStatus_t err; #ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND - CUBLAS_ERROR_FUNC_T("cublasGemmStridedBatchedEx", cublasGemmStridedBatchedEx, - err, handle, get_cublas_operation(transa), - get_cublas_operation(transb), m, n, k, &alpha, a_, - get_cublas_datatype(), lda, stride_a, b_, - get_cublas_datatype(), ldb, stride_b, &beta, c_, - get_cublas_datatype(), ldc, stride_c, batch_size, - get_cublas_datatype(), cublas_gemm_algo); + CUBLAS_ERROR_FUNC_T( + "cublasGemmStridedBatchedEx", cublasGemmStridedBatchedEx, err, handle, + get_cublas_operation(transa), get_cublas_operation(transb), m, n, k, + &alpha, a_, get_cublas_datatype(), lda, stride_a, b_, + get_cublas_datatype(), ldb, stride_b, &beta, c_, + get_cublas_datatype(), ldc, stride_c, batch_size, + get_cublas_datatype(), cublas_gemm_algo); #else CUBLAS_ERROR_FUNC_T_SYNC("cublasGemmStridedBatchedEx", cublasGemmStridedBatchedEx, err, handle, @@ -186,449 +204,527 @@ inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose tran batch_size, get_cublas_datatype(), cublas_gemm_algo); #endif - }); }); + }); } -#define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, sycl::buffer &a, int64_t lda, \ - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, \ - TYPE_S beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, \ - int64_t batch_size) { \ - gemm_batch_impl(queue, transa, transb, m, n, k, alpha, a, \ - lda, stride_a, b, ldb, stride_b, beta, c, \ - ldc, stride_c, batch_size); \ - } +#define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ + void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, \ + int64_t m, int64_t n, int64_t k, TYPE_S alpha, \ + sycl::buffer &a, int64_t lda, int64_t stride_a, \ + sycl::buffer &b, int64_t ldb, int64_t stride_b, \ + TYPE_S beta, sycl::buffer &c, int64_t ldc, \ + int64_t stride_c, int64_t batch_size) { \ + gemm_batch_impl( \ + queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, \ + stride_b, beta, c, ldc, stride_c, batch_size); \ + } GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, sycl::half, sycl::half) GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, float, float) GEMM_STRIDED_BATCH_LAUNCHER(float, float, float, float) GEMM_STRIDED_BATCH_LAUNCHER(double, double, double, double) -GEMM_STRIDED_BATCH_LAUNCHER(std::complex, std::complex, std::complex, - std::complex) -GEMM_STRIDED_BATCH_LAUNCHER(std::complex, std::complex, std::complex, - std::complex) +GEMM_STRIDED_BATCH_LAUNCHER(std::complex, std::complex, + std::complex, std::complex) +GEMM_STRIDED_BATCH_LAUNCHER(std::complex, std::complex, + std::complex, std::complex) #undef GEMM_STRIDED_BATCH_LAUNCHER -#define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, sycl::buffer &a, int64_t lda, \ - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, \ - TYPE_S beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, \ - int64_t batch_size) { \ - throw unimplemented("blas", "gemm_batch", \ - std::string("for dtype unimplemented dtype combination <") + \ - dtype_string() + "," + dtype_string() + "," + \ - dtype_string() + "," + dtype_string() + ">"); \ - } +#define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ + void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, \ + int64_t m, int64_t n, int64_t k, TYPE_S alpha, \ + sycl::buffer &a, int64_t lda, int64_t stride_a, \ + sycl::buffer &b, int64_t ldb, int64_t stride_b, \ + TYPE_S beta, sycl::buffer &c, int64_t ldc, \ + int64_t stride_c, int64_t batch_size) { \ + throw unimplemented( \ + "blas", "gemm_batch", \ + std::string("for dtype unimplemented dtype combination <") + \ + dtype_string() + "," + dtype_string() + "," + \ + dtype_string() + "," + dtype_string() + ">"); \ + } GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, float, float) GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, std::int32_t, float) #undef GEMM_STRIDED_BATCH_LAUNCHER -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + float alpha, sycl::buffer &a, int64_t lda, + int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { - throw unimplemented("blas", "trsm_batch", "for column_major layout"); + throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + double alpha, sycl::buffer &a, int64_t lda, + int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { - throw unimplemented("blas", "trsm_batch", "for column_major layout"); + throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - int64_t batch_size) { - throw unimplemented("blas", "trsm_batch", "for column_major layout"); +void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &b, + int64_t ldb, int64_t stride_b, int64_t batch_size) { + throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - int64_t batch_size) { - throw unimplemented("blas", "trsm_batch", "for column_major layout"); +void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &b, + int64_t ldb, int64_t stride_b, int64_t batch_size) { + throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, float beta, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "syrk_batch", "for column_major layout"); +void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, float alpha, sycl::buffer &a, + int64_t lda, int64_t stride_a, float beta, + sycl::buffer &c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { + throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, double alpha, sycl::buffer &a, + int64_t lda, int64_t stride_a, double beta, + sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "syrk_batch", "for column_major layout"); + throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, std::complex beta, sycl::buffer, 1> &c, - int64_t ldc, int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "syrk_batch", "for column_major layout"); +void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, std::complex beta, + sycl::buffer, 1> &c, int64_t ldc, + int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { - throw unimplemented("blas", "syrk_batch", "for column_major layout"); + sycl::buffer, 1> &c, int64_t ldc, + int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { - throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, + float alpha, sycl::buffer &a, int64_t lda, + int64_t stride_a, sycl::buffer &b, int64_t ldb, + int64_t stride_b, int64_t batch_size) { + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { - throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, + double alpha, sycl::buffer &a, int64_t lda, + int64_t stride_a, sycl::buffer &b, int64_t ldb, + int64_t stride_b, int64_t batch_size) { + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &b, int64_t ldb, - int64_t stride_b, int64_t batch_size) { - throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); + std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &b, + int64_t ldb, int64_t stride_b, int64_t batch_size) { + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, + std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { - throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, - int64_t batch_size) { - throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, + float alpha, sycl::buffer &ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size) { + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, - int64_t batch_size) { - throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, + double alpha, sycl::buffer &ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size) { + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, - int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { - throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); + std::complex alpha, + sycl::buffer, 1> &ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size) { + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, - int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { - throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); + std::complex alpha, + sycl::buffer, 1> &ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size) { + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - float beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "omatadd_batch", "for column_major layout"); +void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, + int64_t m, int64_t n, float alpha, sycl::buffer &a, + int64_t lda, int64_t stride_a, float beta, + sycl::buffer &b, int64_t ldb, int64_t stride_b, + sycl::buffer &c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "omatadd_batch", "for column_major layout"); +void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, + int64_t m, int64_t n, double alpha, + sycl::buffer &a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer &b, int64_t ldb, + int64_t stride_b, sycl::buffer &c, int64_t ldc, + int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, + int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { - throw unimplemented("blas", "omatadd_batch", "for column_major layout"); + sycl::buffer, 1> &b, int64_t ldb, + int64_t stride_b, sycl::buffer, 1> &c, + int64_t ldc, int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { - throw unimplemented("blas", "omatadd_batch", "for column_major layout"); +void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, + int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, std::complex beta, + sycl::buffer, 1> &b, int64_t ldb, + int64_t stride_b, sycl::buffer, 1> &c, + int64_t ldc, int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } // USM APIs -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, int64_t *incx, float **y, - int64_t *incy, int64_t group_count, int64_t *group_size, +sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, + int64_t *incx, float **y, int64_t *incy, + int64_t group_count, int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for column_major layout"); + throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, int64_t *incx, double **y, - int64_t *incy, int64_t group_count, int64_t *group_size, +sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, + int64_t *incx, double **y, int64_t *incy, + int64_t group_count, int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for column_major layout"); + throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, int64_t *incx, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for column_major layout"); +sycl::event copy_batch(sycl::queue &queue, int64_t *n, + const std::complex **x, int64_t *incx, + std::complex **y, int64_t *incy, + int64_t group_count, int64_t *group_size, + const std::vector &dependencies) { + throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, - int64_t *incx, std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for column_major layout"); +sycl::event copy_batch(sycl::queue &queue, int64_t *n, + const std::complex **x, int64_t *incx, + std::complex **y, int64_t *incy, + int64_t group_count, int64_t *group_size, + const std::vector &dependencies) { + throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, int64_t incx, - std::int64_t stridex, float *y, int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for column_major layout"); +sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, + int64_t incx, std::int64_t stridex, float *y, + int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, int64_t incx, - std::int64_t stridex, double *y, int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for column_major layout"); +sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, + int64_t incx, std::int64_t stridex, double *y, + int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::int64_t stridex, std::complex *y, int64_t incy, - std::int64_t stridey, std::int64_t batch_size, +sycl::event copy_batch(sycl::queue &queue, int64_t n, + const std::complex *x, int64_t incx, + std::int64_t stridex, std::complex *y, + int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for column_major layout"); + throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::int64_t stridex, std::complex *y, int64_t incy, - std::int64_t stridey, std::int64_t batch_size, +sycl::event copy_batch(sycl::queue &queue, int64_t n, + const std::complex *x, int64_t incx, + std::int64_t stridex, std::complex *y, + int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for column_major layout"); + throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, const float **x, int64_t *incx, - float **y, int64_t *incy, int64_t group_count, int64_t *group_size, +sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, + const float **x, int64_t *incx, float **y, int64_t *incy, + int64_t group_count, int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for column_major layout"); + throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, const double **x, - int64_t *incx, double **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for column_major layout"); +sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, + const double **x, int64_t *incx, double **y, + int64_t *incy, int64_t group_count, int64_t *group_size, + const std::vector &dependencies) { + throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, +sycl::event axpy_batch(sycl::queue &queue, int64_t *n, + std::complex *alpha, + const std::complex **x, int64_t *incx, + std::complex **y, int64_t *incy, + int64_t group_count, int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for column_major layout"); + throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, +sycl::event axpy_batch(sycl::queue &queue, int64_t *n, + std::complex *alpha, + const std::complex **x, int64_t *incx, + std::complex **y, int64_t *incy, + int64_t group_count, int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for column_major layout"); + throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, - int64_t stridex, float *y, int64_t incy, int64_t stridey, int64_t batch_size, +sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, + const float *x, int64_t incx, int64_t stridex, float *y, + int64_t incy, int64_t stridey, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for column_major layout"); + throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, - int64_t stridex, double *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for column_major layout"); +sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, + const double *x, int64_t incx, int64_t stridex, + double *y, int64_t incy, int64_t stridey, + int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "axpy_batch", "for column_major layout"); } sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::complex *x, int64_t incx, + int64_t stridex, std::complex *y, int64_t incy, + int64_t stridey, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for column_major layout"); + throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, +sycl::event axpy_batch(sycl::queue &queue, int64_t n, + std::complex alpha, + const std::complex *x, int64_t incx, + int64_t stridex, std::complex *y, int64_t incy, + int64_t stridey, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for column_major layout"); + throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, const float *x, int64_t incx, - int64_t stride_x, float beta, float *y, int64_t incy, int64_t stride_y, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for column_major layout"); +sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, + int64_t n, float alpha, const float *a, int64_t lda, + int64_t stride_a, const float *x, int64_t incx, + int64_t stride_x, float beta, float *y, int64_t incy, + int64_t stride_y, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, const double *x, - int64_t incx, int64_t stride_x, double beta, double *y, int64_t incy, +sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, + int64_t n, double alpha, const double *a, int64_t lda, + int64_t stride_a, const double *x, int64_t incx, + int64_t stride_x, double beta, double *y, int64_t incy, int64_t stride_y, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for column_major layout"); + throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, const std::complex *x, int64_t incx, - int64_t stride_x, std::complex beta, std::complex *y, - int64_t incy, int64_t stride_y, int64_t batch_size, +sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, + int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, const std::complex *x, + int64_t incx, int64_t stride_x, std::complex beta, + std::complex *y, int64_t incy, int64_t stride_y, + int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for column_major layout"); + throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, const std::complex *x, int64_t incx, - int64_t stride_x, std::complex beta, std::complex *y, +sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, + int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, const std::complex *x, + int64_t incx, int64_t stride_x, + std::complex beta, std::complex *y, int64_t incy, int64_t stride_y, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for column_major layout"); + throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, float *alpha, - const float **a, int64_t *lda, const float **x, int64_t *incx, float *beta, - float **y, int64_t *incy, int64_t group_count, int64_t *groupsize, +sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, + int64_t *n, float *alpha, const float **a, int64_t *lda, + const float **x, int64_t *incx, float *beta, float **y, + int64_t *incy, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for column_major layout"); + throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, double *alpha, - const double **a, int64_t *lda, const double **x, int64_t *incx, - double *beta, double **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for column_major layout"); +sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, + int64_t *n, double *alpha, const double **a, + int64_t *lda, const double **x, int64_t *incx, + double *beta, double **y, int64_t *incy, + int64_t group_count, int64_t *groupsize, + const std::vector &dependencies) { + throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex *beta, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for column_major layout"); +sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, + int64_t *n, std::complex *alpha, + const std::complex **a, int64_t *lda, + const std::complex **x, int64_t *incx, + std::complex *beta, std::complex **y, + int64_t *incy, int64_t group_count, int64_t *groupsize, + const std::vector &dependencies) { + throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex *beta, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for column_major layout"); +sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, + int64_t *n, std::complex *alpha, + const std::complex **a, int64_t *lda, + const std::complex **x, int64_t *incx, + std::complex *beta, std::complex **y, + int64_t *incy, int64_t group_count, int64_t *groupsize, + const std::vector &dependencies) { + throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const float *a, - int64_t lda, int64_t stride_a, const float *x, int64_t incx, - int64_t stride_x, float *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, + int64_t n, const float *a, int64_t lda, int64_t stride_a, + const float *x, int64_t incx, int64_t stride_x, float *c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const double *a, - int64_t lda, int64_t stride_a, const double *x, int64_t incx, - int64_t stride_x, double *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, + int64_t n, const double *a, int64_t lda, + int64_t stride_a, const double *x, int64_t incx, + int64_t stride_x, double *c, int64_t ldc, + int64_t stride_c, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *x, int64_t incx, int64_t stride_x, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, + int64_t n, const std::complex *a, int64_t lda, + int64_t stride_a, const std::complex *x, + int64_t incx, int64_t stride_x, std::complex *c, + int64_t ldc, int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for column_major layout"); + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *x, int64_t incx, int64_t stride_x, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, + int64_t n, const std::complex *a, int64_t lda, + int64_t stride_a, const std::complex *x, + int64_t incx, int64_t stride_x, std::complex *c, + int64_t ldc, int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for column_major layout"); + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const float **a, int64_t *lda, const float **x, int64_t *incx, float **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, +sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, + int64_t *n, const float **a, int64_t *lda, + const float **x, int64_t *incx, float **c, int64_t *ldc, + int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for column_major layout"); + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const double **a, int64_t *lda, const double **x, int64_t *incx, double **c, +sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, + int64_t *n, const double **a, int64_t *lda, + const double **x, int64_t *incx, double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for column_major layout"); + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, const std::complex **x, - int64_t *incx, std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, + int64_t *n, const std::complex **a, int64_t *lda, + const std::complex **x, int64_t *incx, + std::complex **c, int64_t *ldc, + int64_t group_count, int64_t *groupsize, + const std::vector &dependencies) { + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, const std::complex **x, - int64_t *incx, std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, + int64_t *n, const std::complex **a, int64_t *lda, + const std::complex **x, int64_t *incx, + std::complex **c, int64_t *ldc, + int64_t group_count, int64_t *groupsize, + const std::vector &dependencies) { + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } template -inline sycl::event gemm_batch_strided_usm_impl(sycl::queue &queue, transpose transa, - transpose transb, int64_t m, int64_t n, int64_t k, - Ts alpha, const Ta *a, int64_t lda, int64_t stride_a, - const Tb *b, int64_t ldb, int64_t stride_b, Ts beta, - Tc *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, - const std::vector &dependencies) { - using cuTypeA = typename CudaEquivalentType::Type; - using cuTypeB = typename CudaEquivalentType::Type; - using cuTypeC = typename CudaEquivalentType::Type; - using cuTypeS = typename CudaEquivalentType::Type; - overflow_check(m, n, k, lda, ldb, ldc, stride_a, stride_b, stride_c, batch_size); - - cublasGemmAlgo_t cublas_gemm_algo = CUBLAS_GEMM_DEFAULT; - auto done = queue.submit([&](sycl::handler &cgh) { - if (!verify_support(queue, sycl::aspect::fp16)) { - throw oneapi::mkl::unimplemented( - "blas", "sycl::half", "half is not supported by the device or the sycl compiler"); - } - int64_t num_events = dependencies.size(); - for (int64_t i = 0; i < num_events; i++) { - cgh.depends_on(dependencies[i]); - } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { - auto handle = sc.get_handle(queue); - cublasStatus_t err; +inline sycl::event gemm_batch_strided_usm_impl( + sycl::queue &queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, Ts alpha, const Ta *a, int64_t lda, int64_t stride_a, + const Tb *b, int64_t ldb, int64_t stride_b, Ts beta, Tc *c, int64_t ldc, + int64_t stride_c, int64_t batch_size, + const std::vector &dependencies) { + using cuTypeA = typename CudaEquivalentType::Type; + using cuTypeB = typename CudaEquivalentType::Type; + using cuTypeC = typename CudaEquivalentType::Type; + using cuTypeS = typename CudaEquivalentType::Type; + overflow_check(m, n, k, lda, ldb, ldc, stride_a, stride_b, stride_c, + batch_size); + + cublasGemmAlgo_t cublas_gemm_algo = CUBLAS_GEMM_DEFAULT; + auto done = queue.submit([&](sycl::handler &cgh) { + if (!verify_support(queue, + sycl::aspect::fp16)) { + throw oneapi::mkl::unimplemented( + "blas", "sycl::half", + "half is not supported by the device or the sycl compiler"); + } + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + auto handle = sc.get_handle(queue); + cublasStatus_t err; #ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND - CUBLAS_ERROR_FUNC_T("cublasGemmStridedBatchedEx", cublasGemmStridedBatchedEx, - err, handle, get_cublas_operation(transa), - get_cublas_operation(transb), m, n, k, &alpha, a, - get_cublas_datatype(), lda, stride_a, b, - get_cublas_datatype(), ldb, stride_b, &beta, c, - get_cublas_datatype(), ldc, stride_c, batch_size, - get_cublas_datatype(), cublas_gemm_algo); + CUBLAS_ERROR_FUNC_T( + "cublasGemmStridedBatchedEx", cublasGemmStridedBatchedEx, err, handle, + get_cublas_operation(transa), get_cublas_operation(transb), m, n, k, + &alpha, a, get_cublas_datatype(), lda, stride_a, b, + get_cublas_datatype(), ldb, stride_b, &beta, c, + get_cublas_datatype(), ldc, stride_c, batch_size, + get_cublas_datatype(), cublas_gemm_algo); #else CUBLAS_ERROR_FUNC_T_SYNC("cublasGemmStridedBatchedEx", cublasGemmStridedBatchedEx, err, handle, @@ -640,44 +736,47 @@ inline sycl::event gemm_batch_strided_usm_impl(sycl::queue &queue, transpose tra batch_size, get_cublas_datatype(), cublas_gemm_algo); #endif - }); }); - return done; -} - -#define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, \ - int64_t stride_a, const TYPE_B *b, int64_t ldb, int64_t stride_b, \ - TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stride_c, \ - int64_t batch_size, const std::vector &dependencies) { \ - return gemm_batch_strided_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, \ - stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, \ - batch_size, dependencies); \ - } + }); + return done; +} + +#define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ + sycl::event gemm_batch( \ + sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, \ + int64_t stride_a, const TYPE_B *b, int64_t ldb, int64_t stride_b, \ + TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stride_c, \ + int64_t batch_size, const std::vector &dependencies) { \ + return gemm_batch_strided_usm_impl( \ + queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, \ + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); \ + } GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half) GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float) GEMM_STRIDED_BATCH_LAUNCHER_USM(float, float, float, float) GEMM_STRIDED_BATCH_LAUNCHER_USM(double, double, double, double) -GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex, std::complex, std::complex, - std::complex) -GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex, std::complex, std::complex, - std::complex) +GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex, std::complex, + std::complex, std::complex) +GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex, std::complex, + std::complex, std::complex) #undef GEMM_STRIDED_BATCH_LAUNCHER_USM -#define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, \ - int64_t stride_a, const TYPE_B *b, int64_t ldb, int64_t stride_b, \ - TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stride_c, \ - int64_t batch_size, const std::vector &dependencies) { \ - throw unimplemented("blas", "gemm_batch", \ - std::string("for dtype unimplemented dtype combination <") + \ - dtype_string() + "," + dtype_string() + "," + \ - dtype_string() + "," + dtype_string() + ">"); \ - } +#define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ + sycl::event gemm_batch( \ + sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, \ + int64_t stride_a, const TYPE_B *b, int64_t ldb, int64_t stride_b, \ + TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stride_c, \ + int64_t batch_size, const std::vector &dependencies) { \ + throw unimplemented( \ + "blas", "gemm_batch", \ + std::string("for dtype unimplemented dtype combination <") + \ + dtype_string() + "," + dtype_string() + "," + \ + dtype_string() + "," + dtype_string() + ">"); \ + } GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, float, float) GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float) @@ -685,45 +784,48 @@ GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float) #undef GEMM_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, transpose *transb, - int64_t *m, int64_t *n, int64_t *k, Ts *alpha, const Ta **a, - int64_t *lda, const Tb **b, int64_t *ldb, Ts *beta, Tc **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { - using cuTypeA = typename CudaEquivalentType::Type; - using cuTypeB = typename CudaEquivalentType::Type; - using cuTypeC = typename CudaEquivalentType::Type; - using cuTypeS = typename CudaEquivalentType::Type; - for (int64_t i = 0; i < group_count; i++) { - overflow_check(m[i], n[i], k[i], lda[i], ldb[i], ldc[i], group_size[i]); +inline sycl::event +gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, transpose *transb, + int64_t *m, int64_t *n, int64_t *k, Ts *alpha, const Ta **a, + int64_t *lda, const Tb **b, int64_t *ldb, Ts *beta, Tc **c, + int64_t *ldc, int64_t group_count, int64_t *group_size, + const std::vector &dependencies) { + using cuTypeA = typename CudaEquivalentType::Type; + using cuTypeB = typename CudaEquivalentType::Type; + using cuTypeC = typename CudaEquivalentType::Type; + using cuTypeS = typename CudaEquivalentType::Type; + for (int64_t i = 0; i < group_count; i++) { + overflow_check(m[i], n[i], k[i], lda[i], ldb[i], ldc[i], group_size[i]); + } + + cublasGemmAlgo_t cublas_gemm_algo = CUBLAS_GEMM_DEFAULT; + auto done = queue.submit([&](sycl::handler &cgh) { + if (!verify_support(queue, + sycl::aspect::fp16)) { + throw oneapi::mkl::unimplemented( + "blas", "sycl::half", + "half is not supported by the device or the sycl compiler"); } - - cublasGemmAlgo_t cublas_gemm_algo = CUBLAS_GEMM_DEFAULT; - auto done = queue.submit([&](sycl::handler &cgh) { - if (!verify_support(queue, sycl::aspect::fp16)) { - throw oneapi::mkl::unimplemented( - "blas", "sycl::half", "half is not supported by the device or the sycl compiler"); - } - int64_t num_events = dependencies.size(); - for (int64_t i = 0; i < num_events; i++) { - cgh.depends_on(dependencies[i]); - } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { - auto handle = sc.get_handle(queue); - int64_t offset = 0; - cublasStatus_t err; - for (int64_t i = 0; i < group_count; i++) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + auto handle = sc.get_handle(queue); + int64_t offset = 0; + cublasStatus_t err; + for (int64_t i = 0; i < group_count; i++) { #ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND - CUBLAS_ERROR_FUNC_T("cublasGemmBatchedEx", cublasGemmBatchedEx, err, handle, - get_cublas_operation(transa[i]), - get_cublas_operation(transb[i]), (int)m[i], (int)n[i], - (int)k[i], &alpha[i], (const void *const *)(a + offset), - get_cublas_datatype(), (int)lda[i], - (const void *const *)(b + offset), - get_cublas_datatype(), (int)ldb[i], &beta[i], - (void *const *)(c + offset), get_cublas_datatype(), - (int)ldc[i], (int)group_size[i], - get_cublas_datatype(), cublas_gemm_algo); + CUBLAS_ERROR_FUNC_T( + "cublasGemmBatchedEx", cublasGemmBatchedEx, err, handle, + get_cublas_operation(transa[i]), get_cublas_operation(transb[i]), + (int)m[i], (int)n[i], (int)k[i], &alpha[i], + (const void *const *)(a + offset), get_cublas_datatype(), + (int)lda[i], (const void *const *)(b + offset), + get_cublas_datatype(), (int)ldb[i], &beta[i], + (void *const *)(c + offset), get_cublas_datatype(), + (int)ldc[i], (int)group_size[i], get_cublas_datatype(), + cublas_gemm_algo); #else CUBLAS_ERROR_FUNC_T_SYNC( "cublasGemmBatchedEx", cublasGemmBatchedEx, err, handle, @@ -735,127 +837,139 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, tr get_cublas_datatype(), (int)ldc[i], (int)group_size[i], get_cublas_datatype(), cublas_gemm_algo); #endif - offset += group_size[i]; - } - }); + offset += group_size[i]; + } }); - return done; -} - -#define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, \ - int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda, \ - const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ - return gemm_batch_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, \ - ldc, group_count, group_size, dependencies); \ - } + }); + return done; +} + +#define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ + sycl::event gemm_batch( \ + sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, \ + int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda, \ + const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \ + int64_t group_count, int64_t *group_size, \ + const std::vector &dependencies) { \ + return gemm_batch_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, \ + b, ldb, beta, c, ldc, group_count, group_size, \ + dependencies); \ + } GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half) GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float) GEMM_BATCH_LAUNCHER_USM(float, float, float, float) GEMM_BATCH_LAUNCHER_USM(double, double, double, double) -GEMM_BATCH_LAUNCHER_USM(std::complex, std::complex, std::complex, - std::complex) -GEMM_BATCH_LAUNCHER_USM(std::complex, std::complex, std::complex, - std::complex) +GEMM_BATCH_LAUNCHER_USM(std::complex, std::complex, + std::complex, std::complex) +GEMM_BATCH_LAUNCHER_USM(std::complex, std::complex, + std::complex, std::complex) #undef GEMM_BATCH_LAUNCHER_USM -#define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, \ - int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda, \ - const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ - throw unimplemented("blas", "gemm_batch", \ - std::string("for dtype unimplemented dtype combination <") + \ - dtype_string() + "," + dtype_string() + "," + \ - dtype_string() + "," + dtype_string() + ">"); \ - } +#define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ + sycl::event gemm_batch( \ + sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, \ + int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda, \ + const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \ + int64_t group_count, int64_t *group_size, \ + const std::vector &dependencies) { \ + throw unimplemented( \ + "blas", "gemm_batch", \ + std::string("for dtype unimplemented dtype combination <") + \ + dtype_string() + "," + dtype_string() + "," + \ + dtype_string() + "," + dtype_string() + ">"); \ + } GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, float, float) GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float) #undef GEMM_BATCH_LAUNCHER_USM -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, - int64_t lda, int64_t stride_a, float *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "trsm_batch", "for column_major layout"); +sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + float alpha, const float *a, int64_t lda, + int64_t stride_a, float *b, int64_t ldb, + int64_t stride_b, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, - int64_t lda, int64_t stride_a, double *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "trsm_batch", "for column_major layout"); +sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + double alpha, const double *a, int64_t lda, + int64_t stride_a, double *b, int64_t ldb, + int64_t stride_b, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - std::complex *b, int64_t ldb, int64_t stride_b, int64_t batch_size, +sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + std::complex alpha, const std::complex *a, + int64_t lda, int64_t stride_a, std::complex *b, + int64_t ldb, int64_t stride_b, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "trsm_batch", "for column_major layout"); + throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - std::complex *b, int64_t ldb, int64_t stride_b, int64_t batch_size, +sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex *b, int64_t ldb, + int64_t stride_b, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "trsm_batch", "for column_major layout"); + throw unimplemented("blas", "trsm_batch", "for column_major layout"); } template -inline sycl::event trsm_batch(const char *func_name, Func func, sycl::queue &queue, - side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, T *alpha, const T **a, - int64_t *lda, T **b, int64_t *ldb, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { - using cuDataType = typename CudaEquivalentType::Type; - for (int64_t i = 0; i < group_count; i++) { - overflow_check(m[i], n[i], lda[i], ldb[i], group_size[i]); +inline sycl::event +trsm_batch(const char *func_name, Func func, sycl::queue &queue, + side *left_right, uplo *upper_lower, transpose *trans, + diag *unit_diag, int64_t *m, int64_t *n, T *alpha, const T **a, + int64_t *lda, T **b, int64_t *ldb, int64_t group_count, + int64_t *group_size, const std::vector &dependencies) { + using cuDataType = typename CudaEquivalentType::Type; + for (int64_t i = 0; i < group_count; i++) { + overflow_check(m[i], n[i], lda[i], ldb[i], group_size[i]); + } + auto done = queue.submit([&](sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); } - auto done = queue.submit([&](sycl::handler &cgh) { - int64_t num_events = dependencies.size(); - for (int64_t i = 0; i < num_events; i++) { - cgh.depends_on(dependencies[i]); - } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { - auto handle = sc.get_handle(queue); - int64_t offset = 0; - cublasStatus_t err; - for (int64_t i = 0; i < group_count; i++) { - auto **a_ = reinterpret_cast(a); - auto **b_ = reinterpret_cast(b); - cublas_native_named_func(func_name, func, err, handle, - get_cublas_side_mode(left_right[i]), - get_cublas_fill_mode(upper_lower[i]), - get_cublas_operation(trans[i]), - get_cublas_diag_type(unit_diag[i]), (int)m[i], (int)n[i], - (cuDataType *)&alpha[i], a_ + offset, (int)lda[i], - b_ + offset, (int)ldb[i], (int)group_size[i]); - offset += group_size[i]; - } - }); + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + auto handle = sc.get_handle(queue); + int64_t offset = 0; + cublasStatus_t err; + for (int64_t i = 0; i < group_count; i++) { + auto **a_ = reinterpret_cast(a); + auto **b_ = reinterpret_cast(b); + cublas_native_named_func( + func_name, func, err, handle, get_cublas_side_mode(left_right[i]), + get_cublas_fill_mode(upper_lower[i]), + get_cublas_operation(trans[i]), get_cublas_diag_type(unit_diag[i]), + (int)m[i], (int)n[i], (cuDataType *)&alpha[i], a_ + offset, + (int)lda[i], b_ + offset, (int)ldb[i], (int)group_size[i]); + offset += group_size[i]; + } }); - return done; -} - -#define TRSM_BATCH_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, \ - transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, TYPE *alpha, \ - const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ - return trsm_batch(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, \ - unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, \ - dependencies); \ - } + }); + return done; +} + +#define TRSM_BATCH_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + sycl::event trsm_batch(sycl::queue &queue, side *left_right, \ + uplo *upper_lower, transpose *trans, diag *unit_diag, \ + int64_t *m, int64_t *n, TYPE *alpha, const TYPE **a, \ + int64_t *lda, TYPE **b, int64_t *ldb, \ + int64_t group_count, int64_t *group_size, \ + const std::vector &dependencies) { \ + return trsm_batch(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, \ + upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, \ + ldb, group_count, group_size, dependencies); \ + } TRSM_BATCH_LAUNCHER_USM(float, cublasStrsmBatched) TRSM_BATCH_LAUNCHER_USM(double, cublasDtrsmBatched) @@ -864,209 +978,249 @@ TRSM_BATCH_LAUNCHER_USM(std::complex, cublasZtrsmBatched) #undef TRSM_BATCH_LAUNCHER_USM -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, float *alpha, const float **a, int64_t *lda, float *beta, - float **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, +sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, + int64_t *n, int64_t *k, float *alpha, const float **a, + int64_t *lda, float *beta, float **c, int64_t *ldc, + int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for column_major layout"); + throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, double *alpha, const double **a, int64_t *lda, double *beta, - double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, +sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, + int64_t *n, int64_t *k, double *alpha, const double **a, + int64_t *lda, double *beta, double **c, int64_t *ldc, + int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for column_major layout"); + throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, std::complex **c, +sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, + int64_t *n, int64_t *k, std::complex *alpha, + const std::complex **a, int64_t *lda, + std::complex *beta, std::complex **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for column_major layout"); + throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, std::complex **c, +sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, + int64_t *n, int64_t *k, std::complex *alpha, + const std::complex **a, int64_t *lda, + std::complex *beta, std::complex **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for column_major layout"); + throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, const float *a, int64_t lda, int64_t stride_a, float beta, - float *c, int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, float alpha, const float *a, + int64_t lda, int64_t stride_a, float beta, float *c, + int64_t ldc, int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for column_major layout"); + throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, const double *a, int64_t lda, int64_t stride_a, double beta, - double *c, int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, double alpha, const double *a, + int64_t lda, int64_t stride_a, double beta, double *c, + int64_t ldc, int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for column_major layout"); + throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex beta, std::complex *c, - int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex beta, + std::complex *c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for column_major layout"); + throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex beta, std::complex *c, - int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex beta, + std::complex *c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for column_major layout"); + throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb, +sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, float alpha, const float *a, int64_t lda, + int64_t stride_a, float *b, int64_t ldb, int64_t stride_b, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb, - int64_t stride_b, int64_t batch_size, +sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, double alpha, const double *a, + int64_t lda, int64_t stride_a, double *b, + int64_t ldb, int64_t stride_b, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex *b, + int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex *b, + int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, +sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, float alpha, float *ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, +sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, double alpha, double *ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, - int64_t ldb, int64_t stride, int64_t batch_size, +sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, std::complex alpha, + std::complex *ab, int64_t lda, int64_t ldb, + int64_t stride, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, - int64_t ldb, int64_t stride, int64_t batch_size, +sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, std::complex alpha, + std::complex *ab, int64_t lda, int64_t ldb, + int64_t stride, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a, - float beta, const float *b, int64_t ldb, int64_t stride_b, float *c, - int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event omatadd_batch(sycl::queue &queue, transpose transa, + transpose transb, int64_t m, int64_t n, float alpha, + const float *a, int64_t lda, int64_t stride_a, + float beta, const float *b, int64_t ldb, + int64_t stride_b, float *c, int64_t ldc, + int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatadd_batch", "for column_major layout"); + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a, - double beta, const double *b, int64_t ldb, int64_t stride_b, double *c, - int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event omatadd_batch(sycl::queue &queue, transpose transa, + transpose transb, int64_t m, int64_t n, double alpha, + const double *a, int64_t lda, int64_t stride_a, + double beta, const double *b, int64_t ldb, + int64_t stride_b, double *c, int64_t ldc, + int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatadd_batch", "for column_major layout"); + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, - int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event omatadd_batch(sycl::queue &queue, transpose transa, + transpose transb, int64_t m, int64_t n, + std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex beta, + const std::complex *b, int64_t ldb, + int64_t stride_b, std::complex *c, int64_t ldc, + int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatadd_batch", "for column_major layout"); + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, - int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatadd_batch", "for column_major layout"); +sycl::event omatadd_batch(sycl::queue &queue, transpose transa, + transpose transb, int64_t m, int64_t n, + std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex beta, + const std::complex *b, int64_t ldb, + int64_t stride_b, std::complex *c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - float *alpha, const float **a, int64_t *lda, float **b, int64_t *ldb, +sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, float *alpha, const float **a, + int64_t *lda, float **b, int64_t *ldb, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - double *alpha, const double **a, int64_t *lda, double **b, int64_t *ldb, +sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, double *alpha, const double **a, + int64_t *lda, double **b, int64_t *ldb, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - std::complex **b, int64_t *ldb, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, std::complex *alpha, + const std::complex **a, int64_t *lda, + std::complex **b, int64_t *ldb, + int64_t group_count, int64_t *groupsize, + const std::vector &dependencies) { + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex **b, int64_t *ldb, +sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, std::complex *alpha, + const std::complex **a, int64_t *lda, + std::complex **b, int64_t *ldb, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - float *alpha, float **ab, int64_t *lda, int64_t *ldb, - int64_t group_count, int64_t *groupsize, +sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, float *alpha, float **ab, int64_t *lda, + int64_t *ldb, int64_t group_count, + int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - double *alpha, double **ab, int64_t *lda, int64_t *ldb, - int64_t group_count, int64_t *groupsize, +sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, double *alpha, double **ab, int64_t *lda, + int64_t *ldb, int64_t group_count, + int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, std::complex **ab, int64_t *lda, - int64_t *ldb, int64_t group_count, int64_t *groupsize, +sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, std::complex *alpha, + std::complex **ab, int64_t *lda, int64_t *ldb, + int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, std::complex **ab, int64_t *lda, - int64_t *ldb, int64_t group_count, int64_t *groupsize, +sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, std::complex *alpha, + std::complex **ab, int64_t *lda, + int64_t *ldb, int64_t group_count, + int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } } // namespace column_major @@ -1074,125 +1228,139 @@ namespace row_major { // Buffer APIs -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, - int64_t batch_size) { - throw unimplemented("blas", "copy_batch", "for row_major layout"); +void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, + int64_t incx, int64_t stridex, sycl::buffer &y, + int64_t incy, int64_t stridey, int64_t batch_size) { + throw unimplemented("blas", "copy_batch", "for row_major layout"); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, - int64_t batch_size) { - throw unimplemented("blas", "copy_batch", "for row_major layout"); +void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, + int64_t incx, int64_t stridex, sycl::buffer &y, + int64_t incy, int64_t stridey, int64_t batch_size) { + throw unimplemented("blas", "copy_batch", "for row_major layout"); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue &queue, int64_t n, + sycl::buffer, 1> &x, int64_t incx, + int64_t stridex, sycl::buffer, 1> &y, int64_t incy, int64_t stridey, int64_t batch_size) { - throw unimplemented("blas", "copy_batch", "for row_major layout"); + throw unimplemented("blas", "copy_batch", "for row_major layout"); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue &queue, int64_t n, + sycl::buffer, 1> &x, int64_t incx, + int64_t stridex, sycl::buffer, 1> &y, int64_t incy, int64_t stridey, int64_t batch_size) { - throw unimplemented("blas", "copy_batch", "for row_major layout"); + throw unimplemented("blas", "copy_batch", "for row_major layout"); } -void axpy_batch(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue &queue, int64_t n, float alpha, + sycl::buffer &x, int64_t incx, int64_t stridex, + sycl::buffer &y, int64_t incy, int64_t stridey, int64_t batch_size) { - throw unimplemented("blas", "axpy_batch", "for row_major layout"); + throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -void axpy_batch(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, - int64_t stridey, int64_t batch_size) { - throw unimplemented("blas", "axpy_batch", "for row_major layout"); +void axpy_batch(sycl::queue &queue, int64_t n, double alpha, + sycl::buffer &x, int64_t incx, int64_t stridex, + sycl::buffer &y, int64_t incy, int64_t stridey, + int64_t batch_size) { + throw unimplemented("blas", "axpy_batch", "for row_major layout"); } void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, - int64_t batch_size) { - throw unimplemented("blas", "axpy_batch", "for row_major layout"); + sycl::buffer, 1> &x, int64_t incx, + int64_t stridex, sycl::buffer, 1> &y, + int64_t incy, int64_t stridey, int64_t batch_size) { + throw unimplemented("blas", "axpy_batch", "for row_major layout"); } void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, - int64_t batch_size) { - throw unimplemented("blas", "axpy_batch", "for row_major layout"); + sycl::buffer, 1> &x, int64_t incx, + int64_t stridex, sycl::buffer, 1> &y, + int64_t incy, int64_t stridey, int64_t batch_size) { + throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, sycl::buffer &x, - int64_t incx, int64_t stride_x, float beta, sycl::buffer &y, int64_t incy, - int64_t stride_y, int64_t batch_size) { - throw unimplemented("blas", "gemv_batch", "for row_major layout"); +void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, + float alpha, sycl::buffer &a, int64_t lda, + int64_t stride_a, sycl::buffer &x, int64_t incx, + int64_t stride_x, float beta, sycl::buffer &y, + int64_t incy, int64_t stride_y, int64_t batch_size) { + throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &x, int64_t incx, int64_t stride_x, double beta, - sycl::buffer &y, int64_t incy, int64_t stride_y, int64_t batch_size) { - throw unimplemented("blas", "gemv_batch", "for row_major layout"); +void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, + double alpha, sycl::buffer &a, int64_t lda, + int64_t stride_a, sycl::buffer &x, int64_t incx, + int64_t stride_x, double beta, sycl::buffer &y, + int64_t incy, int64_t stride_y, int64_t batch_size) { + throw unimplemented("blas", "gemv_batch", "for row_major layout"); } void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &x, int64_t incx, - int64_t stride_x, std::complex beta, sycl::buffer, 1> &y, - int64_t incy, int64_t stride_y, int64_t batch_size) { - throw unimplemented("blas", "gemv_batch", "for row_major layout"); + std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &x, + int64_t incx, int64_t stride_x, std::complex beta, + sycl::buffer, 1> &y, int64_t incy, + int64_t stride_y, int64_t batch_size) { + throw unimplemented("blas", "gemv_batch", "for row_major layout"); } void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &x, int64_t incx, - int64_t stride_x, std::complex beta, - sycl::buffer, 1> &y, int64_t incy, int64_t stride_y, - int64_t batch_size) { - throw unimplemented("blas", "gemv_batch", "for row_major layout"); + std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &x, + int64_t incx, int64_t stride_x, std::complex beta, + sycl::buffer, 1> &y, int64_t incy, + int64_t stride_y, int64_t batch_size) { + throw unimplemented("blas", "gemv_batch", "for row_major layout"); } void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stride_a, sycl::buffer &x, - int64_t incx, int64_t stride_x, sycl::buffer &c, int64_t ldc, - int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "dgmm_batch", "for row_major layout"); + sycl::buffer &a, int64_t lda, int64_t stride_a, + sycl::buffer &x, int64_t incx, int64_t stride_x, + sycl::buffer &c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, sycl::buffer &a, int64_t lda, int64_t stride_a, sycl::buffer &x, int64_t incx, int64_t stride_x, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "dgmm_batch", "for row_major layout"); + sycl::buffer &c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &x, int64_t incx, int64_t stride_x, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { - throw unimplemented("blas", "dgmm_batch", "for row_major layout"); + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &x, + int64_t incx, int64_t stride_x, + sycl::buffer, 1> &c, int64_t ldc, + int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &x, int64_t incx, int64_t stride_x, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { - throw unimplemented("blas", "dgmm_batch", "for row_major layout"); + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &x, + int64_t incx, int64_t stride_x, + sycl::buffer, 1> &c, int64_t ldc, + int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -#define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, sycl::buffer &a, int64_t lda, \ - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, \ - TYPE_S beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, \ - int64_t batch_size) { \ - throw unimplemented("blas", "gemm_batch", "for row_major layout"); \ - } +#define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ + void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, \ + int64_t m, int64_t n, int64_t k, TYPE_S alpha, \ + sycl::buffer &a, int64_t lda, int64_t stride_a, \ + sycl::buffer &b, int64_t ldb, int64_t stride_b, \ + TYPE_S beta, sycl::buffer &c, int64_t ldc, \ + int64_t stride_c, int64_t batch_size) { \ + throw unimplemented("blas", "gemm_batch", "for row_major layout"); \ + } GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, sycl::half, sycl::half) GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, float, float) @@ -1200,386 +1368,460 @@ GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, float, float) GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, std::int32_t, float) GEMM_STRIDED_BATCH_LAUNCHER(float, float, float, float) GEMM_STRIDED_BATCH_LAUNCHER(double, double, double, double) -GEMM_STRIDED_BATCH_LAUNCHER(std::complex, std::complex, std::complex, - std::complex) -GEMM_STRIDED_BATCH_LAUNCHER(std::complex, std::complex, std::complex, - std::complex) +GEMM_STRIDED_BATCH_LAUNCHER(std::complex, std::complex, + std::complex, std::complex) +GEMM_STRIDED_BATCH_LAUNCHER(std::complex, std::complex, + std::complex, std::complex) #undef GEMM_STRIDED_BATCH_LAUNCHER -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + float alpha, sycl::buffer &a, int64_t lda, + int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { - throw unimplemented("blas", "trsm_batch", "for row_major layout"); + throw unimplemented("blas", "trsm_batch", "for row_major layout"); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + double alpha, sycl::buffer &a, int64_t lda, + int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { - throw unimplemented("blas", "trsm_batch", "for row_major layout"); + throw unimplemented("blas", "trsm_batch", "for row_major layout"); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - int64_t batch_size) { - throw unimplemented("blas", "trsm_batch", "for row_major layout"); +void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &b, + int64_t ldb, int64_t stride_b, int64_t batch_size) { + throw unimplemented("blas", "trsm_batch", "for row_major layout"); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - int64_t batch_size) { - throw unimplemented("blas", "trsm_batch", "for row_major layout"); +void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &b, + int64_t ldb, int64_t stride_b, int64_t batch_size) { + throw unimplemented("blas", "trsm_batch", "for row_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, float beta, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "syrk_batch", "for row_major layout"); +void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, float alpha, sycl::buffer &a, + int64_t lda, int64_t stride_a, float beta, + sycl::buffer &c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { + throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, double alpha, sycl::buffer &a, + int64_t lda, int64_t stride_a, double beta, + sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "syrk_batch", "for row_major layout"); + throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, std::complex beta, sycl::buffer, 1> &c, - int64_t ldc, int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "syrk_batch", "for row_major layout"); +void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, std::complex beta, + sycl::buffer, 1> &c, int64_t ldc, + int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { - throw unimplemented("blas", "syrk_batch", "for row_major layout"); + sycl::buffer, 1> &c, int64_t ldc, + int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { - throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, + float alpha, sycl::buffer &a, int64_t lda, + int64_t stride_a, sycl::buffer &b, int64_t ldb, + int64_t stride_b, int64_t batch_size) { + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { - throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, + double alpha, sycl::buffer &a, int64_t lda, + int64_t stride_a, sycl::buffer &b, int64_t ldb, + int64_t stride_b, int64_t batch_size) { + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &b, int64_t ldb, - int64_t stride_b, int64_t batch_size) { - throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); + std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &b, + int64_t ldb, int64_t stride_b, int64_t batch_size) { + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, + std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { - throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, - int64_t batch_size) { - throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, + float alpha, sycl::buffer &ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size) { + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, - int64_t batch_size) { - throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, + double alpha, sycl::buffer &ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size) { + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, - int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { - throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); + std::complex alpha, + sycl::buffer, 1> &ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size) { + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, - int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { - throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); + std::complex alpha, + sycl::buffer, 1> &ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size) { + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - float beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "omatadd_batch", "for row_major layout"); +void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, + int64_t m, int64_t n, float alpha, sycl::buffer &a, + int64_t lda, int64_t stride_a, float beta, + sycl::buffer &b, int64_t ldb, int64_t stride_b, + sycl::buffer &c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { - throw unimplemented("blas", "omatadd_batch", "for row_major layout"); +void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, + int64_t m, int64_t n, double alpha, + sycl::buffer &a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer &b, int64_t ldb, + int64_t stride_b, sycl::buffer &c, int64_t ldc, + int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, + int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { - throw unimplemented("blas", "omatadd_batch", "for row_major layout"); + sycl::buffer, 1> &b, int64_t ldb, + int64_t stride_b, sycl::buffer, 1> &c, + int64_t ldc, int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { - throw unimplemented("blas", "omatadd_batch", "for row_major layout"); +void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, + int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1> &a, int64_t lda, + int64_t stride_a, std::complex beta, + sycl::buffer, 1> &b, int64_t ldb, + int64_t stride_b, sycl::buffer, 1> &c, + int64_t ldc, int64_t stride_c, int64_t batch_size) { + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } // USM APIs -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, int64_t *incx, float **y, - int64_t *incy, int64_t group_count, int64_t *group_size, +sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, + int64_t *incx, float **y, int64_t *incy, + int64_t group_count, int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for row_major layout"); + throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, int64_t *incx, double **y, - int64_t *incy, int64_t group_count, int64_t *group_size, +sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, + int64_t *incx, double **y, int64_t *incy, + int64_t group_count, int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for row_major layout"); + throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, int64_t *incx, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for row_major layout"); +sycl::event copy_batch(sycl::queue &queue, int64_t *n, + const std::complex **x, int64_t *incx, + std::complex **y, int64_t *incy, + int64_t group_count, int64_t *group_size, + const std::vector &dependencies) { + throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, - int64_t *incx, std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for row_major layout"); +sycl::event copy_batch(sycl::queue &queue, int64_t *n, + const std::complex **x, int64_t *incx, + std::complex **y, int64_t *incy, + int64_t group_count, int64_t *group_size, + const std::vector &dependencies) { + throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, int64_t incx, - std::int64_t stridex, float *y, int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for row_major layout"); +sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, + int64_t incx, std::int64_t stridex, float *y, + int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, int64_t incx, - std::int64_t stridex, double *y, int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for row_major layout"); +sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, + int64_t incx, std::int64_t stridex, double *y, + int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::int64_t stridex, std::complex *y, int64_t incy, - std::int64_t stridey, std::int64_t batch_size, +sycl::event copy_batch(sycl::queue &queue, int64_t n, + const std::complex *x, int64_t incx, + std::int64_t stridex, std::complex *y, + int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for row_major layout"); + throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::int64_t stridex, std::complex *y, int64_t incy, - std::int64_t stridey, std::int64_t batch_size, +sycl::event copy_batch(sycl::queue &queue, int64_t n, + const std::complex *x, int64_t incx, + std::int64_t stridex, std::complex *y, + int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "copy_batch", "for row_major layout"); + throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, const float **x, int64_t *incx, - float **y, int64_t *incy, int64_t group_count, int64_t *group_size, +sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, + const float **x, int64_t *incx, float **y, int64_t *incy, + int64_t group_count, int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for row_major layout"); + throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, const double **x, - int64_t *incx, double **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for row_major layout"); +sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, + const double **x, int64_t *incx, double **y, + int64_t *incy, int64_t group_count, int64_t *group_size, + const std::vector &dependencies) { + throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, +sycl::event axpy_batch(sycl::queue &queue, int64_t *n, + std::complex *alpha, + const std::complex **x, int64_t *incx, + std::complex **y, int64_t *incy, + int64_t group_count, int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for row_major layout"); + throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, +sycl::event axpy_batch(sycl::queue &queue, int64_t *n, + std::complex *alpha, + const std::complex **x, int64_t *incx, + std::complex **y, int64_t *incy, + int64_t group_count, int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for row_major layout"); + throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, - int64_t stridex, float *y, int64_t incy, int64_t stridey, int64_t batch_size, +sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, + const float *x, int64_t incx, int64_t stridex, float *y, + int64_t incy, int64_t stridey, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for row_major layout"); + throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, - int64_t stridex, double *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for row_major layout"); +sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, + const double *x, int64_t incx, int64_t stridex, + double *y, int64_t incy, int64_t stridey, + int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "axpy_batch", "for row_major layout"); } sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::complex *x, int64_t incx, + int64_t stridex, std::complex *y, int64_t incy, + int64_t stridey, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for row_major layout"); + throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, +sycl::event axpy_batch(sycl::queue &queue, int64_t n, + std::complex alpha, + const std::complex *x, int64_t incx, + int64_t stridex, std::complex *y, int64_t incy, + int64_t stridey, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "axpy_batch", "for row_major layout"); + throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, const float *x, int64_t incx, - int64_t stride_x, float beta, float *y, int64_t incy, int64_t stride_y, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for row_major layout"); +sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, + int64_t n, float alpha, const float *a, int64_t lda, + int64_t stride_a, const float *x, int64_t incx, + int64_t stride_x, float beta, float *y, int64_t incy, + int64_t stride_y, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, const double *x, - int64_t incx, int64_t stride_x, double beta, double *y, int64_t incy, +sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, + int64_t n, double alpha, const double *a, int64_t lda, + int64_t stride_a, const double *x, int64_t incx, + int64_t stride_x, double beta, double *y, int64_t incy, int64_t stride_y, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for row_major layout"); + throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, const std::complex *x, int64_t incx, - int64_t stride_x, std::complex beta, std::complex *y, - int64_t incy, int64_t stride_y, int64_t batch_size, +sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, + int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, const std::complex *x, + int64_t incx, int64_t stride_x, std::complex beta, + std::complex *y, int64_t incy, int64_t stride_y, + int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for row_major layout"); + throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, const std::complex *x, int64_t incx, - int64_t stride_x, std::complex beta, std::complex *y, +sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, + int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, const std::complex *x, + int64_t incx, int64_t stride_x, + std::complex beta, std::complex *y, int64_t incy, int64_t stride_y, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for row_major layout"); + throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, float *alpha, - const float **a, int64_t *lda, const float **x, int64_t *incx, float *beta, - float **y, int64_t *incy, int64_t group_count, int64_t *groupsize, +sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, + int64_t *n, float *alpha, const float **a, int64_t *lda, + const float **x, int64_t *incx, float *beta, float **y, + int64_t *incy, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for row_major layout"); + throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, double *alpha, - const double **a, int64_t *lda, const double **x, int64_t *incx, - double *beta, double **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for row_major layout"); +sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, + int64_t *n, double *alpha, const double **a, + int64_t *lda, const double **x, int64_t *incx, + double *beta, double **y, int64_t *incy, + int64_t group_count, int64_t *groupsize, + const std::vector &dependencies) { + throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex *beta, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for row_major layout"); +sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, + int64_t *n, std::complex *alpha, + const std::complex **a, int64_t *lda, + const std::complex **x, int64_t *incx, + std::complex *beta, std::complex **y, + int64_t *incy, int64_t group_count, int64_t *groupsize, + const std::vector &dependencies) { + throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex *beta, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "gemv_batch", "for row_major layout"); +sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, + int64_t *n, std::complex *alpha, + const std::complex **a, int64_t *lda, + const std::complex **x, int64_t *incx, + std::complex *beta, std::complex **y, + int64_t *incy, int64_t group_count, int64_t *groupsize, + const std::vector &dependencies) { + throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const float *a, - int64_t lda, int64_t stride_a, const float *x, int64_t incx, - int64_t stride_x, float *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, + int64_t n, const float *a, int64_t lda, int64_t stride_a, + const float *x, int64_t incx, int64_t stride_x, float *c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const double *a, - int64_t lda, int64_t stride_a, const double *x, int64_t incx, - int64_t stride_x, double *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, + int64_t n, const double *a, int64_t lda, + int64_t stride_a, const double *x, int64_t incx, + int64_t stride_x, double *c, int64_t ldc, + int64_t stride_c, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *x, int64_t incx, int64_t stride_x, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, + int64_t n, const std::complex *a, int64_t lda, + int64_t stride_a, const std::complex *x, + int64_t incx, int64_t stride_x, std::complex *c, + int64_t ldc, int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for row_major layout"); + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *x, int64_t incx, int64_t stride_x, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, + int64_t n, const std::complex *a, int64_t lda, + int64_t stride_a, const std::complex *x, + int64_t incx, int64_t stride_x, std::complex *c, + int64_t ldc, int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for row_major layout"); + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const float **a, int64_t *lda, const float **x, int64_t *incx, float **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, +sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, + int64_t *n, const float **a, int64_t *lda, + const float **x, int64_t *incx, float **c, int64_t *ldc, + int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for row_major layout"); + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const double **a, int64_t *lda, const double **x, int64_t *incx, double **c, +sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, + int64_t *n, const double **a, int64_t *lda, + const double **x, int64_t *incx, double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for row_major layout"); + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, const std::complex **x, - int64_t *incx, std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, + int64_t *n, const std::complex **a, int64_t *lda, + const std::complex **x, int64_t *incx, + std::complex **c, int64_t *ldc, + int64_t group_count, int64_t *groupsize, + const std::vector &dependencies) { + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, const std::complex **x, - int64_t *incx, std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, + int64_t *n, const std::complex **a, int64_t *lda, + const std::complex **x, int64_t *incx, + std::complex **c, int64_t *ldc, + int64_t group_count, int64_t *groupsize, + const std::vector &dependencies) { + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -#define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, \ - int64_t stride_a, const TYPE_B *b, int64_t ldb, int64_t stride_b, \ - TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stride_c, \ - int64_t batch_size, const std::vector &dependencies) { \ - throw unimplemented("blas", "gemm_batch", "for row_major layout"); \ - } +#define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ + sycl::event gemm_batch( \ + sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, \ + int64_t stride_a, const TYPE_B *b, int64_t ldb, int64_t stride_b, \ + TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stride_c, \ + int64_t batch_size, const std::vector &dependencies) { \ + throw unimplemented("blas", "gemm_batch", "for row_major layout"); \ + } GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half) GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float) @@ -1587,21 +1829,22 @@ GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, float, float) GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float) GEMM_STRIDED_BATCH_LAUNCHER_USM(float, float, float, float) GEMM_STRIDED_BATCH_LAUNCHER_USM(double, double, double, double) -GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex, std::complex, std::complex, - std::complex) -GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex, std::complex, std::complex, - std::complex) +GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex, std::complex, + std::complex, std::complex) +GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex, std::complex, + std::complex, std::complex) #undef GEMM_STRIDED_BATCH_LAUNCHER_USM -#define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, \ - int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda, \ - const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ - throw unimplemented("blas", "gemm_batch", "for row_major layout"); \ - } +#define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ + sycl::event gemm_batch( \ + sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, \ + int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda, \ + const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \ + int64_t group_count, int64_t *group_size, \ + const std::vector &dependencies) { \ + throw unimplemented("blas", "gemm_batch", "for row_major layout"); \ + } GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half) GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float) @@ -1609,62 +1852,71 @@ GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, float, float) GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float) GEMM_BATCH_LAUNCHER_USM(float, float, float, float) GEMM_BATCH_LAUNCHER_USM(double, double, double, double) -GEMM_BATCH_LAUNCHER_USM(std::complex, std::complex, std::complex, - std::complex) -GEMM_BATCH_LAUNCHER_USM(std::complex, std::complex, std::complex, - std::complex) +GEMM_BATCH_LAUNCHER_USM(std::complex, std::complex, + std::complex, std::complex) +GEMM_BATCH_LAUNCHER_USM(std::complex, std::complex, + std::complex, std::complex) #undef GEMM_BATCH_LAUNCHER_USM -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, - int64_t lda, int64_t stride_a, float *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "trsm_batch", "for row_major layout"); +sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + float alpha, const float *a, int64_t lda, + int64_t stride_a, float *b, int64_t ldb, + int64_t stride_b, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "trsm_batch", "for row_major layout"); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, - int64_t lda, int64_t stride_a, double *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "trsm_batch", "for row_major layout"); +sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + double alpha, const double *a, int64_t lda, + int64_t stride_a, double *b, int64_t ldb, + int64_t stride_b, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "trsm_batch", "for row_major layout"); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - std::complex *b, int64_t ldb, int64_t stride_b, int64_t batch_size, +sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + std::complex alpha, const std::complex *a, + int64_t lda, int64_t stride_a, std::complex *b, + int64_t ldb, int64_t stride_b, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "trsm_batch", "for row_major layout"); + throw unimplemented("blas", "trsm_batch", "for row_major layout"); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - std::complex *b, int64_t ldb, int64_t stride_b, int64_t batch_size, +sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, + std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex *b, int64_t ldb, + int64_t stride_b, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "trsm_batch", "for row_major layout"); + throw unimplemented("blas", "trsm_batch", "for row_major layout"); } template -inline sycl::event trsm_batch(const char *func_name, Func func, sycl::queue &queue, - side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, T *alpha, const T **a, - int64_t *lda, T **b, int64_t *ldb, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { - throw unimplemented("blas", "trsm_batch", "for row_major layout"); -} - -#define TRSM_BATCH_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, \ - transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, TYPE *alpha, \ - const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ - return trsm_batch(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, \ - unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, \ - dependencies); \ - } +inline sycl::event +trsm_batch(const char *func_name, Func func, sycl::queue &queue, + side *left_right, uplo *upper_lower, transpose *trans, + diag *unit_diag, int64_t *m, int64_t *n, T *alpha, const T **a, + int64_t *lda, T **b, int64_t *ldb, int64_t group_count, + int64_t *group_size, const std::vector &dependencies) { + throw unimplemented("blas", "trsm_batch", "for row_major layout"); +} + +#define TRSM_BATCH_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + sycl::event trsm_batch(sycl::queue &queue, side *left_right, \ + uplo *upper_lower, transpose *trans, diag *unit_diag, \ + int64_t *m, int64_t *n, TYPE *alpha, const TYPE **a, \ + int64_t *lda, TYPE **b, int64_t *ldb, \ + int64_t group_count, int64_t *group_size, \ + const std::vector &dependencies) { \ + return trsm_batch(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, \ + upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, \ + ldb, group_count, group_size, dependencies); \ + } TRSM_BATCH_LAUNCHER_USM(float, cublasStrsmBatched) TRSM_BATCH_LAUNCHER_USM(double, cublasDtrsmBatched) @@ -1673,209 +1925,249 @@ TRSM_BATCH_LAUNCHER_USM(std::complex, cublasZtrsmBatched) #undef TRSM_BATCH_LAUNCHER_USM -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, float *alpha, const float **a, int64_t *lda, float *beta, - float **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, +sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, + int64_t *n, int64_t *k, float *alpha, const float **a, + int64_t *lda, float *beta, float **c, int64_t *ldc, + int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for row_major layout"); + throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, double *alpha, const double **a, int64_t *lda, double *beta, - double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, +sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, + int64_t *n, int64_t *k, double *alpha, const double **a, + int64_t *lda, double *beta, double **c, int64_t *ldc, + int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for row_major layout"); + throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, std::complex **c, +sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, + int64_t *n, int64_t *k, std::complex *alpha, + const std::complex **a, int64_t *lda, + std::complex *beta, std::complex **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for row_major layout"); + throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, std::complex **c, +sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, + int64_t *n, int64_t *k, std::complex *alpha, + const std::complex **a, int64_t *lda, + std::complex *beta, std::complex **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for row_major layout"); + throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, const float *a, int64_t lda, int64_t stride_a, float beta, - float *c, int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, float alpha, const float *a, + int64_t lda, int64_t stride_a, float beta, float *c, + int64_t ldc, int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for row_major layout"); + throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, const double *a, int64_t lda, int64_t stride_a, double beta, - double *c, int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, double alpha, const double *a, + int64_t lda, int64_t stride_a, double beta, double *c, + int64_t ldc, int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for row_major layout"); + throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex beta, std::complex *c, - int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex beta, + std::complex *c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for row_major layout"); + throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex beta, std::complex *c, - int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex beta, + std::complex *c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "syrk_batch", "for row_major layout"); + throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb, +sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, float alpha, const float *a, int64_t lda, + int64_t stride_a, float *b, int64_t ldb, int64_t stride_b, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb, - int64_t stride_b, int64_t batch_size, +sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, double alpha, const double *a, + int64_t lda, int64_t stride_a, double *b, + int64_t ldb, int64_t stride_b, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex *b, + int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex *b, + int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, +sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, float alpha, float *ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, +sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, double alpha, double *ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, - int64_t ldb, int64_t stride, int64_t batch_size, +sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, std::complex alpha, + std::complex *ab, int64_t lda, int64_t ldb, + int64_t stride, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, - int64_t ldb, int64_t stride, int64_t batch_size, +sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, + int64_t n, std::complex alpha, + std::complex *ab, int64_t lda, int64_t ldb, + int64_t stride, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a, - float beta, const float *b, int64_t ldb, int64_t stride_b, float *c, - int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event omatadd_batch(sycl::queue &queue, transpose transa, + transpose transb, int64_t m, int64_t n, float alpha, + const float *a, int64_t lda, int64_t stride_a, + float beta, const float *b, int64_t ldb, + int64_t stride_b, float *c, int64_t ldc, + int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatadd_batch", "for row_major layout"); + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a, - double beta, const double *b, int64_t ldb, int64_t stride_b, double *c, - int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event omatadd_batch(sycl::queue &queue, transpose transa, + transpose transb, int64_t m, int64_t n, double alpha, + const double *a, int64_t lda, int64_t stride_a, + double beta, const double *b, int64_t ldb, + int64_t stride_b, double *c, int64_t ldc, + int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatadd_batch", "for row_major layout"); + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, - int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, +sycl::event omatadd_batch(sycl::queue &queue, transpose transa, + transpose transb, int64_t m, int64_t n, + std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex beta, + const std::complex *b, int64_t ldb, + int64_t stride_b, std::complex *c, int64_t ldc, + int64_t stride_c, int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatadd_batch", "for row_major layout"); + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, - int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { - throw unimplemented("blas", "omatadd_batch", "for row_major layout"); +sycl::event omatadd_batch(sycl::queue &queue, transpose transa, + transpose transb, int64_t m, int64_t n, + std::complex alpha, + const std::complex *a, int64_t lda, + int64_t stride_a, std::complex beta, + const std::complex *b, int64_t ldb, + int64_t stride_b, std::complex *c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector &dependencies) { + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - float *alpha, const float **a, int64_t *lda, float **b, int64_t *ldb, +sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, float *alpha, const float **a, + int64_t *lda, float **b, int64_t *ldb, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - double *alpha, const double **a, int64_t *lda, double **b, int64_t *ldb, +sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, double *alpha, const double **a, + int64_t *lda, double **b, int64_t *ldb, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - std::complex **b, int64_t *ldb, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, std::complex *alpha, + const std::complex **a, int64_t *lda, + std::complex **b, int64_t *ldb, + int64_t group_count, int64_t *groupsize, + const std::vector &dependencies) { + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex **b, int64_t *ldb, +sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, std::complex *alpha, + const std::complex **a, int64_t *lda, + std::complex **b, int64_t *ldb, int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - float *alpha, float **ab, int64_t *lda, int64_t *ldb, - int64_t group_count, int64_t *groupsize, +sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, float *alpha, float **ab, int64_t *lda, + int64_t *ldb, int64_t group_count, + int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - double *alpha, double **ab, int64_t *lda, int64_t *ldb, - int64_t group_count, int64_t *groupsize, +sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, double *alpha, double **ab, int64_t *lda, + int64_t *ldb, int64_t group_count, + int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, std::complex **ab, int64_t *lda, - int64_t *ldb, int64_t group_count, int64_t *groupsize, +sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, std::complex *alpha, + std::complex **ab, int64_t *lda, int64_t *ldb, + int64_t group_count, int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, std::complex **ab, int64_t *lda, - int64_t *ldb, int64_t group_count, int64_t *groupsize, +sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, + int64_t *n, std::complex *alpha, + std::complex **ab, int64_t *lda, + int64_t *ldb, int64_t group_count, + int64_t *groupsize, const std::vector &dependencies) { - throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } } // namespace row_major