diff --git a/cudax/examples/stf/linear_algebra/06-pdgemm.cu b/cudax/examples/stf/linear_algebra/06-pdgemm.cu index 07835093b71..d68b0249e19 100644 --- a/cudax/examples/stf/linear_algebra/06-pdgemm.cu +++ b/cudax/examples/stf/linear_algebra/06-pdgemm.cu @@ -160,9 +160,9 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - for (int rowb = 0; rowb < mt; rowb++) + for (size_t rowb = 0; rowb < mt; rowb++) { // Each task fills a block auto& h = get_handle(rowb, colb); @@ -251,14 +251,14 @@ void PDGEMM(stream_ctx& ctx, double beta, matrix& C) { - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(ctx, transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -271,7 +271,7 @@ void PDGEMM(stream_ctx& ctx, if (transb == CUBLAS_OP_N) { assert(A.nt == B.mt); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -282,7 +282,7 @@ void PDGEMM(stream_ctx& ctx, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -296,7 +296,7 @@ void PDGEMM(stream_ctx& ctx, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -307,7 +307,7 @@ void PDGEMM(stream_ctx& ctx, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -328,14 +328,14 @@ void run(stream_ctx& ctx, size_t N, size_t NB) cuda_safe_call(cudaGetDeviceCount(&ndevs)); /* Warm up allocators */ - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto lX = ctx.logical_data(shape_of>(1)); ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {}; } /* Initializes CUBLAS on all devices */ - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { cuda_safe_call(cudaSetDevice(d)); get_cublas_handle(); diff --git a/cudax/examples/stf/linear_algebra/07-cholesky.cu b/cudax/examples/stf/linear_algebra/07-cholesky.cu index 144721ff184..578c9a0b95d 100644 --- a/cudax/examples/stf/linear_algebra/07-cholesky.cu +++ b/cudax/examples/stf/linear_algebra/07-cholesky.cu @@ -91,10 +91,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? 
colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = handle(rowb, colb); @@ -171,10 +171,10 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block auto& h = handle(rowb, colb); @@ -363,9 +363,9 @@ void PDNRM2_HOST(matrix* A, double* result) reserved::dot::set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -392,24 +392,24 @@ void PDPOTRF(matrix& A) assert(A.m == A.n); assert(A.mt == A.nt); - int NBLOCKS = A.mt; + size_t NBLOCKS = A.mt; assert(A.mb == A.nb); cuda_safe_call(cudaSetDevice(0)); nvtxRangePushA("SUBMIT_PDPOTRF"); - for (int K = 0; K < NBLOCKS; K++) + for (size_t K = 0; K < NBLOCKS; K++) { int dev_akk = A.get_preferred_devid(K, K); cuda_safe_call(cudaSetDevice(A.get_preferred_devid(K, K))); DPOTRF(CUBLAS_FILL_MODE_LOWER, A, K, K); - for (int row = K + 1; row < NBLOCKS; row++) + for (size_t row = K + 1; row < NBLOCKS; row++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, K))); DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, K, K, A, row, K); - for (int col = K + 1; col < row; col++) + for (size_t col = K + 1; col < row; col++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_T, -1.0, A, row, K, A, col, K, 1.0, A, row, col); @@ -450,17 +450,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -473,17 +473,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? 
alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -540,14 +540,14 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -559,7 +559,7 @@ void PDGEMM(cublasOperation_t transa, //================================ if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -570,7 +570,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -584,7 +584,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -595,7 +595,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -637,7 +637,7 @@ int main(int argc, char** argv) int ndevs; cuda_safe_call(cudaGetDeviceCount(&ndevs)); - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto lX = ctx.logical_data(shape_of>(1)); ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {}; @@ -688,9 +688,9 @@ int main(int argc, char** argv) cudaEvent_t startEvent_pdpotrf, stopEvent_pdpotrf; float milliseconds_pdpotrf = 0; - // for (int row = 0; row < A.mt; row++) + // for (size_t row = 0; row < A.mt; row++) // { - // for (int col = 0; col <= row; col++) + // for (size_t col = 0; col <= row; col++) // { // cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); // NOOP(A, row, col); diff --git a/cudax/examples/stf/linear_algebra/07-potri.cu b/cudax/examples/stf/linear_algebra/07-potri.cu index e3fb3dd55b7..6855e563578 100644 --- a/cudax/examples/stf/linear_algebra/07-potri.cu +++ b/cudax/examples/stf/linear_algebra/07-potri.cu @@ -93,10 +93,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? 
colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = get_handle(rowb, colb); @@ -173,10 +173,10 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block auto& h = get_handle(rowb, colb); @@ -804,9 +804,9 @@ void PDNRM2_HOST(matrix* A, double* result) ctx.get_dot()->set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->get_handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -833,21 +833,21 @@ void PDPOTRF(matrix& A) assert(A.m == A.n); assert(A.mt == A.nt); - int NBLOCKS = A.mt; + size_t NBLOCKS = A.mt; assert(A.mb == A.nb); nvtxRangePushA("SUBMIT_PDPOTRF"); - for (int K = 0; K < NBLOCKS; K++) + for (size_t K = 0; K < NBLOCKS; K++) { cuda_try(cudaSetDevice(A.get_preferred_devid(K, K))); DPOTRF(CUBLAS_FILL_MODE_LOWER, A, K, K); - for (int row = K + 1; row < NBLOCKS; row++) + for (size_t row = K + 1; row < NBLOCKS; row++) { cuda_try(cudaSetDevice(A.get_preferred_devid(row, K))); DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, K, K, A, row, K); - for (int col = K + 1; col < row; col++) + for (size_t col = K + 1; col < row; col++) { cuda_try(cudaSetDevice(A.get_preferred_devid(row, col))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_T, -1.0, A, row, K, A, col, K, 1.0, A, row, col); @@ -888,17 +888,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -911,17 +911,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? 
alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -983,16 +983,16 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { cuda_try(cudaSetDevice(C.get_preferred_devid(m, n))); //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -1005,7 +1005,7 @@ void PDGEMM(cublasOperation_t transa, if (transb == CUBLAS_OP_N) { assert(A.nt == B.mt); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -1016,7 +1016,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -1030,7 +1030,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -1041,7 +1041,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? 
beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -1062,22 +1062,22 @@ void PDTRTRI(matrix& A, cublasFillMode_t uplo, cublasDiagType_t diag) nvtxRangePushA("SUBMIT_PDTRTRI"); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { - for (int m = k + 1; m < A.mt; m++) + for (size_t m = k + 1; m < A.mt; m++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, k))); DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, diag, -1.0, A, k, k, A, m, k); } - for (int m = k + 1; m < A.mt; m++) + for (size_t m = k + 1; m < A.mt; m++) { - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, n))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, 1.0, A, m, k, A, k, n, 1.0, A, m, n); } } - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(k, n))); DTRSM(CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, diag, 1.0, A, k, k, A, k, n); @@ -1101,20 +1101,20 @@ void PDLAUUM(matrix& A, cublasFillMode_t uplo) nvtxRangePushA("SUBMIT_PDLAUUM"); - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(n, n))); DSYRK(CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, 1.0, A, k, n, 1.0, A, n, n); - for (int m = n + 1; m < k; m++) + for (size_t m = n + 1; m < k; m++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, n))); DGEMM(CUBLAS_OP_T, CUBLAS_OP_N, 1.0, A, k, m, A, k, n, 1.0, A, m, n); } } - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(k, n))); DTRMM(CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, k, k, A, k, n); @@ -1136,7 +1136,7 @@ void PDSYMM(cublasSideMode_t side, double beta, matrix& C) { - int k, m, n; + size_t k, m, n; double zbeta; double zone = (double) 1.0; @@ -1272,15 +1272,15 @@ void PDTRMM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); DTRMM(side, uplo, trans, diag, alpha, A, m, m, B, m, n); - for (int k = m + 1; k < A.mt; k++) + for (size_t k = m + 1; k < A.mt; k++) { DGEMM(trans, CUBLAS_OP_N, alpha, A, m, k, B, k, n, 1.0, B, m, n); } @@ -1292,9 +1292,9 @@ //================================================ else { - for (int m = B.mt - 1; m > -1; m--) + for (int m = static_cast<int>(B.mt) - 1; m > -1; m--) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1315,9 +1315,9 @@ //=========================================== if (trans == CUBLAS_OP_N) { - for (int m = B.mt - 1; m > -1; m--) + for (int m = static_cast<int>(B.mt) - 1; m > -1; m--) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1335,13 +1335,13 @@ //================================================ else { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { DTRMM(side, uplo, trans, diag, alpha, A, m, m, B, m, n); - for (int k = m + 1; k < A.mt; k++) + for (size_t k = m + 1; k < A.mt; k++) {
DGEMM(trans, CUBLAS_OP_N, alpha, A, k, m, B, k, n, 1.0, B, m, n); } @@ -1359,9 +1359,9 @@ //============================================ if (trans == CUBLAS_OP_N) { - for (int n = B.nt - 1; n > -1; n--) + for (int n = static_cast<int>(B.nt) - 1; n > -1; n--) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1379,15 +1379,15 @@ //================================================= else { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); DTRMM(side, uplo, trans, diag, alpha, A, n, n, B, m, n); - for (int k = n + 1; k < A.mt; k++) + for (size_t k = n + 1; k < A.mt; k++) { DGEMM(CUBLAS_OP_N, trans, alpha, B, m, k, A, n, k, 1.0, B, m, n); } @@ -1402,15 +1402,15 @@ //============================================ if (trans == CUBLAS_OP_N) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); DTRMM(side, uplo, trans, diag, alpha, A, n, n, B, m, n); - for (int k = n + 1; k < A.mt; k++) + for (size_t k = n + 1; k < A.mt; k++) { DGEMM(CUBLAS_OP_N, trans, alpha, B, m, k, A, k, n, 1.0, B, m, n); } @@ -1422,9 +1422,9 @@ //================================================= else { - for (int n = B.nt - 1; n > -1; n--) + for (int n = static_cast<int>(B.nt) - 1; n > -1; n--) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1462,7 +1462,7 @@ void run(int N, int NB) int ndevs; cuda_try(cudaGetDeviceCount(&ndevs)); - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto ldummy = ctx.logical_data(shape_of>(1)); ctx.task(exec_place::device(d), ldummy.write())->*[](cudaStream_t, auto) { diff --git a/cudax/examples/stf/linear_algebra/cg_dense_2D.cu b/cudax/examples/stf/linear_algebra/cg_dense_2D.cu index e98f737e524..3f41e4cca04 100644 --- a/cudax/examples/stf/linear_algebra/cg_dense_2D.cu +++ b/cudax/examples/stf/linear_algebra/cg_dense_2D.cu @@ -65,7 +65,7 @@ public: if (is_tmp) { // There is no physical backing for this temporary vector - for (int b = 0; b < nblocks; b++) + for (size_t b = 0; b < nblocks; b++) { size_t bs = std::min(N - block_size * b, block_size); handles[b] = to_shared(ctx.logical_data(shape_of>(bs))); @@ -91,7 +91,7 @@ public: { handles.resize(nblocks); - for (int b = 0; b < nblocks; b++) + for (size_t b = 0; b < nblocks; b++) { size_t bs = std::min(N - block_size * b, block_size); handles[b] = to_shared(ctx.logical_data(shape_of>(bs))); @@ -107,12 +107,12 @@ public: void fill(const std::function& f) { size_t bs = block_size; - for (int b = 0; b < nblocks; b++) + for (size_t b = 0; b < nblocks; b++) { ctx.task(exec_place::host, handles[b]->write())->*[&f, b, bs](cudaStream_t stream, auto ds) { cuda_safe_call(cudaStreamSynchronize(stream)); - for (int local_row = 0; local_row < ds.extent(0); local_row++) + for (size_t local_row = 0; local_row < ds.extent(0); local_row++) { ds(local_row) = f(local_row + b * bs); } @@ -234,7 +234,7 @@ class scalar DOT(vector& a, class vector& b) scalar global_res(true); // Loop over all blocks, - for (int bid = 0; bid <
a.nblocks; bid++) { scalar res(true); @@ -267,7 +267,7 @@ void AXPY(const class scalar& alpha, class vector& x, class vector& y) assert(x.N == y.N); assert(x.nblocks == y.nblocks); - for (int b = 0; b < x.nblocks; b++) + for (size_t b = 0; b < x.nblocks; b++) { ctx.task(alpha.handle->read(), x.handles[b]->read(), y.handles[b]->rw()) ->* @@ -286,7 +286,7 @@ void SCALE_AXPY(const scalar& alpha, const class vector& x, class vector& y) assert(x.N == y.N); assert(x.nblocks == y.nblocks); - for (int b = 0; b < x.nblocks; b++) + for (size_t b = 0; b < x.nblocks; b++) { ctx.task(alpha.handle->read(), x.handles[b]->read(), y.handles[b]->rw()) ->*[](cudaStream_t stream, auto dalpha, auto dx, auto dy) { @@ -315,9 +315,9 @@ void GEMV(double alpha, class matrix& a, class vector& x, double beta, class vec size_t block_size = x.block_size; assert(block_size == y.block_size); - for (int row_y = 0; row_y < y.nblocks; row_y++) + for (size_t row_y = 0; row_y < y.nblocks; row_y++) { - for (int row_x = 0; row_x < x.nblocks; row_x++) + for (size_t row_x = 0; row_x < x.nblocks; row_x++) { double local_beta = (row_x == 0) ? beta : 1.0; diff --git a/cudax/examples/stf/linear_algebra/strassen.cu b/cudax/examples/stf/linear_algebra/strassen.cu index 0b00bd41f48..f06e0a65620 100644 --- a/cudax/examples/stf/linear_algebra/strassen.cu +++ b/cudax/examples/stf/linear_algebra/strassen.cu @@ -417,9 +417,9 @@ void strassen_test(context& ctx, size_t N) cuda_safe_call(cudaHostRegister(B, N * N * sizeof(double), cudaHostRegisterPortable)); cuda_safe_call(cudaHostRegister(C, N * N * sizeof(double), cudaHostRegisterPortable)); - for (int col = 0; col < N; col++) + for (size_t col = 0; col < N; col++) { - for (int row = 0; row < N; row++) + for (size_t row = 0; row < N; row++) { A[row + N * col] = 1.0; B[row + N * col] = -1.0; diff --git a/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu b/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu index 0d1ec06345b..4c069332e88 100644 --- a/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu +++ b/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu @@ -75,8 +75,8 @@ __device__ double grid_dot_result = 0.0; void genTridiag(slice I, slice J, slice val, int N, int nz) { I(0) = 0, J(0) = 0, J(1) = 1; - val(0) = (float) rand() / RAND_MAX + 10.0f; - val(1) = (float) rand() / RAND_MAX; + val(0) = (float) rand() / (float)RAND_MAX + 10.0f; + val(1) = (float) rand() / (float)RAND_MAX; int start; for (int i = 1; i < N; i++) @@ -100,11 +100,11 @@ void genTridiag(slice I, slice J, slice val, int N, int nz) } val(start) = val(start - 1); - val(start + 1) = (float) rand() / RAND_MAX + 10.0f; + val(start + 1) = (float) rand() / (float)RAND_MAX + 10.0f; if (i < N - 1) { - val(start + 2) = (float) rand() / RAND_MAX; + val(start + 2) = (float) rand() / (float)RAND_MAX; } } diff --git a/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu index 3cae7cb6bde..95ca023e11c 100644 --- a/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu +++ b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu @@ -289,7 +289,7 @@ int main(int argc, 
char** argv) int gpuBase, gpuIndex; int i; - double delta, ref, sumDelta, sumRef, sumReserve; + double delta, sumReserve; // printf("MonteCarloMultiGPU\n"); // printf("==================\n"); @@ -369,17 +369,17 @@ int main(int argc, char** argv) } // printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); - sumDelta = 0; - sumRef = 0; + // sumDelta = 0; + // sumRef = 0; sumReserve = 0; for (i = 0; i < OPT_N; i++) { BlackScholesCall(callValueBS[i], optionData[i]); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); - ref = callValueBS[i]; - sumDelta += delta; - sumRef += fabs(ref); + // ref = callValueBS[i]; + // sumDelta += delta; + // sumRef += fabs(ref); if (delta > 1e-6) { @@ -415,17 +415,17 @@ int main(int argc, char** argv) // printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); // printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); - sumDelta = 0; - sumRef = 0; + // sumDelta = 0; + // sumRef = 0; sumReserve = 0; for (i = 0; i < OPT_N; i++) { BlackScholesCall(callValueBS[i], optionData[i]); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); - ref = callValueBS[i]; - sumDelta += delta; - sumRef += fabs(ref); + // ref = callValueBS[i]; + // sumDelta += delta; + // sumRef += fabs(ref); if (delta > 1e-6) { @@ -444,15 +444,15 @@ int main(int argc, char** argv) // printf("main(): running CPU MonteCarlo...\n"); TOptionValue callValueCPU; - sumDelta = 0; - sumRef = 0; + // sumDelta = 0; + // sumRef = 0; for (i = 0; i < OPT_N; i++) { MonteCarloCPU(callValueCPU, optionData[i], NULL, PATH_N); delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); - ref = callValueCPU.Expected; - sumDelta += delta; - sumRef += fabs(ref); + // ref = callValueCPU.Expected; + // sumDelta += delta; + // sumRef += fabs(ref); // printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected); // printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence); } diff --git a/cudax/test/stf/examples/07-cholesky-redux.cu b/cudax/test/stf/examples/07-cholesky-redux.cu index 765047c8669..6e70f7a6a1d 100644 --- a/cudax/test/stf/examples/07-cholesky-redux.cu +++ b/cudax/test/stf/examples/07-cholesky-redux.cu @@ -91,10 +91,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = handle(rowb, colb); @@ -171,10 +171,10 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ?
colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block auto& h = handle(rowb, colb); @@ -367,9 +367,9 @@ void PDNRM2_HOST(matrix* A, double* result) reserved::dot::set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -454,17 +454,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -477,17 +477,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -544,14 +544,14 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -563,7 +563,7 @@ void PDGEMM(cublasOperation_t transa, //================================ if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -574,7 +574,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -588,7 +588,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? 
beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -599,7 +599,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -641,7 +641,7 @@ int main(int argc, char** argv) int ndevs; cuda_safe_call(cudaGetDeviceCount(&ndevs)); - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto lX = ctx.logical_data(shape_of>(1)); ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] __device__(size_t, auto) {}; diff --git a/cudax/test/stf/examples/07-cholesky-unified.cu b/cudax/test/stf/examples/07-cholesky-unified.cu index 7a52ed5ac41..c174e66bd85 100644 --- a/cudax/test/stf/examples/07-cholesky-unified.cu +++ b/cudax/test/stf/examples/07-cholesky-unified.cu @@ -84,10 +84,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = handle(rowb, colb); @@ -162,17 +162,17 @@ public: void fill(Fun&& fun) { // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block ctx.host_launch(handle(rowb, colb).write())->*[=, self = this](auto sA) { - for (int lcol = 0; lcol < sA.extent(1); lcol++) + for (size_t lcol = 0; lcol < sA.extent(1); lcol++) { size_t col = lcol + colb * sA.extent(1); - for (int lrow = 0; lrow < sA.extent(0); lrow++) + for (size_t lrow = 0; lrow < sA.extent(0); lrow++) { size_t row = lrow + rowb * sA.extent(0); sA(lrow, lcol) = fun(*self, row, col); @@ -348,9 +348,9 @@ void PDNRM2_HOST(matrix* A, double* result) reserved::dot::set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -435,17 +435,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -458,17 +458,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? 
alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -525,14 +525,14 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -544,7 +544,7 @@ void PDGEMM(cublasOperation_t transa, //================================ if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -555,7 +555,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -569,7 +569,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -580,7 +580,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); diff --git a/cudax/test/stf/gnu/06-pdgemm.cpp b/cudax/test/stf/gnu/06-pdgemm.cpp index 1ae2f363c14..850ad136786 100644 --- a/cudax/test/stf/gnu/06-pdgemm.cpp +++ b/cudax/test/stf/gnu/06-pdgemm.cpp @@ -155,21 +155,21 @@ class matrix void fill(T (*func)(matrix*, int, int)) { // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - for (int rowb = 0; rowb < mt; rowb++) + for (size_t rowb = 0; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); #ifdef TILED // tiles are stored contiguously - int ld = mb; + size_t ld = mb; #else - int ld = m; + size_t ld = m; #endif - for (int lrow = 0; lrow < mb; lrow++) + for (size_t lrow = 0; lrow < mb; lrow++) { - for (int lcol = 0; lcol < nb; lcol++) + for (size_t lcol = 0; lcol < nb; lcol++) { size_t row = lrow + rowb * mb; size_t col = lcol + colb * nb; @@ -257,14 +257,14 @@ void PDGEMM(Ctx& ctx, double beta, matrix& C) { - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? 
A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(ctx, transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -277,7 +277,7 @@ void PDGEMM(Ctx& ctx, if (transb == CUBLAS_OP_N) { assert(A.nt == B.mt); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -288,7 +288,7 @@ void PDGEMM(Ctx& ctx, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -302,7 +302,7 @@ void PDGEMM(Ctx& ctx, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -313,7 +313,7 @@ void PDGEMM(Ctx& ctx, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); diff --git a/cudax/test/stf/gnu/07-cholesky.cpp b/cudax/test/stf/gnu/07-cholesky.cpp index 011de211e5c..e7c5f7fbfd1 100644 --- a/cudax/test/stf/gnu/07-cholesky.cpp +++ b/cudax/test/stf/gnu/07-cholesky.cpp @@ -90,10 +90,10 @@ class matrix handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = handle(rowb, colb); @@ -168,17 +168,17 @@ class matrix void fill(Fun&& fun) { // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? 
colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block ctx.host_launch(handle(rowb, colb).write())->*[this, fun, rowb, colb](auto sA) { - for (int lcol = 0; lcol < sA.extent(1); lcol++) + for (size_t lcol = 0; lcol < sA.extent(1); lcol++) { size_t col = lcol + colb * sA.extent(1); - for (int lrow = 0; lrow < sA.extent(0); lrow++) + for (size_t lrow = 0; lrow < sA.extent(0); lrow++) { size_t row = lrow + rowb * sA.extent(0); sA(lrow, lcol) = fun(*this, row, col); @@ -351,9 +351,9 @@ void PDNRM2_HOST(matrix* A, double* result) reserved::dot::set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -380,23 +380,23 @@ void PDPOTRF(matrix& A) assert(A.m == A.n); assert(A.mt == A.nt); - int NBLOCKS = A.mt; + size_t NBLOCKS = A.mt; assert(A.mb == A.nb); cuda_safe_call(cudaSetDevice(0)); nvtxRangePushA("SUBMIT_PDPOTRF"); - for (int K = 0; K < NBLOCKS; K++) + for (size_t K = 0; K < NBLOCKS; K++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(K, K))); DPOTRF(CUBLAS_FILL_MODE_LOWER, A, K, K); - for (int row = K + 1; row < NBLOCKS; row++) + for (size_t row = K + 1; row < NBLOCKS; row++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, K))); DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, K, K, A, row, K); - for (int col = K + 1; col < row; col++) + for (size_t col = K + 1; col < row; col++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_T, -1.0, A, row, K, A, col, K, 1.0, A, row, col); @@ -437,17 +437,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -460,17 +460,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? 
alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -527,14 +527,14 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C //========================================= - int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + size_t inner_k = transa == CUBLAS_OP_N ? A.n : A.m; if (alpha == 0.0 || inner_k == 0) { DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); @@ -546,7 +546,7 @@ void PDGEMM(cublasOperation_t transa, //================================ if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -557,7 +557,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -571,7 +571,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -582,7 +582,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -662,9 +662,9 @@ int main(int argc, char** argv) cudaEvent_t startEvent_pdpotrf, stopEvent_pdpotrf; float milliseconds_pdpotrf = 0; - // for (int row = 0; row < A.mt; row++) + // for (size_t row = 0; row < A.mt; row++) // { - // for (int col = 0; col <= row; col++) + // for (size_t col = 0; col <= row; col++) // { // cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); // NOOP(A, row, col);
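Note on the recurring int -> size_t loop-index changes above: the tile counts (mt, nt) and slice extents in these examples are size_t, and comparing a signed int index against a size_t bound promotes the int to unsigned, which trips -Wsign-compare and could wrap for block counts beyond INT_MAX. A minimal sketch of the pattern (tiled_matrix is a hypothetical stand-in for the matrix classes above):

  #include <cstddef>

  struct tiled_matrix
  {
    size_t mt, nt; // tile rows / columns, size_t as in the examples above
  };

  void visit_tiles(const tiled_matrix& A)
  {
    // for (int colb = 0; colb < A.nt; colb++) // -Wsign-compare: int vs size_t
    for (size_t colb = 0; colb < A.nt; colb++) // index type matches the bound
    {
      for (size_t rowb = 0; rowb < A.mt; rowb++)
      {
        // ... process tile (rowb, colb)
      }
    }
  }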
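The backward sweeps in PDTRMM deliberately keep an int index and cast the bound instead: with an unsigned index, the test m > -1 converts -1 to SIZE_MAX, so the condition is false for every m and the loop body never runs (and m-- would wrap at zero regardless). A sketch of both forms, assuming a size_t tile count:

  #include <cstddef>

  void backward_sweep(size_t mt)
  {
    // Broken with an unsigned index: -1 becomes SIZE_MAX, the condition is
    // always false, and the loop never executes.
    // for (size_t m = mt - 1; m > -1; m--) { ... }

    // The pattern used in the patch: cast the bound once, iterate signed.
    for (int m = static_cast<int>(mt) - 1; m > -1; m--)
    {
      // ... visit row m from last to first
    }
  }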
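The device warm-up loops move in the opposite direction (size_t -> int) because cudaGetDeviceCount reports the device count through an int and cudaSetDevice takes an int ordinal, so an int index avoids the same signed/unsigned mismatch. A standalone sketch using only the CUDA runtime API:

  #include <cuda_runtime.h>

  void warm_up_all_devices()
  {
    int ndevs = 0;
    cudaGetDeviceCount(&ndevs); // the count is an int by API contract
    for (int d = 0; d < ndevs; d++) // int index matches the int bound
    {
      cudaSetDevice(d); // device ordinals are ints as well
    }
  }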
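On the genTridiag change: (float) rand() / RAND_MAX already divides in float, since RAND_MAX is promoted before the division, so the computed value is unchanged; the explicit cast only makes the lossy int-to-float conversion visible (RAND_MAX is 2^31 - 1 on common platforms, which is not exactly representable as a float) and silences -Wimplicit-int-float-conversion. Equivalent standalone form:

  #include <cstdlib>

  // Uniform value in [0, 1]; casting RAND_MAX explicitly documents the
  // (inexact) int-to-float conversion instead of leaving it implicit.
  static float uniform01()
  {
    return (float) rand() / (float) RAND_MAX;
  }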