diff --git a/src/gemmA.cc b/src/gemmA.cc index a2bdeae9d..67d9a8249 100644 --- a/src/gemmA.cc +++ b/src/gemmA.cc @@ -131,20 +131,19 @@ void gemmA( int tag_0 = 0; C.template listReduce( reduce_list_C, layout, tag_0 ); } - // Clean the memory introduced by internal::gemmA on Devices - if (target == Target::Devices) { - #pragma omp task depend( in:gemmA[ 0 ] ) \ - shared( B, C ) - { - auto B_col_0 = B.sub( 0, B.mt()-1, 0, 0 ); - B_col_0.releaseRemoteWorkspace(); - B_col_0.releaseLocalWorkspace(); - - auto C_col_0 = C.sub( 0, C.mt()-1, 0, 0 ); - C_col_0.releaseRemoteWorkspace(); - C_col_0.tileUpdateAllOrigin(); - C_col_0.releaseLocalWorkspace(); - } + + // Clean up workspace + #pragma omp task depend( in:gemmA[ 0 ] ) \ + shared( B, C ) + { + auto B_col_0 = B.sub( 0, B.mt()-1, 0, 0 ); + B_col_0.releaseRemoteWorkspace(); + B_col_0.releaseLocalWorkspace(); + + auto C_col_0 = C.sub( 0, C.mt()-1, 0, 0 ); + C_col_0.releaseRemoteWorkspace(); + C_col_0.tileUpdateAllOrigin(); + C_col_0.releaseLocalWorkspace(); } // broadcast (with lookahead) and multiply the rest of the columns @@ -194,21 +193,20 @@ void gemmA( int tag_k = k; C.template listReduce( reduce_list_C, layout, tag_k ); } - // Clean the memory introduced by internal::gemmA on Devices - if (target == Target::Devices) { - #pragma omp task depend( in:gemmA[ k ] ) \ - shared( B, C ) \ - firstprivate( k ) - { - auto B_col_k = B.sub( 0, B.mt()-1, k, k ); - B_col_k.releaseRemoteWorkspace(); - B_col_k.releaseLocalWorkspace(); - - auto C_col_k = C.sub( 0, C.mt()-1, k, k ); - C_col_k.releaseRemoteWorkspace(); - C_col_k.tileUpdateAllOrigin(); - C_col_k.releaseLocalWorkspace(); - } + + // Clean up workspace + #pragma omp task depend( in:gemmA[ k ] ) \ + shared( B, C ) \ + firstprivate( k ) + { + auto B_col_k = B.sub( 0, B.mt()-1, k, k ); + B_col_k.releaseRemoteWorkspace(); + B_col_k.releaseLocalWorkspace(); + + auto C_col_k = C.sub( 0, C.mt()-1, k, k ); + C_col_k.releaseRemoteWorkspace(); + C_col_k.tileUpdateAllOrigin(); + 
C_col_k.releaseLocalWorkspace(); } } #pragma omp taskwait diff --git a/src/heev.cc b/src/heev.cc index 34a4eb59e..a8f8894eb 100644 --- a/src/heev.cc +++ b/src/heev.cc @@ -135,6 +135,8 @@ void heev( // Copy diagonal and super-diagonal to vectors. internal::copyhb2st( Aband, Lambda, E ); + + Aband.releaseRemoteWorkspace(); } // 3. Tri-diagonal eigenvalue solver. diff --git a/src/hegst.cc b/src/hegst.cc index 3a091a66f..03c4ae9f8 100644 --- a/src/hegst.cc +++ b/src/hegst.cc @@ -55,11 +55,11 @@ void hegst( uint8_t* column = column_vector.data(); if (target == Target::Devices) { - // The work::trsm (itype=1) and work::trmm (itype=2,3) - // routines use 2 queues (queue 0,1). All other - // internal::routines here use the default queue (queue 0). - // So 2 queues need to be allocated. - A.allocateBatchArrays(0, 2+lookahead); // (batch size, num_queues) + // The work::trsm (itype=1) routine uses 2 queues (queue 0,1). + // The work::trmm (itype=2,3) routine uses 1 queue (queue 0). + // All other internal::routines here use the default queue (queue 0). + int64_t num_queues = (itype == 1) ? 2 : 1; + A.allocateBatchArrays(0, num_queues+lookahead); // (batch size, num_queues) A.reserveDeviceWorkspace(); } diff --git a/src/hemmA.cc b/src/hemmA.cc index 4c450d159..aea49e1d1 100644 --- a/src/hemmA.cc +++ b/src/hemmA.cc @@ -37,6 +37,8 @@ void hemmA( using BcastList = typename Matrix::BcastList; const scalar_t one = 1.0; + const int priority_0 = 0; + const int queue_0 = 0; // Assumes column major const Layout layout = Layout::ColMajor; @@ -44,6 +46,12 @@ void hemmA( // Options int64_t lookahead = get_option( opts, Option::Lookahead, 1 ); + // Use only TileReleaseStrategy::Slate for hemmA. + // Internal routines (hemmA and gemmA) called here won't release + // any tiles. This routine will clean up tiles. 
+ Options opts2 = opts; + opts2[ Option::TileReleaseStrategy ] = TileReleaseStrategy::Slate; + // if on right, change to left by transposing A, B, C to get // op(C) = op(A)*op(B) if (side == Side::Right) { @@ -190,17 +198,26 @@ void hemmA( Side::Left, alpha, A.sub(0, 0), B.sub(0, 0, 0, B.nt()-1), - beta, C.sub(0, 0, 0, C.nt()-1)); + beta, C.sub(0, 0, 0, C.nt()-1), + priority_0, opts2 ); if (A.mt()-1 > 0) { internal::gemmA( alpha, A.sub(1, A.mt()-1, 0, 0), B.sub(0, 0, 0, B.nt()-1), beta, C.sub(1, C.mt()-1, 0, C.nt()-1), - layout); + layout, priority_0, queue_0, opts2 ); } } + // Clean up workspace + #pragma omp task depend( in:gemm[ 0 ] ) shared( B ) + { + auto B_col_0 = B.sub( 0, 0, 0, B.nt()-1 ); + B_col_0.releaseRemoteWorkspace(); + B_col_0.releaseLocalWorkspace(); + } + // Main loop for (int64_t k = 1; k < A.nt(); ++k) { @@ -268,23 +285,31 @@ void hemmA( alpha, conj_transpose( Arow_k ), B.sub(k, k, 0, B.nt()-1), one, C.sub(0, k-1, 0, C.nt()-1), - layout); + layout, priority_0, queue_0, opts2 ); internal::hemmA( Side::Left, alpha, A.sub(k, k), B.sub(k, k, 0, B.nt()-1), - one, C.sub(k, k, 0, C.nt()-1)); + one, C.sub(k, k, 0, C.nt()-1), + priority_0, opts2 ); if (A.mt()-1 > k) { internal::gemmA( alpha, A.sub(k+1, A.mt()-1, k, k), B.sub(k, k, 0, B.nt()-1), one, C.sub(k+1, C.mt()-1, 0, C.nt()-1), - layout); + layout, priority_0, queue_0, opts2 ); } } + // Clean up workspace + #pragma omp task depend( in:gemm[ k ] ) shared( B ) + { + auto B_col_k = B.sub( k, k, 0, B.nt()-1 ); + B_col_k.releaseRemoteWorkspace(); + B_col_k.releaseLocalWorkspace(); + } } #pragma omp task depend(in:gemm[A.nt()-1]) @@ -423,7 +448,8 @@ void hemmA( Side::Left, alpha, A.sub(0, 0), B.sub(0, 0, 0, B.nt()-1), - beta, C.sub(0, 0, 0, C.nt()-1)); + beta, C.sub(0, 0, 0, C.nt()-1), + priority_0, opts2 ); if (A.mt()-1 > 0) { auto Arow_k = A.sub(0, 0, 1, A.nt()-1); @@ -431,10 +457,18 @@ void hemmA( alpha, conj_transpose( Arow_k ), B.sub(0, 0, 0, B.nt()-1), beta, C.sub(1, C.mt()-1, 0, C.nt()-1), - 
layout); + layout, priority_0, queue_0, opts2 ); } } + // Clean up workspace + #pragma omp task depend( in:gemm[ 0 ] ) shared( B ) + { + auto B_col_0 = B.sub( 0, 0, 0, B.nt()-1 ); + B_col_0.releaseRemoteWorkspace(); + B_col_0.releaseLocalWorkspace(); + } + // Main loop for (int64_t k = 1; k < A.nt(); ++k) { @@ -500,13 +534,14 @@ void hemmA( alpha, A.sub(0, k-1, k, k), B.sub(k, k, 0, B.nt()-1), one, C.sub(0, k-1, 0, C.nt()-1), - layout); + layout, priority_0, queue_0, opts2 ); internal::hemmA( Side::Left, alpha, A.sub(k, k), B.sub(k, k, 0, B.nt()-1), - one, C.sub(k, k, 0, C.nt()-1)); + one, C.sub(k, k, 0, C.nt()-1), + priority_0, opts2 ); if (A.nt()-1 > k) { auto Arow_k = A.sub(k, k, k+1, A.nt()-1); @@ -514,9 +549,17 @@ void hemmA( alpha, conj_transpose( Arow_k ), B.sub(k, k, 0, B.nt()-1), one, C.sub(k+1, C.mt()-1, 0, C.nt()-1), - layout); + layout, priority_0, queue_0, opts2 ); } } + + // Clean up workspace + #pragma omp task depend( in:gemm[ k ] ) shared( B ) + { + auto B_col_k = B.sub( k, k, 0, B.nt()-1 ); + B_col_k.releaseRemoteWorkspace(); + B_col_k.releaseLocalWorkspace(); + } } #pragma omp task depend(in:gemm[A.nt()-1]) diff --git a/src/internal/internal_copyhb2st.cc b/src/internal/internal_copyhb2st.cc index c0bb8f589..6ca0f7fb7 100644 --- a/src/internal/internal_copyhb2st.cc +++ b/src/internal/internal_copyhb2st.cc @@ -65,7 +65,6 @@ void copyhb2st(internal::TargetType, auto T = A(i-1, i); E[E_index] = real( T(T.mb()-1, 0) ); E_index += 1; - A.tileTick(i-1, i); } // Copy main diagonal to D. 
@@ -82,7 +81,6 @@ void copyhb2st(internal::TargetType, E[E_index + j] = real( T(j, j+1) ); } E_index += len-1; - A.tileTick(i, i); } } diff --git a/src/internal/internal_copytb2bd.cc b/src/internal/internal_copytb2bd.cc index 4fe2c0f96..881cfcfe4 100644 --- a/src/internal/internal_copytb2bd.cc +++ b/src/internal/internal_copytb2bd.cc @@ -63,7 +63,6 @@ void copytb2bd(internal::TargetType, auto T = A(i-1, i); E[E_index] = real( T(T.mb()-1, 0) ); E_index += 1; - A.tileTick(i-1, i); } // Copy main diagonal to D. @@ -80,7 +79,6 @@ void copytb2bd(internal::TargetType, E[E_index + j] = real( T(j, j+1) ); } E_index += len-1; - A.tileTick(i, i); } } diff --git a/src/internal/internal_getrf_tntpiv.cc b/src/internal/internal_getrf_tntpiv.cc index a8249c697..9a79ebdff 100644 --- a/src/internal/internal_getrf_tntpiv.cc +++ b/src/internal/internal_getrf_tntpiv.cc @@ -608,7 +608,7 @@ void getrf_tntpiv_panel( aux_pivot[ 0 ], diag_len, A.mt(), mb ); } - Awork.tileTick( i2, 0 ); + Awork.tileRelease( i2, 0 ); } } else { diff --git a/src/internal/internal_unmtr_hb2st.cc b/src/internal/internal_unmtr_hb2st.cc index 0337e7bfb..be46ca147 100644 --- a/src/internal/internal_unmtr_hb2st.cc +++ b/src/internal/internal_unmtr_hb2st.cc @@ -441,7 +441,6 @@ void unmtr_hb2st( internal::TargetType, } } } - V.tileTick(0, r); } // if C(i, k) is local } // inner for loop @@ -451,11 +450,8 @@ void unmtr_hb2st( internal::TargetType, Vr_data[ii + ii*ldv] = tau[ii]; } } - if (target == Target::Devices) { - for (int d = 0; d < C.num_devices(); ++d) { - V_.tileRelease(0, r, d); - } - } + V.releaseLocalWorkspaceTile(0, r); + V.releaseRemoteWorkspaceTile(0, r); } } } // inner loop diff --git a/src/svd.cc b/src/svd.cc index 3f6ccb1a7..7b55d7c3c 100644 --- a/src/svd.cc +++ b/src/svd.cc @@ -252,6 +252,8 @@ void svd( // Copy diagonal and super-diagonal to vectors. 
internal::copytb2bd(Aband, Sigma, E); + + Aband.releaseRemoteWorkspace(); } int64_t ncvt = 0, nru = 0, ldvt = 1, ldu = 1; diff --git a/src/tbsmPivots.cc b/src/tbsmPivots.cc index 15df75fb4..258144418 100644 --- a/src/tbsmPivots.cc +++ b/src/tbsmPivots.cc @@ -35,12 +35,20 @@ void tbsm( using BcastList = typename Matrix::BcastList; // Assumes column major - const int priority_1 = 1; const Layout layout = Layout::ColMajor; + const int priority_0 = 0; + const int priority_1 = 1; + const int queue_0 = 0; // Options int64_t lookahead = get_option( opts, Option::Lookahead, 1 ); + // Use only TileReleaseStrategy::Slate for tbsmPivots. + // Internal tbsmPivots routine called here won't release + // any tiles. This routine will clean up tiles. + Options opts2 = opts; + opts2[ Option::TileReleaseStrategy ] = TileReleaseStrategy::Slate; + // if on right, change to left by (conj)-transposing A and B to get // op(B) = op(A)^{-1} * op(B) if (side == Side::Right) { @@ -136,7 +144,8 @@ void tbsm( internal::trsm( Side::Left, one, A.sub(k, k), - B.sub(k, k, 0, nt-1), 1); + B.sub(k, k, 0, nt-1), + priority_1, layout, queue_0, opts2 ); // send A(i=k+1:i_end-1, k) to ranks owning block row B(i, :) BcastList bcast_list_A; @@ -163,7 +172,7 @@ void tbsm( -one, A.sub(i, i, k, k), B.sub(k, k, 0, nt-1), one, B.sub(i, i, 0, nt-1), - layout, 1); + layout, priority_1, queue_0, opts2 ); } } @@ -181,9 +190,24 @@ void tbsm( -one, A.sub(k+1+lookahead, i_end-1, k, k), B.sub(k, k, 0, nt-1), one, B.sub(k+1+lookahead, i_end-1, 0, nt-1), - layout); + layout, priority_0, queue_0, opts2 ); } } + + #pragma omp task depend(inout:row[k]) + { + auto A_panel = A.sub(k, i_end-1, k, k); + A_panel.releaseRemoteWorkspace(); + A_panel.releaseLocalWorkspace(); + + auto B_panel = B.sub(k, k, 0, nt-1); + B_panel.releaseRemoteWorkspace(); + + // Copy back modifications to tiles in the B panel + // before they are erased. 
+ B_panel.tileUpdateAllOrigin(); + B_panel.releaseLocalWorkspace(); + } } } else if (pivots.empty()) { @@ -204,7 +228,8 @@ void tbsm( internal::trsm( Side::Left, one, A.sub(k, k), - B.sub(k, k, 0, nt-1), 1); + B.sub(k, k, 0, nt-1), + priority_1, layout, queue_0, opts2 ); // send A(i=k-kdt:k-1, k) to ranks owning block row B(i, :) BcastList bcast_list_A; @@ -228,7 +253,7 @@ void tbsm( -one, A.sub(i, i, k, k), B.sub(k, k, 0, nt-1), one, B.sub(i, i, 0, nt-1), - layout, 1); + layout, priority_1, queue_0, opts2 ); } } @@ -245,9 +270,24 @@ void tbsm( -one, A.sub(i_begin, k-1-lookahead, k, k), B.sub(k, k, 0, nt-1), one, B.sub(i_begin, k-1-lookahead, 0, nt-1), - layout); + layout, priority_0, queue_0, opts2 ); } } + + #pragma omp task depend(inout:row[k]) + { + auto A_panel = A.sub(i_begin, k, k, k); + A_panel.releaseRemoteWorkspace(); + A_panel.releaseLocalWorkspace(); + + auto B_panel = B.sub(k, k, 0, nt-1); + B_panel.releaseRemoteWorkspace(); + + // Copy back modifications to tiles in the B panel + // before they are erased. + B_panel.tileUpdateAllOrigin(); + B_panel.releaseLocalWorkspace(); + } } } else { @@ -259,15 +299,15 @@ void tbsm( // A = L^T, the RHS updates are organized differently than in // the no-pivoting case above. Due to dependencies, there is no // lookahead or top-level tasks, only the nested tasks inside - // internal routines. + // internal routines and a tile-release task. for (int64_t k = mt-1; k >= 0; --k) { + // A( k, k : k_end-1 ) is k-th row + // Typically, A is L^T, so the k-th row is the + // k-th panel (transposed) from gbtrf. + int64_t k_end = min(k + kdt + 1, A.nt()); + // update RHS { - // A( k, k : k_end-1 ) is k-th row - // Typically, A is L^T, so the k-th row is the - // k-th panel (transposed) from gbtrf. 
- int64_t k_end = min(k + kdt + 1, A.nt()); - for (int64_t i = k+1; i < k_end; ++i) { // send A(k, i) across to ranks owning B(k, :) A.template tileBcast(k, i, B.sub(k, k, 0, nt-1), layout); @@ -284,7 +324,7 @@ void tbsm( -one, A.sub(k, k, i, i), B.sub(i, i, 0, nt-1), one, B.sub(k, k, 0, nt-1), - layout, priority_1 ); + layout, priority_1, queue_0, opts2 ); } } @@ -297,7 +337,8 @@ void tbsm( internal::trsm( Side::Left, one, A.sub(k, k), - B.sub(k, k, 0, nt-1), 1); + B.sub(k, k, 0, nt-1), + priority_1, layout, queue_0, opts2 ); } // swap rows in B(k:mt-1, 0:nt-1) @@ -306,6 +347,23 @@ void tbsm( Direction::Backward, B.sub(k, B.mt()-1, 0, B.nt()-1), pivots.at(k), layout); } + + #pragma omp task shared( A, B ) firstprivate( k, k_end, nt ) + { + auto A_panel = A.sub( k, k, k, k_end-1 ); + A_panel.releaseRemoteWorkspace(); + A_panel.releaseLocalWorkspace(); + + if (k + kdt + 1 <= A.nt()) { + auto B_panel = B.sub( k_end-1, k_end-1, 0, nt-1 ); + B_panel.releaseRemoteWorkspace(); + + // Copy back modifications to tiles in the B panel + // before they are erased. 
+ B_panel.tileUpdateAllOrigin(); + B_panel.releaseLocalWorkspace(); + } + } } } #pragma omp taskwait diff --git a/src/trmm.cc b/src/trmm.cc index 8eff71fd3..5bfced551 100644 --- a/src/trmm.cc +++ b/src/trmm.cc @@ -28,7 +28,7 @@ void trmm( if (target == Target::Devices) { const int64_t batch_size_default = 0; // use default batch size - const int num_queues = 2; // Number of kernels without lookahead + const int num_queues = 1; // Number of kernels without lookahead B.allocateBatchArrays( batch_size_default, num_queues ); B.reserveDeviceWorkspace(); } diff --git a/src/work/work_trmm.cc b/src/work/work_trmm.cc index 0dc65bfc6..3379aa55d 100644 --- a/src/work/work_trmm.cc +++ b/src/work/work_trmm.cc @@ -70,10 +70,14 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix A, const int priority_0 = 0; const int priority_1 = 1; const int queue_0 = 0; - const int queue_1 = 1; // Assumes column major const Layout layout = Layout::ColMajor; + // Use only TileReleaseStrategy::Slate for trmm. + // Internal routines (trmm and gemm) called here won't release + // any tiles. Trmm will clean up tiles. + Options opts2 = {{Option::TileReleaseStrategy, TileReleaseStrategy::Slate}}; + // if on right, change to left by (conj)-transposing A and B to get // op(B) = op(A)*op(B) if (side == Side::Right) { @@ -95,9 +99,9 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix A, int64_t mt = B.mt(); int64_t nt = B.nt(); - // Requires at least 2 queues + // Requires at least 1 queue if (target == Target::Devices) - assert(B.numComputeQueues() >= 2); + assert(B.numComputeQueues() >= 1); if (A.uplo() == Uplo::Upper) { // ---------------------------------------- @@ -112,11 +116,7 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix A, A.template tileBcast(0, 0, B.sub(0, 0, 0, nt-1), layout); // broadcast B(0, j) to ranks owning block col B(0:0, j) - // todo: nowhere to send? 
- BcastList bcast_list_B; - for (int64_t j = 0; j < nt; ++j) - bcast_list_B.push_back({0, j, {B.sub(0, 0, j, j)}}); - B.template listBcast(bcast_list_B, layout); + // nothing to send } // send next lookahead block cols of A and block rows of B @@ -146,8 +146,15 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix A, internal::trmm( Side::Left, alpha, A.sub(0, 0), - B.sub(0, 0, 0, nt-1), priority_1, queue_1); + B.sub(0, 0, 0, nt-1), + priority_1, queue_0, opts2 ); } + + #pragma omp task depend(in:gemm[0]) + { + A.sub(0, 0).releaseRemoteWorkspace(); + } + for (int64_t k = 1; k < mt; ++k) { // send next block col of A and block row of B @@ -188,13 +195,24 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix A, alpha, A.sub(0, k-1, k, k), B.sub(k, k, 0, nt-1), one, B.sub(0, k-1, 0, nt-1), - layout, priority_0, queue_0 ); + layout, priority_0, queue_0, opts2 ); internal::trmm( Side::Left, alpha, A.sub(k, k), B.sub(k, k, 0, nt-1), - priority_0, queue_1); + priority_0, queue_0, opts2 ); + } + + #pragma omp task depend(in:gemm[k]) + { + auto A_panel = A.sub(0, k, k, k); + A_panel.releaseRemoteWorkspace(); + A_panel.releaseLocalWorkspace(); + + auto B_panel = B.sub(k, k, 0, nt-1); + B_panel.releaseRemoteWorkspace(); + // Can't release local workspace of B since we continue to update it } } } @@ -212,13 +230,7 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix A, mt-1, mt-1, B.sub(mt-1, mt-1, 0, nt-1), layout); // broadcast B(m-1, j) to ranks owning block col B(m-1:m-1, j) - // todo: nowhere to send? 
- BcastList bcast_list_B; - for (int64_t j = 0; j < nt; ++j) { - bcast_list_B.push_back( - {mt-1, j, {B.sub(mt-1, mt-1, j, j)}}); - } - B.template listBcast(bcast_list_B, layout); + // nothing to send } // send next lookahead block cols of A and block rows of B @@ -248,7 +260,13 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix A, internal::trmm( Side::Left, alpha, A.sub(mt-1, mt-1), - B.sub(mt-1, mt-1, 0, nt-1), priority_1, queue_1); + B.sub(mt-1, mt-1, 0, nt-1), + priority_1, queue_0, opts2 ); + } + + #pragma omp task depend(in:gemm[mt-1]) + { + A.sub(mt-1, mt-1).releaseRemoteWorkspace(); } for (int64_t k = mt-2; k >= 0; --k) { @@ -291,14 +309,24 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix A, alpha, A.sub(k+1, mt-1, k, k), B.sub(k, k, 0, nt-1), one, B.sub(k+1, mt-1, 0, nt-1), - layout, priority_0, queue_0 ); + layout, priority_0, queue_0, opts2 ); - // todo: target? needs batch trmm internal::trmm( Side::Left, alpha, A.sub(k, k), B.sub(k, k, 0, nt-1), - priority_0, queue_1); + priority_0, queue_0, opts2 ); + } + + #pragma omp task depend(in:gemm[k]) + { + auto A_panel = A.sub(k, mt-1, k, k); + A_panel.releaseRemoteWorkspace(); + A_panel.releaseLocalWorkspace(); + + auto B_panel = B.sub(k, k, 0, nt-1); + B_panel.releaseRemoteWorkspace(); + // Can't release local workspace of B since we continue to update it } } } // end Lower/NoTrans diff --git a/src/work/work_trsm.cc b/src/work/work_trsm.cc index b61322ca9..70218d4ad 100644 --- a/src/work/work_trsm.cc +++ b/src/work/work_trsm.cc @@ -90,17 +90,15 @@ void trsm(Side side, scalar_t alpha, TriangularMatrix A, int64_t mt = B.mt(); int64_t nt = B.nt(); + // Use only TileReleaseStrategy::Slate for trsm. + // Internal routines (trsm and gemm) called here won't release + // any tiles. Trsm will clean up tiles. 
Options opts2 = opts; + opts2[ Option::TileReleaseStrategy ] = TileReleaseStrategy::Slate; // Requires 2+lookahead queues if (target == Target::Devices) { assert(B.numComputeQueues() >= 2+lookahead); - - // Use only TileReleaseStrategy::Slate for trsm. - // Internal routines (trsm and gemm) called here - // won't release any tiles. Trsm will - // clean up tiles. - opts2[ Option::TileReleaseStrategy ] = TileReleaseStrategy::Slate; } if (A.uplo() == Uplo::Lower) {