diff --git a/src/gemmA.cc b/src/gemmA.cc
index a2bdeae9d..67d9a8249 100644
--- a/src/gemmA.cc
+++ b/src/gemmA.cc
@@ -131,20 +131,19 @@ void gemmA(
             int tag_0 = 0;
             C.template listReduce( reduce_list_C, layout, tag_0 );
         }
-        // Clean the memory introduced by internal::gemmA on Devices
-        if (target == Target::Devices) {
-            #pragma omp task depend( in:gemmA[ 0 ] ) \
-                              shared( B, C )
-            {
-                auto B_col_0 = B.sub( 0, B.mt()-1, 0, 0 );
-                B_col_0.releaseRemoteWorkspace();
-                B_col_0.releaseLocalWorkspace();
-
-                auto C_col_0 = C.sub( 0, C.mt()-1, 0, 0 );
-                C_col_0.releaseRemoteWorkspace();
-                C_col_0.tileUpdateAllOrigin();
-                C_col_0.releaseLocalWorkspace();
-            }
+
+        // Clean up workspace
+        #pragma omp task depend( in:gemmA[ 0 ] ) \
+                          shared( B, C )
+        {
+            auto B_col_0 = B.sub( 0, B.mt()-1, 0, 0 );
+            B_col_0.releaseRemoteWorkspace();
+            B_col_0.releaseLocalWorkspace();
+
+            auto C_col_0 = C.sub( 0, C.mt()-1, 0, 0 );
+            C_col_0.releaseRemoteWorkspace();
+            C_col_0.tileUpdateAllOrigin();
+            C_col_0.releaseLocalWorkspace();
         }
 
         // broadcast (with lookahead) and multiply the rest of the columns
@@ -194,21 +193,20 @@ void gemmA(
                 int tag_k = k;
                 C.template listReduce( reduce_list_C, layout, tag_k );
             }
-            // Clean the memory introduced by internal::gemmA on Devices
-            if (target == Target::Devices) {
-                #pragma omp task depend( in:gemmA[ k ] ) \
-                                  shared( B, C ) \
-                                  firstprivate( k )
-                {
-                    auto B_col_k = B.sub( 0, B.mt()-1, k, k );
-                    B_col_k.releaseRemoteWorkspace();
-                    B_col_k.releaseLocalWorkspace();
-
-                    auto C_col_k = C.sub( 0, C.mt()-1, k, k );
-                    C_col_k.releaseRemoteWorkspace();
-                    C_col_k.tileUpdateAllOrigin();
-                    C_col_k.releaseLocalWorkspace();
-                }
+
+            // Clean up workspace
+            #pragma omp task depend( in:gemmA[ k ] ) \
+                              shared( B, C ) \
+                              firstprivate( k )
+            {
+                auto B_col_k = B.sub( 0, B.mt()-1, k, k );
+                B_col_k.releaseRemoteWorkspace();
+                B_col_k.releaseLocalWorkspace();
+
+                auto C_col_k = C.sub( 0, C.mt()-1, k, k );
+                C_col_k.releaseRemoteWorkspace();
+                C_col_k.tileUpdateAllOrigin();
+                C_col_k.releaseLocalWorkspace();
             }
         }
         #pragma omp taskwait
diff --git a/src/heev.cc b/src/heev.cc
index 34a4eb59e..a8f8894eb 100644
--- a/src/heev.cc
+++ b/src/heev.cc
@@ -135,6 +135,8 @@ void heev(
 
         // Copy diagonal and super-diagonal to vectors.
         internal::copyhb2st( Aband, Lambda, E );
+
+        Aband.releaseRemoteWorkspace();
     }
 
     // 3. Tri-diagonal eigenvalue solver.
diff --git a/src/hegst.cc b/src/hegst.cc
index 3a091a66f..03c4ae9f8 100644
--- a/src/hegst.cc
+++ b/src/hegst.cc
@@ -55,11 +55,11 @@ void hegst(
     uint8_t* column = column_vector.data();
 
     if (target == Target::Devices) {
-        // The work::trsm (itype=1) and work::trmm (itype=2,3)
-        // routines use 2 queues (queue 0,1). All other
-        // internal::routines here use the default queue (queue 0).
-        // So 2 queues need to be allocated.
-        A.allocateBatchArrays(0, 2+lookahead); // (batch size, num_queues)
+        // The work::trsm (itype=1) routine uses 2 queues (queue 0,1).
+        // The work::trmm (itype=2,3) routine uses 1 queue (queue 0).
+        // All other internal::routines here use the default queue (queue 0).
+        int64_t num_queues = (itype == 1) ? 2 : 1;
+        A.allocateBatchArrays(0, num_queues+lookahead); // (batch size, num_queues)
         A.reserveDeviceWorkspace();
     }
 
diff --git a/src/hemmA.cc b/src/hemmA.cc
index 4c450d159..aea49e1d1 100644
--- a/src/hemmA.cc
+++ b/src/hemmA.cc
@@ -37,6 +37,8 @@ void hemmA(
     using BcastList = typename Matrix<scalar_t>::BcastList;
 
     const scalar_t one = 1.0;
+    const int priority_0 = 0;
+    const int queue_0 = 0;
 
     // Assumes column major
     const Layout layout = Layout::ColMajor;
@@ -44,6 +46,12 @@ void hemmA(
     // Options
     int64_t lookahead = get_option<int64_t>( opts, Option::Lookahead, 1 );
 
+    // Use only TileReleaseStrategy::Slate for hemmA.
+    // Internal routines (hemmA and gemmA) called here won't release
+    // any tiles. This routine will clean up tiles.
+    Options opts2 = opts;
+    opts2[ Option::TileReleaseStrategy ] = TileReleaseStrategy::Slate;
+
     // if on right, change to left by transposing A, B, C to get
     // op(C) = op(A)*op(B)
     if (side == Side::Right) {
@@ -190,17 +198,26 @@ void hemmA(
                     Side::Left,
                     alpha, A.sub(0, 0),
                            B.sub(0, 0, 0, B.nt()-1),
-                    beta,  C.sub(0, 0, 0, C.nt()-1));
+                    beta,  C.sub(0, 0, 0, C.nt()-1),
+                    priority_0, opts2 );
 
                 if (A.mt()-1 > 0) {
                     internal::gemmA<target>(
                         alpha,  A.sub(1, A.mt()-1, 0, 0),
                                 B.sub(0, 0, 0, B.nt()-1),
                         beta,   C.sub(1, C.mt()-1, 0, C.nt()-1),
-                                layout);
+                        layout, priority_0, queue_0, opts2 );
                 }
             }
 
+            // Clean up workspace
+            #pragma omp task depend( in:gemm[ 0 ] ) shared( B )
+            {
+                auto B_col_0 = B.sub( 0, 0, 0, B.nt()-1 );
+                B_col_0.releaseRemoteWorkspace();
+                B_col_0.releaseLocalWorkspace();
+            }
+
             // Main loop
             for (int64_t k = 1; k < A.nt(); ++k) {
 
@@ -268,23 +285,31 @@ void hemmA(
                         alpha, conj_transpose( Arow_k ),
                                B.sub(k, k, 0, B.nt()-1),
                         one,   C.sub(0, k-1, 0, C.nt()-1),
-                        layout);
+                        layout, priority_0, queue_0, opts2 );
 
                     internal::hemmA<Target::HostTask>(
                         Side::Left,
                         alpha, A.sub(k, k),
                                B.sub(k, k, 0, B.nt()-1),
-                        one,   C.sub(k, k, 0, C.nt()-1));
+                        one,   C.sub(k, k, 0, C.nt()-1),
+                        priority_0, opts2 );
 
                     if (A.mt()-1 > k) {
                         internal::gemmA<target>(
                             alpha, A.sub(k+1, A.mt()-1, k, k),
                                    B.sub(k, k, 0, B.nt()-1),
                             one,   C.sub(k+1, C.mt()-1, 0, C.nt()-1),
-                            layout);
+                            layout, priority_0, queue_0, opts2 );
                     }
                 }
 
+                // Clean up workspace
+                #pragma omp task depend( in:gemm[ k ] ) shared( B )
+                {
+                    auto B_col_k = B.sub( k, k, 0, B.nt()-1 );
+                    B_col_k.releaseRemoteWorkspace();
+                    B_col_k.releaseLocalWorkspace();
+                }
             }
 
             #pragma omp task depend(in:gemm[A.nt()-1])
@@ -423,7 +448,8 @@ void hemmA(
                     Side::Left,
                     alpha, A.sub(0, 0),
                            B.sub(0, 0, 0, B.nt()-1),
-                    beta,  C.sub(0, 0, 0, C.nt()-1));
+                    beta,  C.sub(0, 0, 0, C.nt()-1),
+                    priority_0, opts2 );
 
                 if (A.mt()-1 > 0) {
                     auto Arow_k = A.sub(0, 0, 1, A.nt()-1);
@@ -431,10 +457,18 @@ void hemmA(
                         alpha, conj_transpose( Arow_k ),
                                B.sub(0, 0, 0, B.nt()-1),
                         beta,  C.sub(1, C.mt()-1, 0, C.nt()-1),
-                        layout);
+                        layout, priority_0, queue_0, opts2 );
                 }
             }
 
+            // Clean up workspace
+            #pragma omp task depend( in:gemm[ 0 ] ) shared( B )
+            {
+                auto B_col_0 = B.sub( 0, 0, 0, B.nt()-1 );
+                B_col_0.releaseRemoteWorkspace();
+                B_col_0.releaseLocalWorkspace();
+            }
+
             // Main loop
             for (int64_t k = 1; k < A.nt(); ++k) {
 
@@ -500,13 +534,14 @@ void hemmA(
                         alpha, A.sub(0, k-1, k, k),
                                B.sub(k, k, 0, B.nt()-1),
                         one,   C.sub(0, k-1, 0, C.nt()-1),
-                        layout);
+                        layout, priority_0, queue_0, opts2 );
 
                     internal::hemmA<Target::HostTask>(
                         Side::Left,
                         alpha, A.sub(k, k),
                                B.sub(k, k, 0, B.nt()-1),
-                        one,   C.sub(k, k, 0, C.nt()-1));
+                        one,   C.sub(k, k, 0, C.nt()-1),
+                        priority_0, opts2 );
 
                     if (A.nt()-1 > k) {
                         auto Arow_k = A.sub(k, k, k+1, A.nt()-1);
@@ -514,9 +549,17 @@ void hemmA(
                             alpha, conj_transpose( Arow_k ),
                                    B.sub(k, k, 0, B.nt()-1),
                             one,   C.sub(k+1, C.mt()-1, 0, C.nt()-1),
-                            layout);
+                            layout, priority_0, queue_0, opts2 );
                     }
                 }
+
+                // Clean up workspace
+                #pragma omp task depend( in:gemm[ k ] ) shared( B )
+                {
+                    auto B_col_k = B.sub( k, k, 0, B.nt()-1 );
+                    B_col_k.releaseRemoteWorkspace();
+                    B_col_k.releaseLocalWorkspace();
+                }
             }
 
             #pragma omp task depend(in:gemm[A.nt()-1])
diff --git a/src/internal/internal_copyhb2st.cc b/src/internal/internal_copyhb2st.cc
index c0bb8f589..6ca0f7fb7 100644
--- a/src/internal/internal_copyhb2st.cc
+++ b/src/internal/internal_copyhb2st.cc
@@ -65,7 +65,6 @@ void copyhb2st(internal::TargetType<Target::HostTask>,
             auto T = A(i-1, i);
             E[E_index] = real( T(T.mb()-1, 0) );
             E_index += 1;
-            A.tileTick(i-1, i);
         }
 
         // Copy main diagonal to D.
@@ -82,7 +81,6 @@ void copyhb2st(internal::TargetType<Target::HostTask>,
             E[E_index + j] = real( T(j, j+1) );
         }
         E_index += len-1;
-        A.tileTick(i, i);
     }
 }
 
diff --git a/src/internal/internal_copytb2bd.cc b/src/internal/internal_copytb2bd.cc
index 4fe2c0f96..881cfcfe4 100644
--- a/src/internal/internal_copytb2bd.cc
+++ b/src/internal/internal_copytb2bd.cc
@@ -63,7 +63,6 @@ void copytb2bd(internal::TargetType<Target::HostTask>,
             auto T = A(i-1, i);
             E[E_index] = real( T(T.mb()-1, 0) );
             E_index += 1;
-            A.tileTick(i-1, i);
         }
 
         // Copy main diagonal to D.
@@ -80,7 +79,6 @@ void copytb2bd(internal::TargetType<Target::HostTask>,
             E[E_index + j] = real( T(j, j+1) );
         }
         E_index += len-1;
-        A.tileTick(i, i);
     }
 }
 
diff --git a/src/internal/internal_getrf_tntpiv.cc b/src/internal/internal_getrf_tntpiv.cc
index a8249c697..9a79ebdff 100644
--- a/src/internal/internal_getrf_tntpiv.cc
+++ b/src/internal/internal_getrf_tntpiv.cc
@@ -608,7 +608,7 @@ void getrf_tntpiv_panel(
                                 aux_pivot[ 0 ], diag_len, A.mt(), mb );
                         }
 
-                        Awork.tileTick( i2, 0 );
+                        Awork.tileRelease( i2, 0 );
                     }
                 }
                 else {
diff --git a/src/internal/internal_unmtr_hb2st.cc b/src/internal/internal_unmtr_hb2st.cc
index 0337e7bfb..be46ca147 100644
--- a/src/internal/internal_unmtr_hb2st.cc
+++ b/src/internal/internal_unmtr_hb2st.cc
@@ -441,7 +441,6 @@ void unmtr_hb2st( internal::TargetType<target>,
                                     }
                                 }
                             }
-                            V.tileTick(0, r);
                         } // if C(i, k) is local
                     } // inner for loop
 
@@ -451,11 +450,8 @@ void unmtr_hb2st( internal::TargetType<target>,
                             Vr_data[ii + ii*ldv] = tau[ii];
                         }
                     }
-                    if (target == Target::Devices) {
-                        for (int d = 0; d < C.num_devices(); ++d) {
-                            V_.tileRelease(0, r, d);
-                        }
-                    }
+                    V.releaseLocalWorkspaceTile(0, r);
+                    V.releaseRemoteWorkspaceTile(0, r);
                 }
             }
         } // inner loop
diff --git a/src/svd.cc b/src/svd.cc
index 3f6ccb1a7..7b55d7c3c 100644
--- a/src/svd.cc
+++ b/src/svd.cc
@@ -252,6 +252,8 @@ void svd(
 
         // Copy diagonal and super-diagonal to vectors.
         internal::copytb2bd(Aband, Sigma, E);
+
+        Aband.releaseRemoteWorkspace();
     }
 
     int64_t ncvt = 0, nru = 0, ldvt = 1, ldu = 1;
diff --git a/src/tbsmPivots.cc b/src/tbsmPivots.cc
index 15df75fb4..258144418 100644
--- a/src/tbsmPivots.cc
+++ b/src/tbsmPivots.cc
@@ -35,12 +35,20 @@ void tbsm(
     using BcastList = typename Matrix<scalar_t>::BcastList;
 
     // Assumes column major
-    const int priority_1 = 1;
     const Layout layout = Layout::ColMajor;
+    const int priority_0 = 0;
+    const int priority_1 = 1;
+    const int queue_0 = 0;
 
     // Options
     int64_t lookahead = get_option<int64_t>( opts, Option::Lookahead, 1 );
 
+    // Use only TileReleaseStrategy::Slate for tbsmPivots.
+    // Internal routines called here (e.g., trsm) won't release
+    // any tiles. This routine will clean up tiles.
+    Options opts2 = opts;
+    opts2[ Option::TileReleaseStrategy ] = TileReleaseStrategy::Slate;
+
     // if on right, change to left by (conj)-transposing A and B to get
     // op(B) = op(A)^{-1} * op(B)
     if (side == Side::Right) {
@@ -136,7 +144,8 @@ void tbsm(
                     internal::trsm<Target::HostTask>(
                         Side::Left,
                         one, A.sub(k, k),
-                             B.sub(k, k, 0, nt-1), 1);
+                             B.sub(k, k, 0, nt-1),
+                        priority_1, layout, queue_0, opts2 );
 
                     // send A(i=k+1:i_end-1, k) to ranks owning block row B(i, :)
                     BcastList bcast_list_A;
@@ -163,7 +172,7 @@ void tbsm(
                             -one, A.sub(i, i, k, k),
                                   B.sub(k, k, 0, nt-1),
                             one,  B.sub(i, i, 0, nt-1),
-                            layout, 1);
+                            layout, priority_1, queue_0, opts2 );
                     }
                 }
 
@@ -181,9 +190,24 @@ void tbsm(
                             -one, A.sub(k+1+lookahead, i_end-1, k, k),
                                   B.sub(k, k, 0, nt-1),
                             one,  B.sub(k+1+lookahead, i_end-1, 0, nt-1),
-                            layout);
+                            layout, priority_0, queue_0, opts2 );
                     }
                 }
+
+                #pragma omp task depend(inout:row[k])
+                {
+                    auto A_panel = A.sub(k, i_end-1, k, k);
+                    A_panel.releaseRemoteWorkspace();
+                    A_panel.releaseLocalWorkspace();
+
+                    auto B_panel = B.sub(k, k, 0, nt-1);
+                    B_panel.releaseRemoteWorkspace();
+
+                    // Copy back modifications to tiles in the B panel
+                    // before they are erased.
+                    B_panel.tileUpdateAllOrigin();
+                    B_panel.releaseLocalWorkspace();
+                }
             }
         }
         else if (pivots.empty()) {
@@ -204,7 +228,8 @@ void tbsm(
                     internal::trsm<Target::HostTask>(
                         Side::Left,
                         one, A.sub(k, k),
-                             B.sub(k, k, 0, nt-1), 1);
+                             B.sub(k, k, 0, nt-1),
+                        priority_1, layout, queue_0, opts2 );
 
                     // send A(i=k-kdt:k-1, k) to ranks owning block row B(i, :)
                     BcastList bcast_list_A;
@@ -228,7 +253,7 @@ void tbsm(
                             -one, A.sub(i, i, k, k),
                                   B.sub(k, k, 0, nt-1),
                             one,  B.sub(i, i, 0, nt-1),
-                            layout, 1);
+                            layout, priority_1, queue_0, opts2 );
                     }
                 }
 
@@ -245,9 +270,24 @@ void tbsm(
                             -one, A.sub(i_begin, k-1-lookahead, k, k),
                                   B.sub(k, k, 0, nt-1),
                             one,  B.sub(i_begin, k-1-lookahead, 0, nt-1),
-                            layout);
+                            layout, priority_0, queue_0, opts2 );
                     }
                 }
+
+                #pragma omp task depend(inout:row[k])
+                {
+                    auto A_panel = A.sub(i_begin, k, k, k);
+                    A_panel.releaseRemoteWorkspace();
+                    A_panel.releaseLocalWorkspace();
+
+                    auto B_panel = B.sub(k, k, 0, nt-1);
+                    B_panel.releaseRemoteWorkspace();
+
+                    // Copy back modifications to tiles in the B panel
+                    // before they are erased.
+                    B_panel.tileUpdateAllOrigin();
+                    B_panel.releaseLocalWorkspace();
+                }
             }
         }
         else {
@@ -259,15 +299,15 @@ void tbsm(
             // A = L^T, the RHS updates are organized differently than in
             // the no-pivoting case above. Due to dependencies, there is no
             // lookahead or top-level tasks, only the nested tasks inside
-            // internal routines.
+            // internal routines and a tile-release task.
             for (int64_t k = mt-1; k >= 0; --k) {
+                // A( k, k : k_end-1 ) is k-th row
+                // Typically, A is L^T, so the k-th row is the
+                // k-th panel (transposed) from gbtrf.
+                int64_t k_end = min(k + kdt + 1, A.nt());
+
                 // update RHS
                 {
-                    // A( k, k : k_end-1 ) is k-th row
-                    // Typically, A is L^T, so the k-th row is the
-                    // k-th panel (transposed) from gbtrf.
-                    int64_t k_end = min(k + kdt + 1, A.nt());
-
                     for (int64_t i = k+1; i < k_end; ++i) {
                         // send A(k, i) across to ranks owning B(k, :)
                         A.template tileBcast<target>(k, i, B.sub(k, k, 0, nt-1), layout);
@@ -284,7 +324,7 @@ void tbsm(
                                     -one, A.sub(k, k, i, i),
                                           B.sub(i, i, 0, nt-1),
                                     one,  B.sub(k, k, 0, nt-1),
-                                    layout, priority_1 );
+                                    layout, priority_1, queue_0, opts2 );
                     }
                 }
 
@@ -297,7 +337,8 @@ void tbsm(
                     internal::trsm<Target::HostTask>(
                         Side::Left,
                         one, A.sub(k, k),
-                             B.sub(k, k, 0, nt-1), 1);
+                             B.sub(k, k, 0, nt-1),
+                        priority_1, layout, queue_0, opts2 );
                 }
 
                 // swap rows in B(k:mt-1, 0:nt-1)
@@ -306,6 +347,23 @@ void tbsm(
                         Direction::Backward, B.sub(k, B.mt()-1, 0, B.nt()-1),
                         pivots.at(k), layout);
                 }
+
+                #pragma omp task shared( A, B ) firstprivate( k, k_end, nt )
+                {
+                    auto A_panel = A.sub( k, k, k, k_end-1 );
+                    A_panel.releaseRemoteWorkspace();
+                    A_panel.releaseLocalWorkspace();
+
+                    if (k + kdt + 1 <= A.nt()) {
+                        auto B_panel = B.sub( k_end-1, k_end-1, 0, nt-1 );
+                        B_panel.releaseRemoteWorkspace();
+
+                        // Copy back modifications to tiles in the B panel
+                        // before they are erased.
+                        B_panel.tileUpdateAllOrigin();
+                        B_panel.releaseLocalWorkspace();
+                    }
+                }
             }
         }
         #pragma omp taskwait
diff --git a/src/trmm.cc b/src/trmm.cc
index 8eff71fd3..5bfced551 100644
--- a/src/trmm.cc
+++ b/src/trmm.cc
@@ -28,7 +28,7 @@ void trmm(
 
     if (target == Target::Devices) {
         const int64_t batch_size_default = 0; // use default batch size
-        const int num_queues = 2; // Number of kernels without lookahead
+        const int num_queues = 1; // Number of kernels without lookahead
         B.allocateBatchArrays( batch_size_default, num_queues );
         B.reserveDeviceWorkspace();
     }
diff --git a/src/work/work_trmm.cc b/src/work/work_trmm.cc
index 0dc65bfc6..3379aa55d 100644
--- a/src/work/work_trmm.cc
+++ b/src/work/work_trmm.cc
@@ -70,10 +70,14 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix<scalar_t> A,
     const int priority_0 = 0;
     const int priority_1 = 1;
     const int queue_0 = 0;
-    const int queue_1 = 1;
     // Assumes column major
     const Layout layout = Layout::ColMajor;
 
+    // Use only TileReleaseStrategy::Slate for trmm.
+    // Internal routines (trmm and gemm) called here won't release
+    // any tiles. Trmm will clean up tiles.
+    Options opts2 = {{Option::TileReleaseStrategy, TileReleaseStrategy::Slate}};
+
     // if on right, change to left by (conj)-transposing A and B to get
     // op(B) = op(A)*op(B)
     if (side == Side::Right) {
@@ -95,9 +99,9 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix<scalar_t> A,
     int64_t mt = B.mt();
     int64_t nt = B.nt();
 
-    // Requires at least 2 queues
+    // Requires at least 1 queue
     if (target == Target::Devices)
-        assert(B.numComputeQueues() >= 2);
+        assert(B.numComputeQueues() >= 1);
 
     if (A.uplo() == Uplo::Upper) {
         // ----------------------------------------
@@ -112,11 +116,7 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix<scalar_t> A,
             A.template tileBcast<target>(0, 0, B.sub(0, 0, 0, nt-1), layout);
 
             // broadcast B(0, j) to ranks owning block col B(0:0, j)
-            // todo: nowhere to send?
-            BcastList bcast_list_B;
-            for (int64_t j = 0; j < nt; ++j)
-                bcast_list_B.push_back({0, j, {B.sub(0, 0, j, j)}});
-            B.template listBcast<target>(bcast_list_B, layout);
+            // nothing to send
         }
 
         // send next lookahead block cols of A and block rows of B
@@ -146,8 +146,15 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix<scalar_t> A,
             internal::trmm<target>(
                 Side::Left,
                 alpha, A.sub(0, 0),
-                       B.sub(0, 0, 0, nt-1), priority_1, queue_1);
+                       B.sub(0, 0, 0, nt-1),
+                priority_1, queue_0, opts2 );
         }
+
+        #pragma omp task depend(in:gemm[0])
+        {
+            A.sub(0, 0).releaseRemoteWorkspace();
+        }
+
         for (int64_t k = 1; k < mt; ++k) {
 
             // send next block col of A and block row of B
@@ -188,13 +195,24 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix<scalar_t> A,
                     alpha, A.sub(0, k-1, k, k),
                            B.sub(k, k, 0, nt-1),
                     one,   B.sub(0, k-1, 0, nt-1),
-                    layout, priority_0, queue_0 );
+                    layout, priority_0, queue_0, opts2 );
 
                 internal::trmm<target>(
                     Side::Left,
                     alpha, A.sub(k, k),
                            B.sub(k, k, 0, nt-1),
-                    priority_0, queue_1);
+                    priority_0, queue_0, opts2 );
+            }
+
+            #pragma omp task depend(in:gemm[k])
+            {
+                auto A_panel = A.sub(0, k, k, k);
+                A_panel.releaseRemoteWorkspace();
+                A_panel.releaseLocalWorkspace();
+
+                auto B_panel = B.sub(k, k, 0, nt-1);
+                B_panel.releaseRemoteWorkspace();
+                // Can't release local workspace of B since we continue to update it
             }
         }
     }
@@ -212,13 +230,7 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix<scalar_t> A,
                 mt-1, mt-1, B.sub(mt-1, mt-1, 0, nt-1), layout);
 
             // broadcast B(m-1, j) to ranks owning block col B(m-1:m-1, j)
-            // todo: nowhere to send?
-            BcastList bcast_list_B;
-            for (int64_t j = 0; j < nt; ++j) {
-                bcast_list_B.push_back(
-                    {mt-1, j, {B.sub(mt-1, mt-1, j, j)}});
-            }
-            B.template listBcast<target>(bcast_list_B, layout);
+            // nothing to send
         }
 
         // send next lookahead block cols of A and block rows of B
@@ -248,7 +260,13 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix<scalar_t> A,
             internal::trmm<target>(
                 Side::Left,
                 alpha, A.sub(mt-1, mt-1),
-                       B.sub(mt-1, mt-1, 0, nt-1), priority_1, queue_1);
+                       B.sub(mt-1, mt-1, 0, nt-1),
+                priority_1, queue_0, opts2 );
+        }
+
+        #pragma omp task depend(in:gemm[mt-1])
+        {
+            A.sub(mt-1, mt-1).releaseRemoteWorkspace();
         }
 
         for (int64_t k = mt-2; k >= 0; --k) {
@@ -291,14 +309,24 @@ void trmm(Side side, scalar_t alpha, TriangularMatrix<scalar_t> A,
                     alpha, A.sub(k+1, mt-1, k, k),
                            B.sub(k, k, 0, nt-1),
                     one,   B.sub(k+1, mt-1, 0, nt-1),
-                    layout, priority_0, queue_0 );
+                    layout, priority_0, queue_0, opts2 );
 
-                // todo: target? needs batch trmm
                 internal::trmm<target>(
                     Side::Left,
                     alpha, A.sub(k, k),
                            B.sub(k, k, 0, nt-1),
-                    priority_0, queue_1);
+                    priority_0, queue_0, opts2 );
+            }
+
+            #pragma omp task depend(in:gemm[k])
+            {
+                auto A_panel = A.sub(k, mt-1, k, k);
+                A_panel.releaseRemoteWorkspace();
+                A_panel.releaseLocalWorkspace();
+
+                auto B_panel = B.sub(k, k, 0, nt-1);
+                B_panel.releaseRemoteWorkspace();
+                // Can't release local workspace of B since we continue to update it
             }
         }
     } // end Lower/NoTrans
diff --git a/src/work/work_trsm.cc b/src/work/work_trsm.cc
index b61322ca9..70218d4ad 100644
--- a/src/work/work_trsm.cc
+++ b/src/work/work_trsm.cc
@@ -90,17 +90,15 @@ void trsm(Side side, scalar_t alpha, TriangularMatrix<scalar_t> A,
     int64_t mt = B.mt();
     int64_t nt = B.nt();
 
+    // Use only TileReleaseStrategy::Slate for trsm.
+    // Internal routines (trsm and gemm) called here won't release
+    // any tiles. Trsm will clean up tiles.
     Options opts2 = opts;
+    opts2[ Option::TileReleaseStrategy ] = TileReleaseStrategy::Slate;
 
     // Requires 2+lookahead queues
     if (target == Target::Devices) {
         assert(B.numComputeQueues() >= 2+lookahead);
-
-        // Use only TileReleaseStrategy::Slate for trsm.
-        // Internal routines (trsm and gemm) called here
-        // won't release any tiles. Trsm will
-        // clean up tiles.
-        opts2[ Option::TileReleaseStrategy ] = TileReleaseStrategy::Slate;
     }
 
     if (A.uplo() == Uplo::Lower) {