diff --git a/RandBLAS/base.hh b/RandBLAS/base.hh index a2382e80..9e37b57f 100644 --- a/RandBLAS/base.hh +++ b/RandBLAS/base.hh @@ -118,6 +118,23 @@ template concept SignedInteger = (std::numeric_limits::is_signed && std::numeric_limits::is_integer); +template +inline TO safe_int_product(TI a, TI b) { + if (a == 0 || b == 0) { + return 0; + } + TO c = a * b; + TO b_check = c / a; + TO a_check = c / b; + if ((a_check != a) || (b_check != b)) { + std::stringstream s; + s << "Overflow when multiplying a (=" << a << ") and b(=" << b << "), which resulted in " << c << ".\n"; + throw std::overflow_error(s.str()); + } + return c; +} + + enum class MajorAxis : char { // --------------------------------------------------------------------------- /// short-axis vectors (cols of a wide matrix, rows of a tall matrix) @@ -125,7 +142,11 @@ enum class MajorAxis : char { // --------------------------------------------------------------------------- /// long-axis vectors (rows of a wide matrix, cols of a tall matrix) - Long = 'L' + Long = 'L', + + // --------------------------------------------------------------------------- + /// Undefined (used when row-major vs column-major must be explicit) + Undefined = 'U' }; @@ -134,28 +155,38 @@ enum class MajorAxis : char { * the counter and the key. The arrays' types are statically sized, small * (typically of length 2 or 4), and can be distinct from one another. * - * @tparam RNG A CBRNG type in defined in Random123. We've found that Philox-based - * CBRNGs work best for our purposes. Strictly speaking, we allow all Random123 CBRNGs - * besides those based on AES. + * The template parameter RNG is a CBRNG type in defined in Random123. We've found + * that Philox-based CBRNGs work best for our purposes, but we also support Threefry-based CBRNGS. */ template struct RNGState { using generator = RNG; - // The unsigned integer type used in this RNGState's counter array. - using ctr_uint_type = typename RNG::ctr_type::value_type; - /// @brief The unsigned integer type used in this RNGState's key array. - /// This is typically std::uint32_t, but it can be std::uint64_t. - using key_uint_type = typename RNG::key_type::value_type; - // An array type defined in Random123. + using ctr_type = typename RNG::ctr_type; - // An array type defined in Random123. + // ^ An array type defined in Random123. using key_type = typename RNG::key_type; + // ^ An array type defined in Random123. + using ctr_uint = typename RNG::ctr_type::value_type; + // ^ The unsigned integer type used in this RNGState's counter array. + + /// ------------------------------------------------------------------- + /// @brief The unsigned integer type used in this RNGState's key array. + /// This is typically std::uint32_t, but it can be std::uint64_t. + using key_uint = typename RNG::key_type::value_type; + const static int len_c = RNG::ctr_type::static_size; const static int len_k = RNG::key_type::static_size; - typename RNG::ctr_type counter; ///< This RNGState's counter array. - typename RNG::key_type key; ///< This RNGState's key array. + typename RNG::ctr_type counter; + // ^ This RNGState's counter array. + + /// ------------------------------------------------------------------ + /// This RNGState's key array. If you want to manually advance the key + /// by an integer increment of size "step," then you do so by calling + /// this->key.incr(step). + typename RNG::key_type key; + /// Initialize the counter and key arrays to all zeros. 
RNGState() : counter{{0}}, key(key_type{{}}) {} @@ -171,7 +202,7 @@ struct RNGState /// Initialize the counter array to all zeros. Initialize the key array to have first /// element equal to k and all other elements equal to zero. - RNGState(key_uint_type k) : counter{{0}}, key{{k}} {} + RNGState(key_uint k) : counter{{0}}, key{{k}} {} ~RNGState() {}; @@ -187,16 +218,16 @@ template RNGState::RNGState( const RNGState &s ) { - std::memcpy(this->counter.v, s.counter.v, this->len_c * sizeof(ctr_uint_type)); - std::memcpy(this->key.v, s.key.v, this->len_k * sizeof(key_uint_type)); + std::memcpy(this->counter.v, s.counter.v, this->len_c * sizeof(ctr_uint)); + std::memcpy(this->key.v, s.key.v, this->len_k * sizeof(key_uint)); } template RNGState &RNGState::operator=( const RNGState &s ) { - std::memcpy(this->counter.v, s.counter.v, this->len_c * sizeof(ctr_uint_type)); - std::memcpy(this->key.v, s.key.v, this->len_k * sizeof(key_uint_type)); + std::memcpy(this->counter.v, s.counter.v, this->len_c * sizeof(ctr_uint)); + std::memcpy(this->key.v, s.key.v, this->len_k * sizeof(key_uint)); return *this; } @@ -219,4 +250,45 @@ std::ostream &operator<<( return out; } +// ============================================================================= +/// @verbatim embed:rst:leading-slashes +/// +/// .. NOTE: \ttt expands to \texttt (its definition is given in an rst file) +/// +/// Words. Hello! +/// +/// @endverbatim +template +concept SketchingDistribution = requires(SkDist D) { + { D.n_rows } -> std::convertible_to; + { D.n_cols } -> std::convertible_to; + { D.major_axis } -> std::convertible_to; +}; + +// ============================================================================= +/// \fn isometry_scale_factor(SkDist D) +/// @verbatim embed:rst:leading-slashes +/// Words here ... +/// @endverbatim +template +inline T isometry_scale_factor(SkDist D); + + +// ============================================================================= +/// @verbatim embed:rst:leading-slashes +/// +/// .. NOTE: \ttt expands to \texttt (its definition is given in an rst file) +/// +/// Words. Hello! 
+/// +/// @endverbatim +template +concept SketchingOperator = requires(SkOp S) { + { S.n_rows } -> std::convertible_to; + { S.n_cols } -> std::convertible_to; + { S.seed_state } -> std::convertible_to; + { S.seed_state } -> std::convertible_to; +}; + + } // end namespace RandBLAS::base diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index ac92ecbd..5cef11f1 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -31,6 +31,7 @@ #include "RandBLAS/base.hh" #include "RandBLAS/exceptions.hh" #include "RandBLAS/random_gen.hh" +#include "RandBLAS/util.hh" #include @@ -46,18 +47,11 @@ namespace RandBLAS::dense { -template -bool compare_ctr(typename RNG::ctr_type c1, typename RNG::ctr_type c2) { - int len = c1.size(); - - for (int ind = len - 1; ind >= 0; ind--) { - if (c1[ind] > c2[ind]) { - return true; - } else if (c1[ind] < c2[ind]) { - return false; - } - } - return false; +template +inline void copy_promote(int n, const T_IN &a, T_OUT* b) { + for (int i = 0; i < n; ++i) + b[i] = static_cast(a[i]); + return; } /** @@ -100,15 +94,7 @@ bool compare_ctr(typename RNG::ctr_type c1, typename RNG::ctr_type c2) { * */ template -static RNGState fill_dense_submat_impl( - int64_t n_cols, - T* smat, - int64_t n_srows, - int64_t n_scols, - int64_t ptr, - const RNGState & seed, - int64_t lda = 0 -) { +static RNGState fill_dense_submat_impl(int64_t n_cols, T* smat, int64_t n_srows, int64_t n_scols, int64_t ptr, const RNGState &seed, int64_t lda = 0) { if (lda <= 0) { lda = n_scols; } else { @@ -118,112 +104,92 @@ static RNGState fill_dense_submat_impl( RNG rng; using CTR_t = typename RNG::ctr_type; using KEY_t = typename RNG::key_type; - CTR_t c = seed.counter; - KEY_t k = seed.key; + const int64_t ctr_size = CTR_t::static_size; int64_t pad = 0; - // ^ computed such that n_cols+pad is divisible by RNG::static_size - if (n_cols % CTR_t::static_size != 0) { - pad = CTR_t::static_size - n_cols % CTR_t::static_size; + // ^ computed such that n_cols+pad is divisible by ctr_size + if (n_cols % ctr_size != 0) { + pad = ctr_size - n_cols % ctr_size; } - - int64_t n_cols_padded = n_cols + pad; - // ^ smallest number of columns, greater than or equal to n_cols, that would be divisible by CTR_t::static_size - int64_t ptr_padded = ptr + ptr / n_cols * pad; + + const int64_t ptr_padded = ptr + ptr / n_cols * pad; // ^ ptr corresponding to the padded matrix - int64_t r0_padded = ptr_padded / CTR_t::static_size; - // ^ starting counter corresponding to ptr_padded - int64_t r1_padded = (ptr_padded + n_scols - 1) / CTR_t::static_size; - // ^ ending counter corresponding to ptr of the last element of the row - int64_t ctr_gap = n_cols_padded / CTR_t::static_size; - // ^ number of counters between the first counter of the row to the first counter of the next row; - int64_t s0 = ptr_padded % CTR_t::static_size; - int64_t e1 = (ptr_padded + n_scols - 1) % CTR_t::static_size; - - int64_t num_thrds = 1; - #if defined(RandBLAS_HAS_OpenMP) - #pragma omp parallel + const int64_t ctr_mat_start = ptr_padded / ctr_size; + const int64_t first_block_start = ptr_padded % ctr_size; + // ^ counter and [position within the counter's array] for index "ptr_padded". + const int64_t ctr_mat_row_end = (ptr_padded + n_scols - 1) / ctr_size; + const int64_t last_block_stop = ((ptr_padded + n_scols - 1) % ctr_size) + 1; + // ^ counter and [1 + position within the counter's array] for index "(ptr_padded + n_scols - 1)". 
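+ // (A worked example from us, with hypothetical sizes, of the index arithmetic above:
+ //  if n_cols = 7 and ctr_size = 4, then pad = 1. For ptr = 10 and n_scols = 5 we get
+ //  ptr_padded = 10 + (10/7)*1 = 11, hence ctr_mat_start = 2, first_block_start = 3,
+ //  ctr_mat_row_end = 3, and last_block_stop = 4.)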
+ const int64_t ctr_inter_row_stride = (n_cols + pad) / ctr_size; + // ^ number of counters between the first counter of a given row to the first counter of the next row; + const bool one_block_per_row = ctr_mat_start == ctr_mat_row_end; + const int64_t first_block_len = ((one_block_per_row) ? last_block_stop : ctr_size) - first_block_start; + + CTR_t temp_c = seed.counter; + temp_c.incr(ctr_mat_start); + const CTR_t c = temp_c; + const KEY_t k = seed.key; + + #pragma omp parallel { - num_thrds = omp_get_num_threads(); - } - #endif - - //Instead of using thrd_arr just initialize ctr_arr to be zero counters; - CTR_t *ctr_arr = new CTR_t[num_thrds]; - for (int i = 0; i < num_thrds; i++) { - ctr_arr[i] = c; - } + #pragma omp for schedule(static) + for (int64_t row = 0; row < n_srows; row++) { - #pragma omp parallel firstprivate(c, k) - { + int64_t incr_from_c = safe_int_product(ctr_inter_row_stride, row); + + auto c_row = c; + c_row.incr(incr_from_c); + auto rv = OP::generate(rng, c_row, k); - auto cc = c; - int64_t prev = 0; - int64_t i; - int64_t r0, r1; - int64_t ind; - int64_t thrd = 0; - - #pragma omp for - for (int row = 0; row < n_srows; row++) { - - #if defined(RandBLAS_HAS_OpenMP) - thrd = omp_get_thread_num(); - #endif - - ind = 0; - r0 = r0_padded + ctr_gap*row; - r1 = r1_padded + ctr_gap*row; - - cc.incr(r0 - prev); - prev = r0; - auto rv = OP::generate(rng, cc, k); - int64_t range = (r1 > r0)? CTR_t::static_size - 1 : e1; - for (i = s0; i <= range; i++) { - smat[ind + row * lda] = rv[i]; - ind++; + T* smat_row = smat + row*lda; + for (int i = 0; i < first_block_len; i++) { + smat_row[i] = rv[i+first_block_start]; } - // middle - int64_t tmp = r0; - while( tmp < r1 - 1) { - cc.incr(); - prev++; - rv = OP::generate(rng, cc, k); - for (i = 0; i < CTR_t::static_size; i++) { - smat[ind + row * lda] = rv[i]; - ind++; - } - tmp++; + if ( one_block_per_row ) { + continue; } - - // end - if ( r1 > r0 ) { - cc.incr(); - prev++; - rv = OP::generate(rng, cc, k); - for (i = 0; i <= e1; i++) { - smat[ind + row * lda] = rv[i]; - ind++; - } + // middle blocks + int64_t ind = first_block_len; + for (int i = 0; i < (ctr_mat_row_end - ctr_mat_start - 1); ++i) { + c_row.incr(); + rv = OP::generate(rng, c_row, k); + copy_promote(ctr_size, rv, smat_row + ind); + ind = ind + ctr_size; } - ctr_arr[thrd] = cc; + // last block + c_row.incr(); + rv = OP::generate(rng, c_row, k); + copy_promote(last_block_stop, rv, smat_row + ind); } - } - //finds the largest counter in the counter array - CTR_t max_c = ctr_arr[0]; - for (int i = 1; i < num_thrds; i++) { - if (compare_ctr(ctr_arr[i], max_c)) { - max_c = ctr_arr[i]; - } - } - delete [] ctr_arr; - - max_c.incr(); + // find the largest counter in the counter array + CTR_t max_c = c; + max_c.incr(n_srows * ctr_inter_row_stride); return RNGState {max_c, k}; } +template +RNGState compute_next_state(DD dist, RNGState state) { + if (dist.major_axis == MajorAxis::Undefined) { + // implies dist.family = DenseDistName::BlackBox + return state; + } + // ^ This is the only place where MajorAxis is actually used to some + // productive end. 
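+ // (A worked example from us, with hypothetical sizes, of the arithmetic below: a DenseDist
+ //  with n_rows = 5, n_cols = 7, major_axis = Long, and ctr_size = 4 gives major_len = 7,
+ //  minor_len = 5, pad = 1, and ctr_major_axis_stride = 2, so the counter advances by
+ //  full_incr = 10 past state.counter.)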
+ int64_t major_len = major_axis_length(dist); + int64_t minor_len = dist.n_rows + (dist.n_cols - major_len); + int64_t ctr_size = RNG::ctr_type::static_size; + int64_t pad = 0; + if (major_len % ctr_size != 0) { + pad = ctr_size - major_len % ctr_size; + } + int64_t ctr_major_axis_stride = (major_len + pad) / ctr_size; + int64_t full_incr = safe_int_product(ctr_major_axis_stride, minor_len); + state.counter.incr(full_incr); + return state; +} + } // end namespace RandBLAS::dense @@ -237,11 +203,12 @@ namespace RandBLAS { /// that can be used in GEMM. enum class DenseDistName : char { // --------------------------------------------------------------------------- - /// Indicates the Gaussian distribution with mean 0 and standard deviation 1. + /// Indicates the Gaussian distribution with mean 0 and variance 1. Gaussian = 'G', // --------------------------------------------------------------------------- - /// Indicates the uniform distribution over [-1, 1]. + /// Indicates the uniform distribution over [-r, r] where r := sqrt(3) + /// is the radius that provides for a variance of 1. Uniform = 'U', // --------------------------------------------------------------------------- @@ -250,7 +217,6 @@ enum class DenseDistName : char { BlackBox = 'B' }; - // ============================================================================= /// A distribution over dense sketching operators. struct DenseDist { @@ -267,14 +233,10 @@ struct DenseDist { const DenseDistName family; // --------------------------------------------------------------------------- - /// This member indirectly sets the storage order of buffers of - /// sketching operators that are sampled from this distribution. - /// - /// We note that the storage order of a DenseSkOp's underlying buffer does not - /// affect whether the operator can be applied to row-major or column-major data. - /// Mismatched data layouts are resolved automatically and - /// with zero copies inside RandBLAS::sketch_general. Therefore most users need - /// not spend any brain power thinking about how this value should be set. + /// This member affects whether samples from this distribution have their + /// entries filled row-wise or column-wise. While there is no statistical + /// difference between these two filling orders, there are situations + /// where one order or the other might be preferred. /// /// @verbatim embed:rst:leading-slashes /// .. dropdown:: *Notes for experts* @@ -285,10 +247,9 @@ struct DenseDist { /// a larger sketching operator and (2) one of larger operator's dimensions /// cannot be known before the iterative process starts. /// - /// Essentially, column-major storage order lets us - /// stack operators horizontally in a consistent way, while row-major storage order - /// lets us stack operators vertically in a consistent way. The mapping from - /// major_axis to storage order is given in the table below. + /// Essentially, a column-wise fill order lets us stack operators horizontally + /// in a consistent way, while row-wise fill order lets us stack vertically + /// in a consistent way. The mapping from major_axis to fill order is given below. /// /// .. 
list-table:: /// :widths: 34 33 33 @@ -298,40 +259,43 @@ struct DenseDist { /// - :math:`\texttt{major_axis} = \texttt{Long}` /// - :math:`\texttt{major_axis} = \texttt{Short}` /// * - :math:`\texttt{n_rows} > \texttt{n_cols}` - /// - column major - /// - row major + /// - column-wise + /// - row-wise /// * - :math:`\texttt{n_rows} \leq \texttt{n_cols}` - /// - row major - /// - column major + /// - row-wise + /// - column-wise /// @endverbatim const MajorAxis major_axis; // --------------------------------------------------------------------------- /// A distribution over matrices of shape (n_rows, n_cols) with entries drawn - /// iid from either the standard normal distribution or the uniform distribution - /// over [-1, 1]. + /// iid from either the default choice of standard normal distribution, or from + /// the uniform distribution over [-r, r], where r := sqrt(3) provides for + /// unit variance. DenseDist( int64_t n_rows, int64_t n_cols, - DenseDistName dn = DenseDistName::Uniform - ) : n_rows(n_rows), n_cols(n_cols), family(dn), major_axis(MajorAxis::Long) { - randblas_require(dn != DenseDistName::BlackBox); - }; + DenseDistName dn = DenseDistName::Gaussian + ) : n_rows(n_rows), n_cols(n_cols), family(dn), major_axis( (dn == DenseDistName::BlackBox) ? MajorAxis::Undefined : MajorAxis::Long) { } - // Only use with struct initializer. DenseDist( int64_t n_rows, int64_t n_cols, DenseDistName dn, MajorAxis ma - ) : n_rows(n_rows), n_cols(n_cols), family(dn), major_axis(ma) { }; + ) : n_rows(n_rows), n_cols(n_cols), family(dn), major_axis(ma) { + if (dn == DenseDistName::BlackBox) { + randblas_require(ma == MajorAxis::Undefined); + } else { + randblas_require(ma != MajorAxis::Undefined); + } + } }; -inline blas::Layout dist_to_layout( - DenseDist D -) { +inline blas::Layout dist_to_layout(const DenseDist &D) { + randblas_require(D.major_axis != MajorAxis::Undefined); bool is_wide = D.n_rows < D.n_cols; bool fa_long = D.major_axis == MajorAxis::Long; if (is_wide && fa_long) { @@ -345,22 +309,31 @@ inline blas::Layout dist_to_layout( } } -inline int64_t major_axis_length( - DenseDist D -) { +inline int64_t major_axis_length(const DenseDist &D) { + randblas_require(D.major_axis != MajorAxis::Undefined); return (D.major_axis == MajorAxis::Long) ? std::max(D.n_rows, D.n_cols) : std::min(D.n_rows, D.n_cols); } +template +inline T isometry_scale_factor(DenseDist D) { + if (D.family == DenseDistName::BlackBox) { + throw std::runtime_error("Unrecognized distribution."); + } + // When we sample from the scalar distributions we always + // scale things so they're variance-1. + return std::pow((T) std::min(D.n_rows, D.n_cols), -0.5); +} + + // ============================================================================= /// A sample from a distribution over dense sketching operators. /// template struct DenseSkOp { - using generator = RNG; - using state_type = RNGState; - using buffer_type = T; + using state_t = RNGState; + using scalar_t = T; const int64_t n_rows; const int64_t n_cols; @@ -379,11 +352,13 @@ struct DenseSkOp { // --------------------------------------------------------------------------- /// The state that should be used by the next call to an RNG *after* the /// full sketching operator has been sampled. - RNGState next_state; + const RNGState next_state; + + + T *buff = nullptr; // memory + blas::Layout layout; // matrix storage order + bool del_buff_on_destruct = false; // only applies if fill_dense(S) has been called. 
- T *buff = nullptr; // memory - const blas::Layout layout; // matrix storage order - bool del_buff_on_destruct = false; // only applies if fill_dense(S) has been called. ///////////////////////////////////////////////////////////////////// // @@ -391,79 +366,58 @@ struct DenseSkOp { // ///////////////////////////////////////////////////////////////////// - // Elementary constructor: needs an implementation DenseSkOp( + int64_t n_rows, + int64_t n_cols, DenseDist dist, - RNGState const &state, - T *buff - ); + RNGState const &seed_state, + RNGState const &next_state, + T *buff, + blas::Layout layout, + bool del_buff_on_destruct + ) : + n_rows(n_rows), n_cols(n_cols), dist(dist), + seed_state(seed_state), next_state(next_state), + buff(buff), layout(layout), del_buff_on_destruct(del_buff_on_destruct) { } ///--------------------------------------------------------------------------- - /// The preferred constructor for DenseSkOp objects. There are other - /// constructors, but they don't appear in the web documentation. + /// Construct a DenseSkOp object, \math{S}. /// /// @param[in] dist - /// A DenseDist object. - /// - Defines the number of rows and columns in this sketching operator. - /// - Defines the (scalar-valued) distribution of each entry in this sketching operator. + /// DenseDist. + /// - Specifies the dimensions of \math{S}. + /// - Specifies the (scalar) distribution of \math{S}'s entries. /// /// @param[in] state - /// An RNGState object. + /// RNGState. /// - The RNG will use this as the starting point to generate all - /// random numbers needed for this sketching operator. + /// random numbers needed for \math{S}. /// DenseSkOp( DenseDist dist, RNGState const &state - ) : DenseSkOp(dist, state, nullptr) {}; - - // Convenience constructor (a wrapper) - DenseSkOp( - DenseDist dist, - uint32_t key, - T *buff = nullptr - ) : DenseSkOp(dist, RNGState(key), buff) {}; - - // Convenience constructor (a wrapper) - DenseSkOp( - DenseDistName family, - int64_t n_rows, - int64_t n_cols, - uint32_t key, - T *buff = nullptr, - MajorAxis ma = MajorAxis::Long - ) : DenseSkOp(DenseDist{n_rows, n_cols, family, ma}, RNGState(key), buff) {}; + ) : // variable definitions + n_rows(dist.n_rows), + n_cols(dist.n_cols), + dist(dist), + seed_state(state), + next_state(dense::compute_next_state(dist, state)), + buff(nullptr), + layout(dist_to_layout(dist)) + { // sanity checks + randblas_require(this->dist.n_rows > 0); + randblas_require(this->dist.n_cols > 0); + if (dist.family == DenseDistName::BlackBox) + randblas_require(this->buff != nullptr); + }; // Destructor - ~DenseSkOp(); + ~DenseSkOp() { + if (this->del_buff_on_destruct) + delete [] this->buff; + } }; -template -DenseSkOp::DenseSkOp( - DenseDist dist, - RNGState const &state, - T *buff -) : // variable definitions - n_rows(dist.n_rows), - n_cols(dist.n_cols), - dist(dist), - seed_state(state), - next_state{}, - buff(buff), - layout(dist_to_layout(dist)) -{ // sanity checks - randblas_require(this->dist.n_rows > 0); - randblas_require(this->dist.n_cols > 0); - if (dist.family == DenseDistName::BlackBox) - randblas_require(this->buff != nullptr); -} - -template -DenseSkOp::~DenseSkOp() { - if (this->del_buff_on_destruct) { - delete [] this->buff; - } -} // ============================================================================= /// @verbatim embed:rst:leading-slashes @@ -475,10 +429,12 @@ DenseSkOp::~DenseSkOp() { /// .. |ncols| mathmacro:: \mathtt{n\_cols} /// .. |ioff| mathmacro:: \mathtt{i\_off} /// .. 
|joff| mathmacro:: \mathtt{j\_off} +/// .. |layout| mathmacro:: \mathtt{layout} /// /// @endverbatim -/// Fill \math{\buff} so that \math{\mat(\buff)} is a submatrix of -/// an _implicit_ random sample from \math{\D}. +/// Fill \math{\buff} so that (1) \math{\mat(\buff)} is a submatrix of +/// an _implicit_ random sample from \math{\D}, and (2) \math{\mat(\buff)} +/// is determined by reading from \math{\buff} in \math{\layout} order. /// /// If we denote the implicit sample from \math{\D} by \math{S}, then we have /// @verbatim embed:rst:leading-slashes @@ -487,6 +443,22 @@ DenseSkOp::~DenseSkOp() { /// @endverbatim /// on exit. /// +/// This function is for generating low-level representations of matrices +/// that are equivalent to a submatrix of a RandBLAS DenseSkOp, but +/// without using the DenseSkOp abstraction. This can be useful if you want +/// to sketch a structured matrix that RandBLAS doesn't support (like a symmetric +/// matrix whose values are only stored in the upper or lower triangle). +/// +/// Note that since the entries of \math{\buff} are sampled iid from a common +/// distribution, the value of \math{\layout} is unlikely to have mathematical significance. +/// However, the value of \math{\layout} can affect this function's efficiency. +/// For best efficiency we recommend \math{\layout = \mathtt{dist\_to\_layout}(\D).} +/// If a different value of \math{\layout} is used, then this function will internally +/// allocate extra memory for an out-of-place change of storage order. +/// +/// @param[in] layout +/// blas::Layout::RowMajor or blas::Layout::ColMajor +/// - The storage order for \math{\mat(\buff)} on exit. /// @param[in] D /// A DenseDist object. /// - A distribution over random matrices of shape (D.n_rows, D.n_cols). @@ -507,63 +479,56 @@ DenseSkOp::~DenseSkOp() { /// @param[in] buff /// Buffer of type T. /// - Length must be at least \math{\nrows \cdot \ncols}. -/// - The leading dimension of \math{\mat(\buff)} when reading from \math{\buff} -/// is either \math{\nrows} or \math{\ncols}, depending on the return value of this function -/// that indicates row-major or column-major layout. /// @param[in] seed /// A CBRNG state /// - Used to define \math{S} as a sample from \math{\D}. -/// -/// @returns -/// A std::pair consisting of "layout" and "next_state". -/// - \math{\buff} must be read in "layout" order -/// to recover \math{\mat(\buff)}. This layout is determined -/// from \math{\D} and cannot be controlled directly. -/// - If this function returns a layout that is undesirable then it is -/// the caller's responsibility to perform a transpose as needed. 
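+// An illustrative usage sketch from us for the overload below (sizes are made up, error
+// handling is omitted, and template arguments are elided as in the tutorial snippets).
+// It fills a 3-by-4 submatrix of an implicit 10-by-20 Gaussian operator, with the result
+// read in the distribution's natural storage order:
+//
+//     RandBLAS::DenseDist D(10, 20);              // defaults to DenseDistName::Gaussian
+//     std::vector<double> buff(3 * 4);
+//     auto layout = RandBLAS::dist_to_layout(D);  // the most efficient layout for this D
+//     RandBLAS::RNGState seed(42);
+//     auto next = RandBLAS::fill_dense(layout, D, 3, 4, 5, 6, buff.data(), seed);
+//     // buff holds submat(S) with row offset 5 and column offset 6; "next" can be used
+//     // to seed a statistically independent operator.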
/// -template -std::pair> fill_dense( - const DenseDist &D, - int64_t n_rows, - int64_t n_cols, - int64_t ro_s, - int64_t co_s, - T* buff, - const RNGState &seed -) { +template +RNGState fill_dense(blas::Layout layout, const DenseDist &D, int64_t n_rows, int64_t n_cols, int64_t ro_s, int64_t co_s, T* buff, const RNGState &seed) { using RandBLAS::dense::fill_dense_submat_impl; randblas_require(D.n_rows >= n_rows + ro_s); randblas_require(D.n_cols >= n_cols + co_s); - blas::Layout layout = dist_to_layout(D); + blas::Layout natural_layout = dist_to_layout(D); int64_t ma_len = major_axis_length(D); int64_t n_rows_, n_cols_, ptr; - if (layout == blas::Layout::ColMajor) { + if (natural_layout == blas::Layout::ColMajor) { // operate on the transpose in row-major n_rows_ = n_cols; n_cols_ = n_rows; - ptr = ro_s + co_s * ma_len; + ptr = ro_s + safe_int_product(co_s, ma_len); } else { n_rows_ = n_rows; n_cols_ = n_cols; - ptr = ro_s * ma_len + co_s; + ptr = safe_int_product(ro_s, ma_len) + co_s; } + RNGState next_state{}; switch (D.family) { case DenseDistName::Gaussian: { - auto next_state_g = fill_dense_submat_impl(ma_len, buff, n_rows_, n_cols_, ptr, seed); - return std::make_pair(layout, next_state_g); + next_state = fill_dense_submat_impl(ma_len, buff, n_rows_, n_cols_, ptr, seed); + break; } case DenseDistName::Uniform: { - auto next_state_u = fill_dense_submat_impl(ma_len, buff, n_rows_, n_cols_, ptr, seed); - return std::make_pair(layout, next_state_u); + next_state = fill_dense_submat_impl(ma_len, buff, n_rows_, n_cols_, ptr, seed); + blas::scal(n_rows_ * n_cols_, (T)std::sqrt(3), buff, 1); + break; } case DenseDistName::BlackBox: { - throw std::invalid_argument(std::string("fill_buff cannot be called with the BlackBox distribution.")); + throw std::invalid_argument(std::string("fill_dense cannot be called with the BlackBox distribution.")); } default: { throw std::runtime_error(std::string("Unrecognized distribution.")); } } + int64_t size_mat = n_rows * n_cols; + if (layout != natural_layout) { + T* flip_work = new T[size_mat]; + blas::copy(size_mat, buff, 1, flip_work, 1); + auto [irs_nat, ics_nat] = layout_to_strides(natural_layout, n_rows, n_cols); + auto [irs_req, ics_req] = layout_to_strides(layout, n_rows, n_cols); + util::omatcopy(n_rows, n_cols, flip_work, irs_nat, ics_nat, buff, irs_req, ics_req); + delete [] flip_work; + } + return next_state; } // ============================================================================= @@ -597,13 +562,9 @@ std::pair> fill_dense( /// - If this function returns a layout that is undesirable then it is /// the caller's responsibility to perform a transpose as needed. /// -template -std::pair> fill_dense( - const DenseDist &D, - T *buff, - const RNGState &seed -) { - return fill_dense(D, D.n_rows, D.n_cols, 0, 0, buff, seed); +template +RNGState fill_dense(const DenseDist &D, T *buff, const RNGState &seed) { + return fill_dense(dist_to_layout(D), D, D.n_rows, D.n_cols, 0, 0, buff, seed); } // ============================================================================= @@ -618,23 +579,26 @@ std::pair> fill_dense( /// /// @param[in] S /// A DenseSkOp object. -/// -/// @return -/// An RNGState object. This is the state that should be used the next -/// time the program needs to generate random numbers for use in a randomized -/// algorithm. 
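+// An illustrative usage sketch from us (we only spell out the scalar type; other template
+// arguments take their defaults, as in the tutorial snippets):
+//
+//     RandBLAS::DenseDist D(10000, 50);
+//     RandBLAS::RNGState seed(0);
+//     RandBLAS::DenseSkOp<double> S(D, seed);
+//     RandBLAS::fill_dense(S);
+//     // S.buff is now heap-allocated and filled in dist_to_layout(S.dist) order, and
+//     // del_buff_on_destruct is set, so ~DenseSkOp() releases the memory.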
/// -template -RNGState fill_dense( - DenseSkOp &S -) { - randblas_require(!S.buff); +template +void fill_dense(DenseSkOp &S) { + using T = typename DenseSkOp::scalar_t; + randblas_require(S.buff == nullptr); randblas_require(S.dist.family != DenseDistName::BlackBox); S.buff = new T[S.dist.n_rows * S.dist.n_cols]; - auto [layout, next_state] = fill_dense(S.dist, S.buff, S.seed_state); - S.next_state = next_state; + fill_dense(S.dist, S.buff, S.seed_state); S.del_buff_on_destruct = true; - return next_state; + return; +} + +template +DenseSkOp submatrix_as_blackbox(const DenseSkOp &S, int64_t n_rows, int64_t n_cols, int64_t ro_s, int64_t co_s) { + T *buff = new T[n_rows * n_cols]; + auto dl = dist_to_layout(S.dist); + fill_dense(dl, S.dist, n_rows, n_cols, ro_s, co_s, buff, S.seed_state); + DenseDist submatrix_dist{n_rows, n_cols, DenseDistName::BlackBox, MajorAxis::Undefined}; + DenseSkOp submatrix{n_rows, n_cols, submatrix_dist, S.seed_state, S.next_state, buff, dl, true}; + return submatrix; } } // end namespace RandBLAS diff --git a/RandBLAS/random_gen.hh b/RandBLAS/random_gen.hh index d7dcf127..f502e981 100644 --- a/RandBLAS/random_gen.hh +++ b/RandBLAS/random_gen.hh @@ -73,8 +73,7 @@ static inline void sincospi(double x, double *s, double *c) { #include #include #include -#include -// NOTE: we do not support Random123's AES generator. +// NOTE: we do not support Random123's AES or ARS generators. #include #include @@ -152,9 +151,9 @@ struct uneg11 * * @tparam RNG a random123 CBRNG type * - * @param[in] a random123 CBRNG instance used to generate the sequence - * @param[in] the CBRNG counter - * @param[in] the CBRNG key + * @param[in] rng: a random123 CBRNG instance used to generate the sequence + * @param[in] c: CBRNG counter + * @param[in] k: CBRNG key * * @returns a std::array where N is the CBRNG's ctr_type::static_size * and T is deduced from the RNG's counter element type : float diff --git a/RandBLAS/skge.hh b/RandBLAS/skge.hh index 61590105..89ff455d 100644 --- a/RandBLAS/skge.hh +++ b/RandBLAS/skge.hh @@ -190,14 +190,8 @@ void lskge3( ){ auto [rows_submat_S, cols_submat_S] = dims_before_op(d, m, opS); if (!S.buff) { - // We'll make a shallow copy of the sketching operator, take responsibility for filling the memory - // of that sketching operator, and then call LSKGE3 with that new object. - T *buff = new T[rows_submat_S * cols_submat_S]; - fill_dense(S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, buff, S.seed_state); - DenseDist D{rows_submat_S, cols_submat_S, DenseDistName::BlackBox, S.dist.major_axis}; - DenseSkOp S_(D, S.seed_state, buff); - lskge3(layout, opS, opA, d, n, m, alpha, S_, 0, 0, A, lda, beta, B, ldb); - delete [] buff; + auto submat_S = submatrix_as_blackbox(S, rows_submat_S, cols_submat_S, ro_s, co_s); + lskge3(layout, opS, opA, d, n, m, alpha, submat_S, 0, 0, A, lda, beta, B, ldb); return; } randblas_require( S.dist.n_rows >= rows_submat_S + ro_s ); @@ -345,12 +339,8 @@ void rskge3( if (!S.buff) { // We'll make a shallow copy of the sketching operator, take responsibility for filling the memory // of that sketching operator, and then call RSKGE3 with that new object. 
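+ // (A clarifying note from us: submatrix_as_blackbox, defined in dense_skops.hh, allocates an
+ //  n_rows-by-n_cols buffer, fills it from S.seed_state at offset (ro_s, co_s), and wraps it in
+ //  a BlackBox DenseSkOp that owns the buffer, so the temporary operator frees it on destruction.)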
- T *buff = new T[rows_submat_S * cols_submat_S]; - fill_dense(S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, buff, S.seed_state); - DenseDist D{rows_submat_S, cols_submat_S, DenseDistName::BlackBox, S.dist.major_axis}; - DenseSkOp S_(D, S.seed_state, buff); - rskge3(layout, opA, opS, m, d, n, alpha, A, lda, S_, 0, 0, beta, B, ldb); - delete [] buff; + auto submat_S = submatrix_as_blackbox(S, rows_submat_S, cols_submat_S, ro_s, co_s); + rskge3(layout, opA, opS, m, d, n, alpha, A, lda, submat_S, 0, 0, beta, B, ldb); return; } randblas_require( S.dist.n_rows >= rows_submat_S + ro_s ); @@ -512,12 +502,9 @@ inline void lskges( ) { if (!S.known_filled) fill_sparse(S); - using RNG = typename SKOP::RNG_t; - using sint_t = typename SKOP::index_t; - auto Scoo = coo_view_of_skop(S); + auto Scoo = coo_view_of_skop(S); left_spmm( - layout, opS, opA, d, n, m, alpha, Scoo, ro_s, co_s, - A, lda, beta, B, ldb + layout, opS, opA, d, n, m, alpha, Scoo, ro_s, co_s, A, lda, beta, B, ldb ); return; } @@ -646,9 +633,7 @@ inline void rskges( ) { if (!S.known_filled) fill_sparse(S); - using RNG = typename SKOP::RNG_t; - using sint = typename SKOP::index_t; - auto Scoo = coo_view_of_skop(S); + auto Scoo = coo_view_of_skop(S); right_spmm( layout, opA, opS, m, d, n, alpha, A, lda, Scoo, ro_s, co_s, beta, B, ldb ); diff --git a/RandBLAS/sparse_data/sksp.hh b/RandBLAS/sparse_data/sksp.hh index 6be7912b..8045b4d8 100644 --- a/RandBLAS/sparse_data/sksp.hh +++ b/RandBLAS/sparse_data/sksp.hh @@ -166,12 +166,8 @@ void lsksp3( // B = op(submat(S)) @ op(submat(A)) auto [rows_submat_S, cols_submat_S] = dims_before_op(d, m, opS); if (!S.buff) { - T *buff = new T[rows_submat_S * cols_submat_S]; - fill_dense(S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, buff, S.seed_state); - DenseDist D{rows_submat_S, cols_submat_S, DenseDistName::BlackBox, S.dist.major_axis}; - DenseSkOp S_(D, S.seed_state, buff); - lsksp3(layout, opS, opA, d, n, m, alpha, S_, 0, 0, A, ro_a, co_a, beta, B, ldb); - delete [] buff; + auto submat_S = submatrix_as_blackbox(S, rows_submat_S, cols_submat_S, ro_s, co_s); + lsksp3(layout, opS, opA, d, n, m, alpha, submat_S, 0, 0, A, ro_a, co_a, beta, B, ldb); return; } @@ -324,12 +320,8 @@ void rsksp3( ) { auto [rows_submat_S, cols_submat_S] = dims_before_op(n, d, opS); if (!S.buff) { - T *buff = new T[rows_submat_S * cols_submat_S]; - fill_dense(S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, buff, S.seed_state); - DenseDist D{rows_submat_S, cols_submat_S, DenseDistName::BlackBox, S.dist.major_axis}; - DenseSkOp S_(D, S.seed_state, buff); - rsksp3(layout, opA, opS, m, d, n, alpha, A, ro_a, co_a, S_, 0, 0, beta, B, ldb); - delete [] buff; + auto submat_S = submatrix_as_blackbox(S, rows_submat_S, cols_submat_S, ro_s, co_s); + rsksp3(layout, opA, opS, m, d, n, alpha, A, ro_a, co_a, submat_S, 0, 0, beta, B, ldb); return; } auto [rows_submat_A, cols_submat_A] = dims_before_op(m, n, opA); diff --git a/RandBLAS/sparse_skops.hh b/RandBLAS/sparse_skops.hh index fe7c8aab..b6c85fcc 100644 --- a/RandBLAS/sparse_skops.hh +++ b/RandBLAS/sparse_skops.hh @@ -44,6 +44,89 @@ #define MAX(a, b) (((a) < (b)) ? (b) : (a)) #define MIN(a, b) (((a) < (b)) ? (a) : (b)) +namespace RandBLAS::sparse { + + +// ============================================================================= +/// WARNING: this function is not part of the public API. 
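+/// (An illustration from us, not a specification: with vec_nnz = 2, dim_major = 5, and
+/// dim_minor = 3, the function below writes 3 blocks of 2 entries each. Block i gives
+/// idxs_major two distinct indices from {0,...,4}, gives idxs_minor (if non-null) the value i
+/// twice, and gives vals (if non-null) random signs drawn from {+1.0, -1.0}.)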
+/// +template +static RNGState repeated_fisher_yates( + const RNGState &state, + int64_t vec_nnz, + int64_t dim_major, + int64_t dim_minor, + sint_t *idxs_major, + sint_t *idxs_minor, + T *vals +) { + bool write_vals = vals != nullptr; + bool write_idxs_minor = idxs_minor != nullptr; + randblas_error_if(vec_nnz > dim_major); + std::vector vec_work(dim_major); + for (sint_t j = 0; j < dim_major; ++j) + vec_work[j] = j; + std::vector pivots(vec_nnz); + RNG gen; + auto [ctr, key] = state; + for (sint_t i = 0; i < dim_minor; ++i) { + sint_t offset = i * vec_nnz; + auto ctr_work = ctr; + ctr_work.incr(offset); + for (sint_t j = 0; j < vec_nnz; ++j) { + // one step of Fisher-Yates shuffling + auto rv = gen(ctr_work, key); + sint_t ell = j + rv[0] % (dim_major - j); + pivots[j] = ell; + sint_t swap = vec_work[ell]; + vec_work[ell] = vec_work[j]; + vec_work[j] = swap; + // update (rows, cols, vals) + idxs_major[j + offset] = (sint_t) swap; + if (write_vals) + vals[j + offset] = (rv[1] % 2 == 0) ? 1.0 : -1.0; + if (write_idxs_minor) + idxs_minor[j + offset] = (sint_t) i; + // increment counter + ctr_work.incr(); + } + // Restore vec_work for next iteration of Fisher-Yates. + // This isn't necessary from a statistical perspective, + // but it makes it easier to generate submatrices of + // a given SparseSkOp. + for (sint_t j = 1; j <= vec_nnz; ++j) { + sint_t jj = vec_nnz - j; + sint_t swap = idxs_major[jj + offset]; + sint_t ell = pivots[jj]; + vec_work[jj] = vec_work[ell]; + vec_work[ell] = swap; + } + } + return RNGState {ctr, key}; +} + +template +inline RNGState repeated_fisher_yates( + const RNGState &state, int64_t k, int64_t n, int64_t r, sint_t *indices +) { + return repeated_fisher_yates(state, k, n, r, indices, (sint_t*) nullptr, (double*) nullptr); +} + +template +RNGState compute_next_state(SD dist, RNGState state) { + int64_t minor_len; + if (dist.major_axis == MajorAxis::Short) { + minor_len = std::min(dist.n_rows, dist.n_cols); + } else { + minor_len = std::max(dist.n_rows, dist.n_cols); + } + int64_t full_incr = minor_len * dist.vec_nnz; + state.counter.incr(full_incr); + return state; +} + +} + namespace RandBLAS { // ============================================================================= /// A distribution over sparse matrices. @@ -81,7 +164,18 @@ struct SparseDist { const MajorAxis major_axis = MajorAxis::Short; }; -using RandBLAS::SignedInteger; +template +inline T isometry_scale_factor(SparseDist D) { + T vec_nnz = (T) D.vec_nnz; + if (D.major_axis == MajorAxis::Short) { + return std::pow(vec_nnz, -0.5); + } else { + T minor_ax_len = (T) std::min(D.n_rows, D.n_cols); + T major_ax_len = (T) std::max(D.n_rows, D.n_cols); + return std::sqrt( major_ax_len / (vec_nnz * minor_ax_len) ); + } +} + // ============================================================================= /// A sample from a prescribed distribution over sparse matrices. @@ -89,9 +183,9 @@ using RandBLAS::SignedInteger; template struct SparseSkOp { - using RNG_t = RNG; - using T_t = T; using index_t = sint_t; + using state_t = RNGState; + using scalar_t = T; const int64_t n_rows; const int64_t n_cols; @@ -110,7 +204,7 @@ struct SparseSkOp { // --------------------------------------------------------------------------- /// The state that should be used by the next call to an RNG *after* the /// full sketching operator has been sampled. 
- RNGState next_state; + const RNGState next_state; // --------------------------------------------------------------------------- /// We need workspace to store a representation of the sampled sketching @@ -178,7 +272,42 @@ struct SparseSkOp { sint_t *cols, T *vals, bool known_filled = true - ); + ) : // variable definitions + n_rows(dist.n_rows), + n_cols(dist.n_cols), + dist(dist), + seed_state(state), + own_memory(false), + next_state(sparse::compute_next_state(dist, seed_state)) + { // sanity checks + randblas_require(this->dist.n_rows > 0); + randblas_require(this->dist.n_cols > 0); + randblas_require(this->dist.vec_nnz > 0); + // actual work + this->rows = rows; + this->cols = cols; + this->vals = vals; + this->known_filled = known_filled; + }; + + // Useful for shallow copies (possibly with transposition) + SparseSkOp( + SparseDist dist, + const RNGState &seed_state, + sint_t *rows, + sint_t *cols, + T *vals, + const RNGState &next_state + ) : n_rows(dist.n_rows), n_cols(dist.n_cols), dist(dist), seed_state(seed_state), next_state(next_state), own_memory(false) { + randblas_require(this->dist.n_rows > 0); + randblas_require(this->dist.n_cols > 0); + randblas_require(this->dist.vec_nnz > 0); + // actual work + this->rows = rows; + this->cols = cols; + this->vals = vals; + this->known_filled = known_filled; + } SparseSkOp( SparseDist dist, @@ -207,7 +336,29 @@ struct SparseSkOp { SparseSkOp( SparseDist dist, const RNGState &state - ); + ) : // variable definitions + n_rows(dist.n_rows), + n_cols(dist.n_cols), + dist(dist), + seed_state(state), + next_state(sparse::compute_next_state(dist, seed_state)), + own_memory(true) + { // sanity checks + randblas_require(this->dist.n_rows > 0); + randblas_require(this->dist.n_cols > 0); + randblas_require(this->dist.vec_nnz > 0); + // actual work + int64_t minor_ax_len; + if (this->dist.major_axis == MajorAxis::Short) { + minor_ax_len = MAX(this->dist.n_rows, this->dist.n_cols); + } else { + minor_ax_len = MIN(this->dist.n_rows, this->dist.n_cols); + } + int64_t nnz = this->dist.vec_nnz * minor_ax_len; + this->rows = new sint_t[nnz]; + this->cols = new sint_t[nnz]; + this->vals = new T[nnz]; + } SparseSkOp( SparseDist dist, @@ -216,68 +367,12 @@ struct SparseSkOp { // Destructor - ~SparseSkOp(); -}; - - -template -SparseSkOp::SparseSkOp( - SparseDist dist, - const RNGState &state -) : // variable definitions - n_rows(dist.n_rows), - n_cols(dist.n_cols), - dist(dist), - seed_state(state), - own_memory(true) -{ // sanity checks - randblas_require(this->dist.n_rows > 0); - randblas_require(this->dist.n_cols > 0); - randblas_require(this->dist.vec_nnz > 0); - // actual work - int64_t minor_ax_len; - if (this->dist.major_axis == MajorAxis::Short) { - minor_ax_len = MAX(this->dist.n_rows, this->dist.n_cols); - } else { - minor_ax_len = MIN(this->dist.n_rows, this->dist.n_cols); - } - int64_t nnz = this->dist.vec_nnz * minor_ax_len; - this->rows = new sint_t[nnz]; - this->cols = new sint_t[nnz]; - this->vals = new T[nnz]; -}; - -template -SparseSkOp::SparseSkOp( - SparseDist dist, - const RNGState &state, - sint_t *rows, - sint_t *cols, - T *vals, - bool known_filled -) : // variable definitions - n_rows(dist.n_rows), - n_cols(dist.n_cols), - dist(dist), - seed_state(state), - own_memory(false) -{ // sanity checks - randblas_require(this->dist.n_rows > 0); - randblas_require(this->dist.n_cols > 0); - randblas_require(this->dist.vec_nnz > 0); - // actual work - this->rows = rows; - this->cols = cols; - this->vals = vals; - this->known_filled = 
known_filled; -}; - -template -SparseSkOp::~SparseSkOp() { - if (this->own_memory) { - delete [] this->rows; - delete [] this->cols; - delete [] this->vals; + ~SparseSkOp() { + if (this->own_memory) { + delete [] this->rows; + delete [] this->cols; + delete [] this->vals; + } } }; @@ -290,36 +385,31 @@ SparseSkOp::~SparseSkOp() { /// /// @param[in] S /// SparseSkOp object. -/// -/// @return -/// An RNGState object. This is the state that should be used the next -/// time the program needs to generate random numbers for a randomized -/// algorithm. /// -template -RNGState fill_sparse( - SparseSkOp & S -) { +template +void fill_sparse(SparseSkOp &S) { + int64_t long_ax_len = MAX(S.dist.n_rows, S.dist.n_cols); int64_t short_ax_len = MIN(S.dist.n_rows, S.dist.n_cols); - bool is_wide = S.dist.n_rows == short_ax_len; + + using sint_t = typename SparseSkOp::index_t; sint_t *short_ax_idxs = (is_wide) ? S.rows : S.cols; - sint_t *long_ax_idxs = (is_wide) ? S.cols : S.rows; + sint_t *long_ax_idxs = (is_wide) ? S.cols : S.rows; if (S.dist.major_axis == MajorAxis::Short) { - S.next_state = repeated_fisher_yates( + sparse::repeated_fisher_yates( S.seed_state, S.dist.vec_nnz, short_ax_len, long_ax_len, short_ax_idxs, long_ax_idxs, S.vals ); } else { - S.next_state = repeated_fisher_yates( + sparse::repeated_fisher_yates( S.seed_state, S.dist.vec_nnz, long_ax_len, short_ax_len, long_ax_idxs, short_ax_idxs, S.vals ); } S.known_filled = true; - return S.next_state; + return; } template @@ -352,60 +442,6 @@ void print_sparse(SKOP const &S0) { std::cout << std::endl; } -// ============================================================================= -/// WARNING: this function is not part of the public API. -/// -template -static auto repeated_fisher_yates( - const RNGState &state, - int64_t vec_nnz, - int64_t dim_major, - int64_t dim_minor, - sint_t *idxs_major, - sint_t *idxs_minor, - T *vals -) { - randblas_error_if(vec_nnz > dim_major); - std::vector vec_work(dim_major); - for (sint_t j = 0; j < dim_major; ++j) - vec_work[j] = j; - std::vector pivots(vec_nnz); - RNG gen; - auto [ctr, key] = state; - for (sint_t i = 0; i < dim_minor; ++i) { - sint_t offset = i * vec_nnz; - auto ctri = ctr; - ctri.incr(offset); - for (sint_t j = 0; j < vec_nnz; ++j) { - // one step of Fisher-Yates shuffling - auto rv = gen(ctri, key); - sint_t ell = j + rv[0] % (dim_major - j); - pivots[j] = ell; - sint_t swap = vec_work[ell]; - vec_work[ell] = vec_work[j]; - vec_work[j] = swap; - // update (rows, cols, vals) - idxs_major[j + offset] = (sint_t) swap; - vals[j + offset] = (rv[1] % 2 == 0) ? 1.0 : -1.0; - idxs_minor[j + offset] = (sint_t) i; - // increment counter - ctri.incr(); - } - // Restore vec_work for next iteration of Fisher-Yates. - // This isn't necessary from a statistical perspective, - // but it makes it easier to generate submatrices of - // a given SparseSkOp. 
- for (sint_t j = 1; j <= vec_nnz; ++j) { - sint_t jj = vec_nnz - j; - sint_t swap = idxs_major[jj + offset]; - sint_t ell = pivots[jj]; - vec_work[jj] = vec_work[ell]; - vec_work[ell] = swap; - } - ctr = ctri; - } - return RNGState {ctr, key}; -} } // end namespace RandBLAS @@ -444,16 +480,12 @@ static int64_t nnz( } } - -template -COOMatrix coo_view_of_skop(SparseSkOp &S) { +template +COOMatrix coo_view_of_skop(SkOp &S) { if (!S.known_filled) fill_sparse(S); int64_t nnz = RandBLAS::sparse::nnz(S); - COOMatrix A( - S.dist.n_rows, S.dist.n_cols, nnz, - S.vals, S.rows, S.cols - ); + COOMatrix A(S.dist.n_rows, S.dist.n_cols, nnz, S.vals, S.rows, S.cols); return A; } diff --git a/RandBLAS/util.hh b/RandBLAS/util.hh index 31183362..22242449 100644 --- a/RandBLAS/util.hh +++ b/RandBLAS/util.hh @@ -29,6 +29,7 @@ #pragma once +#include #include #include #include @@ -58,8 +59,7 @@ void safe_scal(int64_t n, T a, T* x, int64_t inc_x) { } template -void print_colmaj(int64_t n_rows, int64_t n_cols, T *a, char label[]) -{ +void print_colmaj(int64_t n_rows, int64_t n_cols, T *a, const char label[]) { int64_t i, j; T val; std::cout << "\n" << label << std::endl; @@ -117,8 +117,7 @@ std::string type_name() { // call as type_name() } template -void symmetrize(blas::Layout layout, blas::Uplo uplo, T* A, int64_t n, int64_t lda) { - +void symmetrize(blas::Layout layout, blas::Uplo uplo, int64_t n, T* A, int64_t lda) { auto [inter_row_stride, inter_col_stride] = layout_to_strides(layout, lda); #define matA(_i, _j) A[(_i)*inter_row_stride + (_j)*inter_col_stride] if (uplo == blas::Uplo::Upper) { @@ -140,6 +139,29 @@ void symmetrize(blas::Layout layout, blas::Uplo uplo, T* A, int64_t n, int64_t l return; } +template +void overwrite_triangle(blas::Layout layout, blas::Uplo to_overwrite, int64_t n, int64_t strict_offset, T val, T* A, int64_t lda) { + auto [inter_row_stride, inter_col_stride] = layout_to_strides(layout, lda); + #define matA(_i, _j) A[(_i)*inter_row_stride + (_j)*inter_col_stride] + if (to_overwrite == blas::Uplo::Upper) { + for (int64_t i = 0; i < n; ++i) { + for (int64_t j = i + strict_offset; j < n; ++j) { + matA(i,j) = val; + } + } + } else if (to_overwrite == blas::Uplo::Lower) { + for (int64_t i = 0; i < n; ++i) { + for (int64_t j = i + strict_offset; j < n; ++j) { + matA(j,i) = val; + } + } + } else { + throw std::runtime_error("Invalid argument for UPLO."); + } + #undef matA + return; +} + template void require_symmetric(blas::Layout layout, const T* A, int64_t n, int64_t lda, T tol) { if (tol < 0) @@ -156,6 +178,8 @@ void require_symmetric(blas::Layout layout, const T* A, int64_t n, int64_t lda, std::string message = "Symmetry check failed. |A(%i,%i) - A(%i,%i)| was %d, which exceeds tolerance of %d."; auto _message = message.c_str(); randblas_error_if_msg(viol > rel_tol, _message, i, j, j, i, viol, rel_tol); + // ^ TODO: fix this macro. Apparently it doesn't print out all that I'd like. Example I just got: + // "Symmetry check failed. |A(0,1) - A(1,0)| was 1610612736, which exceeds toleranc, in function require_symmetric" thrown in the test body. } } } @@ -179,6 +203,22 @@ void transpose_square(T* A, int64_t n, int64_t lda) { return; } +template +void omatcopy(int64_t m, int64_t n, const T* A, int64_t irs_a, int64_t ics_a, T* B, int64_t irs_b, int64_t ics_b) { + // TODO: + // 1. Order the loops with consideration to cache efficiency. + // 2. Vectorize one of the loops with blas::copy or std::memcpy. 
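+// (An example from us: to convert a 3-by-4 column-major buffer src with lda = 3 into a
+//  row-major buffer dst with lda = 4, call omatcopy(3, 4, src, 1, 3, dst, 4, 1); element
+//  (i, j) is read from src[i + 3*j] and written to dst[4*i + j].)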
+ #define MAT_A(_i, _j) A[(_i)*irs_a + (_j)*ics_a] + #define MAT_B(_i, _j) B[(_i)*irs_b + (_j)*ics_b] + for (int64_t i = 0; i < m; ++i) { + for (int64_t j = 0; j < n; ++j) { + MAT_B(i,j) = MAT_A(i,j); + } + } + #undef MAT_A + #undef MAT_B + return; +} template void flip_layout(blas::Layout layout_in, int64_t m, int64_t n, std::vector &A, int64_t lda_in, int64_t lda_out) { @@ -207,18 +247,9 @@ void flip_layout(blas::Layout layout_in, int64_t m, int64_t n, std::vector &A std::vector A_in(A); T* A_buff_in = A_in.data(); T* A_buff_out = A.data(); - - #define A_IN(_i, _j) A_buff_in[(_i)*irs_in + (_j)*ics_in] - #define A_OUT(_i, _j) A_buff_out[(_i)*irs_out + (_j)*ics_out] - for (int64_t i = 0; i < m; ++i) { - for (int64_t j = 0; j < n; ++j) { - A_OUT(i,j) = A_IN(i,j); - } - } + omatcopy(m, n, A_buff_in, irs_in, ics_in, A_buff_out, irs_out, ics_out); A.erase(A.begin() + len_buff_A_out, A.end()); A.resize(len_buff_A_out); - #undef A_IN - #undef A_OUT return; } @@ -253,7 +284,7 @@ static inline TO uneg11_to_uneg01(TI in) { */ template RNGState sample_indices_iid( - int64_t n, TF* cdf, int64_t k, int64_t* samples, RandBLAS::RNGState state + int64_t n, TF* cdf, int64_t k, int64_t* samples, RNGState state ) { auto [ctr, key] = state; RNG gen; diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a6d36735..686a63ea 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -6,8 +6,8 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/CMake") set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED True) -set(CMAKE_CXX_FLAGS "-O3") -# ^ THAT'S SO IMPORTANT!!!! +set(CMAKE_CXX_FLAGS "-O1") +# ^ Doing something other than -O0 is important for good performance. # TODO: just set CMAKE_BUILD_TYPE to Release if it's not already given. # TODO: do a try-catch pattern for finding OpenMP. diff --git a/examples/sparse-low-rank-approx/svd_rank1_plus_noise.cc b/examples/sparse-low-rank-approx/svd_rank1_plus_noise.cc index b1b557ac..4bbf7580 100644 --- a/examples/sparse-low-rank-approx/svd_rank1_plus_noise.cc +++ b/examples/sparse-low-rank-approx/svd_rank1_plus_noise.cc @@ -74,7 +74,7 @@ void iid_sparsify_random_dense( ) { auto spar = new T[n_rows * n_cols]; auto dist = RandBLAS::DenseDist(n_rows, n_cols, RandBLAS::DenseDistName::Uniform); - auto [unused, next_state] = RandBLAS::fill_dense(dist, spar, state); + auto next_state = RandBLAS::fill_dense(dist, spar, state); auto temp = new T[n_rows * n_cols]; auto D_mat = RandBLAS::DenseDist(n_rows, n_cols, RandBLAS::DenseDistName::Uniform); @@ -85,7 +85,7 @@ void iid_sparsify_random_dense( #define MAT(_i, _j) mat[(_i) * stride_row + (_j) * stride_col] for (int64_t i = 0; i < n_rows; ++i) { for (int64_t j = 0; j < n_cols; ++j) { - T v = (SPAR(i, j) + 1.0) / 2.0; + T v = (SPAR(i, j) + std::sqrt(3)) / 2.0*std::sqrt(3); if (v < prob_of_zero) { MAT(i, j) = 0.0; } else { @@ -190,7 +190,7 @@ void make_noise_matrix(double noise_scale, int64_t m, int64_t n, double prob_of_ // NOTE: it would be more efficient to sample vec_nnz*vec_nnz elements without replacement from the index set // from 0 to m*n-1, then de-vectorize those indices (in either row-major or col-major interpretation) and // only sample the values of the nonzeros for these pre-determined structural nonzeros. The current implementation - // has to generate to dense m-by-n matrices whose entries are iid uniform [-1, 1]. + // has to generate to dense m-by-n matrices whose entries are iid uniform [-sqrt(3), sqrt(3)]. 
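+ // (A note from us: a sample v drawn uniformly from [-sqrt(3), sqrt(3)] maps back to the
+ //  unit interval via (v + sqrt(3)) / (2 * sqrt(3)).)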
// using T = typename SpMat::scalar_t; using sint_t = typename SpMat::index_t; diff --git a/rtd/source/api_reference/index.rst b/rtd/source/api_reference/index.rst index e25e20a8..f0a3da6d 100644 --- a/rtd/source/api_reference/index.rst +++ b/rtd/source/api_reference/index.rst @@ -16,4 +16,5 @@ API Reference Representing sparse data Computing a sketch: sparse data Sparse BLAS operations + Utilities for index sampling diff --git a/rtd/source/api_reference/index_sampling_utils.rst b/rtd/source/api_reference/index_sampling_utils.rst new file mode 100644 index 00000000..d690ac86 --- /dev/null +++ b/rtd/source/api_reference/index_sampling_utils.rst @@ -0,0 +1,13 @@ + +############################################################ +Utilities for coordinate and index-set sampling +############################################################ + + .. doxygenfunction:: RandBLAS::util::sample_indices_iid_uniform(int64_t n, int64_t k, int64_t* samples, RNGState state) + :project: RandBLAS + + .. doxygenfunction:: RandBLAS::util::sample_indices_iid(int64_t n, TF* cdf, int64_t k, int64_t* samples, RNGState state) + :project: RandBLAS + + .. doxygenfunction:: RandBLAS::util::weights_to_cdf(int64_t n, T* w, T error_if_below = -std::numeric_limits::epsilon()) + :project: RandBLAS diff --git a/rtd/source/api_reference/skops_and_dists.rst b/rtd/source/api_reference/skops_and_dists.rst index dd1ea2ea..910f7805 100644 --- a/rtd/source/api_reference/skops_and_dists.rst +++ b/rtd/source/api_reference/skops_and_dists.rst @@ -4,12 +4,36 @@ Distributions and sketching operators .. _rngstate_api: -The state of a random number generator -================================================ -.. doxygenstruct:: RandBLAS::RNGState - :project: RandBLAS - :members: +Sketching distributions +======================= + +.. doxygenconcept:: RandBLAS::SketchingDistribution + :project: RandBLAS + +.. doxygenfunction:: RandBLAS::isometry_scale_factor(SkDist D) + :project: RandBLAS + + +Sketching operators and random states +===================================== + +.. dropdown:: Sketching operators + :animate: fade-in-slide-down + :color: light + + .. doxygenconcept:: RandBLAS::SketchingOperator + :project: RandBLAS + + +.. dropdown:: The state of a random number generator + :animate: fade-in-slide-down + :color: light + + .. doxygenstruct:: RandBLAS::RNGState + :project: RandBLAS + :members: + .. _densedist_and_denseskop_api: @@ -28,7 +52,7 @@ DenseDist and DenseSkOp :project: RandBLAS :members: -.. doxygenfunction:: RandBLAS::fill_dense(DenseSkOp &S) +.. doxygenfunction:: RandBLAS::fill_dense(DenseSkOp &S) :project: RandBLAS @@ -45,13 +69,13 @@ SparseDist and SparseSkOp :project: RandBLAS :members: -.. doxygenfunction:: RandBLAS::fill_sparse(SparseSkOp &S) +.. doxygenfunction:: RandBLAS::fill_sparse(SparseSkOp &S) :project: RandBLAS Advanced material ================= -.. doxygenfunction:: RandBLAS::fill_dense(const DenseDist &D, int64_t n_rows, int64_t n_cols, int64_t S_ro, int64_t S_co, T *buff, const RNGState &seed) +.. doxygenfunction:: RandBLAS::fill_dense(blas::Layout layout, const DenseDist &D, int64_t n_rows, int64_t n_cols, int64_t S_ro, int64_t S_co, T *buff, const RNGState &seed) :project: RandBLAS diff --git a/rtd/source/conf.py b/rtd/source/conf.py index 32b467c3..4b35eeb1 100644 --- a/rtd/source/conf.py +++ b/rtd/source/conf.py @@ -21,8 +21,8 @@ # -- Project information ----------------------------------------------------- project = 'RandBLAS' -copyright = "2023, Riley J. Murray, Burlen Loring" -author = "Riley J. 
Murray, Burlen Loring" +copyright = "2024, Riley Murray, Burlen Loring" +author = "Riley Murray, Burlen Loring" # -- General configuration --------------------------------------------------- diff --git a/rtd/source/index.rst b/rtd/source/index.rst index e11921f9..0efd91d9 100644 --- a/rtd/source/index.rst +++ b/rtd/source/index.rst @@ -13,15 +13,18 @@ RandBLAS: sketching for randomized numerical linear algebra =========================================================== -RandBLAS is a C++ library for dimension reduction via random linear transformations. -These random linear transformations are called *sketching operators*. -The act of applying a sketching operator -- that is, the act of *sketching* -- is of fundamental importance to randomized numerical linear algebra. +RandBLAS is a C++ library for randomized linear dimension reduction -- an operation commonly known as *sketching*. +We built RandBLAS to make it easier to write, debug, and deploy high-performance implementations of sketching-based algorithms. RandBLAS is efficient, flexible, and reliable. It uses CPU-based OpenMP acceleration to apply its sketching operators to dense or sparse data matrices stored in main memory. -All sketches produced by RandBLAS are dense. -As such, dense data matrices can be sketched with dense or sparse operators, while sparse data matrices can only be sketched with dense operators. -RandBLAS can be used in distributed environments through its ability to (reproducibly) compute products with *submatrices* of sketching operators. +It includes dense and sparse sketching operators (e.g., Gaussian operators, CountSketch, OSNAPs, etc..), which can +be applied to dense or sparse data in any combination that leads to a dense sketch. + +With RandBLAS and an LAPACK-like library at your disposal, you can implement +a huge range of shared-memory randomized algorithms for matrix computations. +RandBLAS can be used in distributed environments through its ability to compute products with *submatrices* of sketching operators, +without ever realizing the entire sketching operator in memory. Learn more by reading our `tutorial `_ or our `API reference `_. If we've piqued your interest, try RandBLAS yourself! diff --git a/rtd/source/tutorial/index.rst b/rtd/source/tutorial/index.rst index 2b677809..aa0662fb 100644 --- a/rtd/source/tutorial/index.rst +++ b/rtd/source/tutorial/index.rst @@ -45,15 +45,12 @@ For example, it lets you set an integer-valued the seed when defining :math:`\te It even lets you compute products against *submatrices* of sketching operators without ever forming the full operator in memory. -.. note:: - This tutorial is very much incomplete! Bear with us for the time being. - - .. toctree:: :maxdepth: 4 Background on GEMM Defining a sketching distribution Sampling a sketching operator + Updating a sketch The meaning of "submat(・)" in RandBLAS documentation diff --git a/rtd/source/tutorial/sampling_skops.rst b/rtd/source/tutorial/sampling_skops.rst index aa0f814e..ee549bdb 100644 --- a/rtd/source/tutorial/sampling_skops.rst +++ b/rtd/source/tutorial/sampling_skops.rst @@ -50,7 +50,7 @@ However, the *recommended* constructors for these classes just accept two parame a representation of a distribution (i.e., a DenseDist or a SparseDist) and an RNGState. For example, the following code produces a :math:`10000 \times 50` dense sketching operator -whose entries are iid samples from the uniform distribution over :math:`[-1, 1]`. +whose entries are iid samples from the standard normal distribution. .. 
code:: c++ @@ -71,9 +71,9 @@ Formal API docs for the recommended constructors can be found :ref:`here 1` ============================================================================== -Suppose you have an application that requires two independent dense sketching operators, -:math:`\texttt{S1}` and :math:`\texttt{S2}`, each of size :math:`10000 \times 50`. -How should you get your hands on these objects? +Suppose you have an application that requires two statistically independent dense +sketching operators, :math:`\texttt{S1}` and :math:`\texttt{S2}`, each of size +:math:`10000 \times 50`. How should you get your hands on these objects? .. warning:: If you try to construct those sketching operators as follows ... @@ -88,33 +88,19 @@ How should you get your hands on these objects? *then your results would be invalid! Far from being independent,* :math:`\texttt{S1}` *and* :math:`\texttt{S2}` *would be equal from a mathematical perspective.* -One correct approach is to first -fill the entries of :math:`\texttt{S1}`, and then call -the constructor for :math:`\texttt{S2}` using :math:`\texttt{S1.next_state}` -as its RNGState argument: +One correct approach is to construct :math:`\texttt{S1}` and then call the constructor +for :math:`\texttt{S2}` using :math:`\texttt{S1.next_state}` as its RNGState argument: .. code:: c++ RandBLAS::RNGState my_state(); RandBLAS::DenseDist my_dist(10000, 50); RandBLAS::DenseSkOp S1(my_dist, my_state); - // ^ Defines S1 from a mathematical perspective, but performs no work. - // RandBLAS hasn't made any calls to a CBRNG. - RandBLAS::fill_dense(S1); - // ^ RandBLAS uses a CBRNG to sample each entry of S1, - // then sets S1.next_state. + // ^ Defines S1 from a mathematical perspective. Computes S1.next_state, + // but otherwise performs no work. RandBLAS::DenseSkOp S2(my_dist, S1.next_state); -The shortcoming of this approach is that it requires explicitly filling -:math:`\texttt{S1}` just to define :math:`\texttt{S2}` in a mathematical sense. -There are two alternative approaches you can use if this shortcoming is -significant in your application. - -*Alternative 1*. Define one larger sketching operator -(here, a :math:`10000\times 100` sketching operator) and operate with -appropriate submatrices of that larger operator when needed. - -*Alternative 2*. Declare two RNGState objects from the beginning using +Another valid approach is to declare two RNGState objects from the beginning using different keys, as in the following code: .. code:: c++ diff --git a/rtd/source/tutorial/updates.rst b/rtd/source/tutorial/updates.rst new file mode 100644 index 00000000..a553d475 --- /dev/null +++ b/rtd/source/tutorial/updates.rst @@ -0,0 +1,128 @@ +********************************************************************************************* +Updating sketches with dense sketching operators +********************************************************************************************* + + .. |denseskop| mathmacro:: \texttt{DenseSkOp} + .. |seedstate| mathmacro:: \texttt{seed_state} + .. |nextstate| mathmacro:: \texttt{next_state} + .. |mtx| mathmacro:: {} + + +RandBLAS makes it easy to *implicitly* extend an initial sketching +operator :math:`\mtx{S}_1` into an augmented operator :math:`\mtx{S} = [\mtx{S}_1; \mtx{S}_2]` or :math:`\mtx{S} = [\mtx{S}_1, \mtx{S}_2]`. +There are four scenarios in which +this can be done without generating :math:`\mtx{S}` from scratch.
+In all four scenarios, the idea is to +use the :math:`\nextstate` of :math:`\mtx{S}_1` as the +:math:`\seedstate` of :math:`\mtx{S}_2`. + +There are two reasons why you'd want to +extend a sketching operator; you might be trying to improve statistical +quality by increasing sketch size (Scenarios 1 and 4), or you might be +incorporating new data into a sketch of fixed size (Scenarios 2 and 3). +The unifying perspective on Scenarios 1 and 4 is that they both add +*long-axis vectors* to the sketching operator. +The unifying perspective on +Scenarios 2 and 3 is that they both add *short-axis vectors* to the +sketching operator. + +:math:`\texttt{DenseDist}` objects have a :math:`\texttt{major_axis}` member, which states +whether operators sampled from that distribution are short-axis or +long-axis major. So when you specify the major axis for a sketching +operator, you're basically saying whether you want to keep open the possibility of +improving the statistical quality of a sketch or updating a sketch to +incorporate more data. + + +Scenario 1 +========== + + Suppose :math:`\mtx{S}_1` is a *wide* :math:`d_1 \times m` row-wise + :math:`\denseskop` with seed :math:`c`. It's easy to generate a + :math:`d_2\times m` row-wise :math:`\denseskop` :math:`\mtx{S}_2` in such a way that + :math:`\mtx{S} = [\mtx{S}_1; \mtx{S}_2]` is the same as the :math:`(d_1 + d_2) \times m` row-wise + :math:`\denseskop` with seed :math:`c`. + +This scenario arises if we have a fixed data matrix :math:`\mtx{A}`, an initial +sketch :math:`\mtx{B}_1 = \mtx{S}_1 \mtx{A}`, and we decide we want a larger sketch for +statistical reasons. The updated sketch :math:`\mtx{B} = \mtx{S} \mtx{A}` can be expressed as + +.. math:: + + \mtx{B} = \begin{bmatrix} \mtx{S}_1 \mtx{A} \\ \mtx{S}_2 \mtx{A} \end{bmatrix}. + +If :math:`(\mtx{S}_1, \mtx{S}_2, \mtx{S})` satisfy the assumptions above, then the final sketch +:math:`\mtx{B} = \mtx{S}\mtx{A}` will be the same regardless of whether we computed the sketch +in one step or two steps. This is desirable for benchmarking and +debugging RandNLA algorithms where the sketch size is a tuning parameter. + +Scenario 2 +========== + + Suppose :math:`\mtx{S}_1` is a *wide* :math:`d \times m_1` column-wise + :math:`\denseskop` with seed :math:`c`. It's easy to generate a + :math:`d \times m_2` column-wise :math:`\denseskop` :math:`\mtx{S}_2` so that + :math:`\mtx{S} = [\mtx{S}_1, \mtx{S}_2]` is the same as the :math:`d \times (m_1 + m_2)` column-wise + :math:`\denseskop` with seed :math:`c`. + +This situation arises if we have an initial data matrix :math:`\mtx{A}_1`, an +initial sketch :math:`\mtx{B}_1 = \mtx{S}_1 \mtx{A}_1`, and then a new matrix :math:`\mtx{A}_2` arrives in +such a way that we want a sketch of :math:`A = [\mtx{A}_1; \mtx{A}_2]`. To compute :math:`\mtx{B} = \mtx{S}\mtx{A}`, +we update :math:`\mtx{B}_1` according to the formula + +.. math:: + + \mtx{B} = \begin{bmatrix} \mtx{S}_1 & \mtx{S}_2 \end{bmatrix} \begin{bmatrix} \mtx{A}_1 \\ \mtx{A}_2 \end{bmatrix} = \mtx{B}_1 + \mtx{S}_2 \mtx{A}_2. + +If :math:`(\mtx{S}_1, \mtx{S}_2, \mtx{S})` satisfy the assumptions above, then :math:`\mtx{B}` will be the +same as though we started with all of :math:`\mtx{A}` from the very beginning. This +is useful for benchmarking and debugging RandNLA algorithms that involve +operating on data matrices that increase in size over some number of iterations. + +.. 
_scenario-3: + +Scenario 3 +========== + + Let :math:`\mtx{S}_1` be a *tall* :math:`n \times d_1` column-wise :math:`\denseskop` + with seed :math:`c`. We can easily generate an :math:`n\times d_2` column-wise + :math:`\denseskop` :math:`\mtx{S}_2` so that :math:`\mtx{S} = [\mtx{S}_1, \mtx{S}_2]` is the same + as the :math:`n \times (d_1 + d_2)` column-wise :math:`\denseskop` with seed :math:`c`. + +This arises if we have a fixed data matrix :math:`\mtx{A}`, an initial sketch :math:`\mtx{B}_1 = \mtx{A} \mtx{S}_1`, +and we decide we want a larger sketch for statistical reasons. The +updated sketch :math:`\mtx{B} = \mtx{A}\mtx{S}` can be expressed as + +.. math:: + + \mtx{B} = \begin{bmatrix} \mtx{A} \mtx{S}_1 & \mtx{A} \mtx{S}_2 \end{bmatrix}. + +If :math:`(\mtx{S}_1, \mtx{S}_2, \mtx{S})` satisfy the assumptions above, then the final sketch +:math:`\mtx{B}` will be the same regardless of whether we computed the sketch in one +step or two steps. This is desirable for benchmarking and debugging +RandNLA algorithms where the sketch size is a tuning parameter. + +.. _scenario-4: + +Scenario 4 +========== + + Suppose :math:`\mtx{S}_1` is a *tall* :math:`n_1 \times d` row-wise + :math:`\denseskop` with seed :math:`c`. It's easy to generate an :math:`n_2\times d` + row-wise :math:`\denseskop` :math:`\mtx{S}_2` in such a way that + :math:`\mtx{S} = [\mtx{S}_1; \mtx{S}_2]` is the same as the :math:`(n_1 + n_2) \times d` row-wise + :math:`\denseskop` with seed :math:`c`. + +This situation arises if we have an initial data matrix :math:`\mtx{A}_1`, an initial sketch +:math:`\mtx{B}_1 = \mtx{A}_1 \mtx{S}_1`, and then a new matrix :math:`\mtx{A}_2` arrives in such a way that we +want a sketch of :math:`\mtx{A} = [\mtx{A}_1, \mtx{A}_2]`. To compute :math:`\mtx{B} = \mtx{A}\mtx{S}`, we update :math:`\mtx{B}_1` +according to the formula + +.. math:: + + \mtx{B} = \begin{bmatrix} \mtx{A}_1 & \mtx{A}_2 \end{bmatrix} \begin{bmatrix} \mtx{S}_1 \\ \mtx{S}_2 \end{bmatrix} = \mtx{B}_1 + \mtx{A}_2 \mtx{S}_2. + +If :math:`(\mtx{S}_1, \mtx{S}_2, \mtx{S})` satisfy the assumptions above, then :math:`\mtx{B}` will be the same as though +we started with all of :math:`\mtx{A}` from the very beginning. This is useful for benchmarking and +debugging RandNLA algorithms that involve operating on data matrices that increase in size over +some number of iterations.
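To make the update pattern concrete, here is a minimal sketch of Scenario 2, assuming only the constructors and next_state semantics described in this tutorial. The dimensions and the key are placeholder values, the distribution's major_axis must be chosen as discussed above (column-wise for a wide operator here), and the calls that actually apply the operators are left as comments, since the choice of sketching routine (covered in the GEMM portion of the tutorial) is independent of the seeding logic shown below.

.. code:: c++

    #include "RandBLAS.hh"

    // Scenario 2: A1 is m1-by-n, and new data A2 (m2-by-n) arrives later.
    // The values below are placeholders.
    int64_t d = 50, m1 = 10000, m2 = 2000;

    RandBLAS::RNGState state(0);    // seed with key 0
    RandBLAS::DenseDist D1(d, m1);  // wide; set major_axis so the operator is column-wise
    RandBLAS::DenseSkOp<double> S1(D1, state);
    // ^ Defines S1 and computes S1.next_state, but samples no entries yet.
    // ... compute B1 = S1 * A1 with your preferred sketching routine ...

    RandBLAS::DenseDist D2(d, m2);  // same d and major_axis; new long dimension
    RandBLAS::DenseSkOp<double> S2(D2, S1.next_state);
    // ... update B = B1 + S2 * A2; the result matches sketching [A1; A2]
    //     with the d-by-(m1 + m2) column-wise operator seeded by `state` ...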
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c1c39ed7..e48cd660 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -12,11 +12,9 @@ if (GTest_FOUND) ##################################################################### add_executable(RandBLAS_tests - comparison.hh test_datastructures/test_denseskop.cc test_datastructures/test_sparseskop.cc - test_matmul_cores/linop_common.hh test_matmul_cores/test_lskge3.cc test_matmul_cores/test_rskge3.cc test_matmul_cores/test_lskges.cc @@ -35,15 +33,11 @@ if (GTest_FOUND) ##################################################################### add_executable(SparseRandBLAS_tests - comparison.hh - - test_datastructures/test_spmats/common.hh test_datastructures/test_spmats/test_csc.cc test_datastructures/test_spmats/test_csr.cc test_datastructures/test_spmats/test_coo.cc test_datastructures/test_spmats/test_conversions.cc - test_matmul_cores/test_spmm/spmm_test_helpers.hh test_matmul_cores/test_spmm/test_spmm_csc.cc test_matmul_cores/test_spmm/test_spmm_csr.cc test_matmul_cores/test_spmm/test_spmm_coo.cc @@ -58,14 +52,28 @@ if (GTest_FOUND) ##################################################################### add_executable(RandBLAS_stats - test_basic_rng/rng_common.hh - test_basic_rng/test_sample_indices.cc - test_basic_rng/test_r123_kat.cc + test_basic_rng/test_r123.cc + test_basic_rng/test_discrete.cc + test_basic_rng/test_continuous.cc + test_basic_rng/test_distortion.cc ) target_link_libraries(RandBLAS_stats RandBLAS GTest::GTest GTest::Main) gtest_discover_tests(RandBLAS_stats) - file(COPY test_basic_rng/kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/../test/) - file(COPY test_basic_rng/kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/) + file(COPY test_basic_rng/r123_kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + file(COPY test_basic_rng/r123_kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/..) + file(COPY test_basic_rng/r123_kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/../test/) + + ##################################################################### + # + # Meta tests (test our testing infrastructure) + # + ##################################################################### + + add_executable(MetaRandBLAS_tests + test_handrolled_lapack.cc + ) + target_link_libraries(MetaRandBLAS_tests RandBLAS GTest::GTest GTest::Main) + gtest_discover_tests(MetaRandBLAS_tests) endif() message(STATUS "Checking for regression tests ... ${tmp}") diff --git a/test/DevNotes.md b/test/DevNotes.md index 6a2a1753..5e5b1dd2 100644 --- a/test/DevNotes.md +++ b/test/DevNotes.md @@ -4,7 +4,7 @@ This document doesn't don't defend previous design decisions. It just explains how things work right now. That's easier for me (Riley) to write, and it's more useful to others. -(Plus, a good description of what we do will make pros and cons of the current approach self-evident.) +(Plus, it helps make the pros and cons of the current approach self-evident.) None of our testing infrastructure is considered part of the public API. @@ -28,10 +28,19 @@ None of our testing infrastructure is considered part of the public API. I suspect that the tests for rskgex and right_spmm hit code paths that are currently untested, but I haven't actually verified this. -### next_folder ... +### test_basic_rng -## Next topic .... + * test_r123.cc has deterministic tests for Random123. The tests comapre generated values + to reference values computed ahead of time. 
The tests are __extremely__ messy, since they're + adapted from tests in the official Random123 repository, and Random123 needs to handle a far wider + range of compilers and languages than we assume for RandBLAS. + * test_discrete.cc includes statistical tests for sampling from an index set with or without + replacement. + + * rng_common.hh includes data for statistical tables (e.g., for Kolmogorov-Smirnov tests) and helper + functions to compute quantities associated with certain probability distributions (e.g., mean + and variance of the hypergeometric distribution). # OLD @@ -65,7 +74,7 @@ Specifics: easy enough to have it reduce directly to GEMM, and reducing directly to GEMM had the advantage of improved readibility. We don't test all possible combinations of flags (we omit when both arguments - are transposed) but the combinationed we leave untested are unrelated + are transposed) but the combination we leave untested are unrelated to flow-of-control. RSKGES reduces to right_spmm, which does indeed fall back on diff --git a/test/comparison.hh b/test/comparison.hh index 15efd9f1..a02cf4a9 100644 --- a/test/comparison.hh +++ b/test/comparison.hh @@ -81,6 +81,17 @@ bool approx_equal(T A, T B, std::ostream &str, return false; } +template +void approx_equal(T A, T B, const char* testName, const char* fileName, int lineNo, T atol, T rtol) { + std::ostringstream oss; + if (!approx_equal(A, B, oss, atol, rtol)) { + FAIL() << std::endl << fileName << ":" << lineNo << std::endl + << testName << std::endl << "Test failed. " << oss.str() << std::endl; + oss.str(""); + } + return; +} + /** Test two arrays are approximately equal elementwise. diff --git a/test/handrolled_lapack.hh b/test/handrolled_lapack.hh new file mode 100644 index 00000000..9b2fbd7e --- /dev/null +++ b/test/handrolled_lapack.hh @@ -0,0 +1,307 @@ + +#pragma once + +#include "RandBLAS/util.hh" +#include "RandBLAS/dense_skops.hh" + +#include +#include +#include +#include + +namespace hr_lapack { + +using RandBLAS::RNGState; + +template +int potrf_upper_sequential(int64_t n, T* A, int64_t lda) { + // Cache access is much better if the matrix is lower triangular. + // Could implement as lower triangular and then call transpose_square. + for (int64_t j = 0; j < n; ++j) { + if (A[j + j * lda] <= 0) { + std::cout << "Cholesky failed at index " << (j+1) << " of " << n << "."; + return j+1; + } + A[j + j * lda] = std::sqrt(A[j + j * lda]); + for (int64_t i = j + 1; i < n; ++i) { + A[j + i * lda] /= A[j + j * lda]; + } + for (int64_t k = j + 1; k < n; ++k) { + for (int64_t i = k; i < n; ++i) { + A[k + i * lda] -= A[j + i * lda] * A[j + k * lda]; + } + } + } + return 0; +} + +template +int potrf_upper(int64_t n, T* A, int64_t lda, int64_t b = 64) { + randblas_require(b > 0); + auto layout = blas::Layout::ColMajor; + auto uplo = blas::Uplo::Upper; + int64_t curr_b = std::min(b, n); + // A = [A11, A12] + // [* , A22] + int code = potrf_upper_sequential(curr_b, A, lda); + if (code != 0) { + std::cout << "Matrix indefinite. 
Returning early from potrf_upper."; + return code; + } + // A = [R11, A12] + // [* , A22] + if (curr_b < n) { + T* R11 = A; // shape (curr_b, curr_b ) + T* A12 = R11 + curr_b*lda; // shape (curr_b, n - curr_b) + T* A22 = A12 + curr_b; // shape (n - curr_b, n - curr_b) + blas::trsm( + layout, blas::Side::Left, uplo, blas::Op::Trans, blas::Diag::NonUnit, + curr_b, n - curr_b, (T) 1.0, R11, lda, A12, lda + ); + blas::syrk(layout, uplo, blas::Op::Trans, + n - curr_b, curr_b, (T) -1.0, A12, lda, (T) 1.0, A22, lda + ); + potrf_upper(n-curr_b, A22, lda, b); + } + return 0; +} + +// If twice=true, then R will also be used as workspace, and must have length at least 2*n*n. +template +void chol_qr(int64_t m, int64_t n, T* A, T* R, int64_t chol_block_size = 32, bool twice = false) { + int64_t lda = m; + auto layout = blas::Layout::ColMajor; + auto uplo = blas::Uplo::Upper; + std::fill(R, R + n*n, (T) 0.0); + blas::syrk(layout, uplo, blas::Op::Trans, n, m, (T) 1.0, A, lda, (T) 0.0, R, n); + potrf_upper(n, R, n, chol_block_size); + blas::trsm(layout, blas::Side::Right, uplo, blas::Op::NoTrans, blas::Diag::NonUnit, m, n, (T) 1.0, R, n, A, lda); + if (twice) { + T* R2 = R + n*n; + chol_qr(m, n, A, R2, chol_block_size, false); + RandBLAS::util::overwrite_triangle(layout, blas::Uplo::Lower, n, 1, (T) 0.0, R, n); + // now overwrite R = R2 R with TRMM (saying R2 is the triangular matrix) + blas::trmm(layout, blas::Side::Left, uplo, blas::Op::NoTrans, blas::Diag::NonUnit, n, n, (T) 1.0, R2, n, R, n); + } + return; +} + +// work must be length at least max(n, 2*b) * b +template +void qr_block_cgs(int64_t m, int64_t n, T* A, T* R, int64_t ldr, T* work, int64_t b) { + if (n > m) + throw std::runtime_error("Invalid dimensions."); + randblas_require(ldr >= n); + + b = std::min(b, n); + auto layout = blas::Layout::ColMajor; + using blas::Op; + chol_qr(m, b, A, work, b, true); + T one = (T) 1.0; + T zero = (T) 0.0; + T* R1 = work; + for (int64_t j = 0; j < b; ++j) + blas::copy(b, R1 + b*j, 1, R + ldr*j, 1); + + if (b < n) { + int64_t n_trail = n - b; + T* A1 = A; // A[:, :b] + T* A2 = A + m * b; // A[:, b:] + T* R2 = R + ldr * b; // R[:b, b:] + // Compute A1tA2 := A1' * A2 and then update A2 -= A1 * A1tA2 + T* A1tA2 = work; + blas::gemm(layout, Op::Trans, Op::NoTrans, b, n_trail, m, one, A1, m, A2, m, zero, A1tA2, b); + blas::gemm(layout, Op::NoTrans, Op::NoTrans, m, n_trail, b, -one, A1, m, A1tA2, b, one, A2, m); + // Copy A1tA2 to the appropriate place in R + for (int64_t j = 0; j < n_trail; ++j) { + blas::copy(b, A1tA2 + j*b, 1, R2 + j*ldr, 1); + } + qr_block_cgs(m, n_trail, A2, R2 + b, ldr, work, b); + } + return; +} + +// We'll resize bigwork to be length at least (n*n + max(n, 2*b) * b). 
+template +void qr_block_cgs2(int64_t m, int64_t n, T* A, T* R, std::vector &bigwork, int64_t b = 64) { + b = std::min(b, n); + int64_t littlework_size = std::max(n, 2*b) * b; + int64_t bigwork_size = n*n + littlework_size; + if ((int64_t)bigwork.size() < bigwork_size) { + bigwork.resize(bigwork_size); + } + T* R2 = bigwork.data(); + T* littlework = R2 + n*n; + std::fill(R, R + n * n, (T) 0.0); + qr_block_cgs(m, n, A, R, n, littlework, b); + RandBLAS::util::overwrite_triangle(blas::Layout::ColMajor, blas::Uplo::Lower, n, 1, (T) 0.0, R, n); + qr_block_cgs(m, n, A, R2, n, littlework, b); + blas::trmm( + blas::Layout::ColMajor, blas::Side::Left, blas::Uplo::Upper, blas::Op::NoTrans, blas::Diag::NonUnit, + n, n, 1.0, R2, n, R, n + ); + return; +} + +template +bool extremal_eigvals_converged_gershgorin(int64_t n, T* G, T tol) { + int64_t i_lb = 0; + int64_t i_ub = 0; + T upper = -std::numeric_limits::infinity(); + T lower = std::numeric_limits::infinity(); + for (int64_t i = 0; i < n; ++i) { + T radius = 0.0; + T center = G[i + i*n]; + for (int64_t j = 0; j < n; ++j) { + if (i != j) { + radius += std::abs(G[i * n + j]); + } + } + if (center + radius >= upper) { + i_ub = i; + upper = center + radius; + } + if (center - radius <= lower) { + i_lb = i; + lower = center - radius; + } + } + T lower_center = G[i_lb + i_lb*n]; + T upper_center = G[i_ub + i_ub*n]; + T lower_radius = std::abs(lower - lower_center); + T upper_radius = std::abs(upper - upper_center); + bool lower_converged = lower_radius <= tol * lower_center; + bool upper_converged = upper_radius <= tol * upper_center; + bool converged = lower_converged && upper_converged; + return converged; +} + +/** + * Use Cholesky iteration to compute all eigenvalues of a positive definite matrix A. + * Run for at most "max_iters" iteration. + * + * Use the Gershgorin circle theorem as a stopping criteria. + * + */ +template +int64_t posdef_eig_chol_iteration(int64_t n, T* A, T* eigvals, T reltol, int64_t max_iters, int64_t b = 8) { + b = std::min(b, n); + std::vector work(n*n); + T* G = work.data(); + using blas::Op; + using blas::Layout; + using blas::Uplo; + int64_t iter = 0; + bool converged = false; + RandBLAS::RNGState state(1234567); + std::vector pivots(n, 0); + for (; iter < max_iters; ++iter) { + potrf_upper(n, A, n, b); + RandBLAS::util::overwrite_triangle(Layout::ColMajor, Uplo::Lower, n, 1, (T) 0.0, A, n); + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::NoTrans, n, n, (T) 1.0, A, n, (T) 0.0, G, n); + RandBLAS::util::symmetrize(Layout::ColMajor, Uplo::Upper, n, G, n); + for (int64_t i = 0; i < n; ++i) + eigvals[i] = G[i * n + i]; + converged = extremal_eigvals_converged_gershgorin(n, G, reltol); + if (converged) + break; + blas::copy(n * n, G, 1, A, 1); + } + return (converged) ? iter : -iter; +} + +template +inline int64_t required_powermethod_iters(int64_t n, T p_fail, T tol) { + T pi = 4*std::atan(1.0); + int64_t expectation_bound = (int64_t) std::ceil(( 1.0 + std::log(std::sqrt(pi * (T)n)) )/ tol ); + + T temp0 = 1 - tol; + T temp1 = std::log(1 / temp0); + T temp2 = tol * p_fail * p_fail; + int64_t probability_bound_1 = (int64_t) std::log(std::exp(1.) 
+ (T)0.27 * temp0 * temp1 / temp2) / temp1; + int64_t probability_bound_2 = (int64_t) std::log(std::sqrt(n) / p_fail) / temp1; + int64_t probability_bound = std::min(probability_bound_1, probability_bound_2); + + // std::cout << "(n, p, eps) = " << n << ", " << p_fail << ", " << tol << std::endl; + // std::cout << "Power iters bound for expectation : " << expectation_bound << std::endl; + // std::cout << "Power iters bound for probability : " << probability_bound << std::endl; + int64_t num_iters = std::max(expectation_bound, probability_bound); + return num_iters; +} + +template +std::pair> power_method(int64_t n, FUNC &A, T* v, T tol, T failure_prob, const RNGState &state) { + auto next_state = RandBLAS::fill_dense(blas::Layout::ColMajor, {n, 1}, n, 1, 0, 0, v, state); + std::vector work(n, 0.0); + T* u = work.data(); + T norm = blas::nrm2(n, v, 1); + blas::scal(n, (T)1.0/norm, v, 1); + T lambda = 0.0; + // + int64_t num_iters = required_powermethod_iters(n, failure_prob, tol); + for (int64_t iter = 0; iter < num_iters; ++iter) { + A(v, u); + lambda = blas::dot(n, v, 1, u, 1); + blas::copy(n, u, 1, v, 1); + norm = blas::nrm2(n, v, 1); + blas::scal(n, (T)1.0/norm, v, 1); + } + return {lambda, next_state}; +} + +// Note that if we're only interested in subspace embedding distortion then it would suffice to just bound +// the eigenvalue of A-I with largest absolute value (which might be negative). If we went with that approach +// then we could make do with one run of a power method instead of running the power method on A and inv(A). +// +// The convergence results I know for the power method that don't require a spectral gap are specifically +// for PSD matrices. Now, we could just run the power method implicitly on the PSD matrix (A - I)^2. +// This require the same number of matrix-vector multiplications, but it remove the need for ever +// accessing inv(A) as a linear operator (which we do right now by decomposing A and forming invA explicitly, +// so we can get away with GEMV). That's useful if A is a fast operator (whether or not that's the case +// might be delicate since it's a Gram matrix of a sketch S*U). +// +template +std::tuple> exeigs_powermethod(int64_t n, const T* A, T* eigvecs, T tol, T failure_prob, const RNGState &state, std::vector work) { + auto layout = blas::Layout::ColMajor; + RandBLAS::util::require_symmetric(layout, A, n, n, (T) 0.0); + + // Compute the dominant eigenpair. Nothing fancy here. + auto A_func = [layout, A, n](const T* x, T* y) { + blas::gemv(layout, blas::Op::NoTrans, n, n, (T) 1.0, A, n, x, 1, (T) 0.0, y, 1); + }; + auto [lambda_max, next_state] = power_method(n, A_func, eigvecs, tol, failure_prob, state); + + // To compute the smallest eigenpair we'll explicitly invert A. This requires + // 2n^2 workspace: n^2 workspace for Cholesky of A (since we don't want to destroy A) + // and another n^2 workspace for TRSMs with the Cholesky factor to get invA. + // + // Note: we *could* use less workspace if we were willing to access invA as a linear + // operator using two calls to TRSV when needed. But this ends up being much slower + // than explicit inversion for the values of (n, tol) that we care about. 
+ // + if ((int64_t) work.size() < 2*n*n) + work.resize(2*n*n); + T* chol = work.data(); + blas::copy(n*n, A, 1, chol, 1); + potrf_upper(n, chol, n); + T* invA = chol + n*n; + std::fill(invA, invA + n*n, 0.0); + for (int i = 0; i < n; ++i) + invA[i + i*n] = 1.0; + auto uplo = blas::Uplo::Upper; + auto diag = blas::Diag::NonUnit; + blas::trsm(layout, blas::Side::Left, uplo, blas::Op::Trans, diag, n, n, (T) 1.0, chol, n, invA, n); + blas::trsm(layout, blas::Side::Left, uplo, blas::Op::NoTrans, diag, n, n, (T) 1.0, chol, n, invA, n); + + // Now that we have invA explicitly, getting its dominant eigenpair is effortless. + auto invA_func = [layout, invA, n](const T* x, T* y) { + blas::gemv(layout, blas::Op::NoTrans, n, n, (T) 1.0, invA, n, x, 1, (T) 0.0, y, 1); + return; + }; + auto [lambda_min, final_state] = power_method(n, invA_func, eigvecs + n, tol, failure_prob, next_state); + lambda_min = 1.0/lambda_min; + + return {lambda_max, lambda_min, final_state}; +} + +} diff --git a/test/test_basic_rng/kat_vectors.txt b/test/test_basic_rng/r123_kat_vectors.txt similarity index 97% rename from test/test_basic_rng/kat_vectors.txt rename to test/test_basic_rng/r123_kat_vectors.txt index 21c453dc..9398f32c 100644 --- a/test/test_basic_rng/kat_vectors.txt +++ b/test/test_basic_rng/r123_kat_vectors.txt @@ -68,7 +68,7 @@ threefry4x64 13 0000000000000000 0000000000000000 0000000000000000 0000000000000 threefry4x64 13 ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff 7eaed935479722b5 90994358c429f31c 496381083e07a75b 627ed0d746821121 threefry4x64 13 243f6a8885a308d3 13198a2e03707344 a4093822299f31d0 082efa98ec4e6c89 452821e638d01377 be5466cf34e90c6c c0ac29b7c97c50dd 3f84d5b5b5470917 4361288ef9c1900c 8717291521782833 0d19db18c20cf47e a0b41d63ac8581e5 threefry4x64 20 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 09218ebde6c85537 55941f5266d86105 4bd25e16282434dc ee29ec846bd2e40b - threefry4x64 20 ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff 29c24097942bba1b 0371bbfb0f6f4e11 3c231ffa33f83a1c cd29113fde32d168 +threefry4x64 20 ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff 29c24097942bba1b 0371bbfb0f6f4e11 3c231ffa33f83a1c cd29113fde32d168 threefry4x64 20 243f6a8885a308d3 13198a2e03707344 a4093822299f31d0 082efa98ec4e6c89 452821e638d01377 be5466cf34e90c6c be5466cf34e90c6c c0ac29b7c97c50dd a7e8fde591651bd9 baafd0c30138319b 84a5c1a729e685b9 901d406ccebc1ba4 threefry4x64 72 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 94eeea8b1f2ada84 adf103313eae6670 952419a1f4b16d53 d83f13e63c9f6b11 threefry4x64 72 ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff 11518c034bc1ff4c 193f10b8bcdcc9f7 d024229cb58f20d8 563ed6e48e05183f diff --git a/test/test_basic_rng/r123_rngNxW.mm b/test/test_basic_rng/r123_rngNxW.mm index 5062522d..9112b11d 100644 --- a/test/test_basic_rng/r123_rngNxW.mm +++ b/test/test_basic_rng/r123_rngNxW.mm @@ -51,7 +51,3 @@ RNGNxW_TPL(threefry, 2, 64) RNGNxW_TPL(threefry, 4, 64) #endif -#if R123_USE_AES_NI -RNGNxW_TPL(ars, 4, 32) -RNGNxW_TPL(aesni, 4, 32) -#endif diff --git 
a/test/test_basic_rng/rng_common.hh b/test/test_basic_rng/rng_common.hh index b931af6c..736d8734 100644 --- a/test/test_basic_rng/rng_common.hh +++ b/test/test_basic_rng/rng_common.hh @@ -1,3 +1,32 @@ +// Copyright, 2024. See LICENSE for copyright holder information. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// (1) Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// (2) Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// (3) Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// + #pragma once #include "RandBLAS.hh" @@ -15,24 +44,24 @@ namespace KolmogorovSmirnovConstants { /*** From scipy.stats: critical_value = kstwo.ppf(1-significance, sample_size) */ -const int SMALLEST_SAMPLE = 8; -const int LARGEST_SAMPLE = 16777216; +inline const int SMALLEST_SAMPLE = 8; +inline const int LARGEST_SAMPLE = 16777216; -const std::vector sample_sizes { +inline const std::vector sample_sizes { 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216 }; -const double WEAKEST_SIGNIFICANCE = 0.05; -const double STRONGEST_SIGNIFICANCE = 1e-6; +inline const double WEAKEST_SIGNIFICANCE = 0.05; +inline const double STRONGEST_SIGNIFICANCE = 1e-6; -const std::vector significance_levels { +inline const std::vector significance_levels { 0.05, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6 }; -const std::array, 6> critical_values {{{ +inline const std::array, 6> critical_values {{{ // significance of 0.05 4.54266591e-01, 3.27333470e-01, 2.34240860e-01, 1.66933746e-01, 1.18658276e-01, 8.42018587e-02, 5.96844982e-02, 4.22742678e-02, @@ -89,7 +118,7 @@ const std::array, 6> critical_values {{{ * The correctness of this function depends on significance_levels being sorted * in decreasing order (which corresponds to weakest to strongest significances). */ -int significance_rep(double sig) { +inline int significance_rep(double sig) { randblas_require(STRONGEST_SIGNIFICANCE <= sig && sig <= WEAKEST_SIGNIFICANCE); int num_siglevels = (int) significance_levels.size(); for (int i = 0; i < num_siglevels; ++i) { @@ -107,11 +136,11 @@ int significance_rep(double sig) { * The correctness of this function depends on sample_sizes being sorted in * increasing order. 
*/ -int sample_size_rep(int n) { +inline int sample_size_rep(int n) { randblas_require(SMALLEST_SAMPLE <= n && n <= LARGEST_SAMPLE); int num_sample_sizes = (int) sample_sizes.size(); for (int i = 0; i < num_sample_sizes; ++i) { - if (sample_sizes[i] <= n) + if (sample_sizes[i] >= n) return i; } // This code shouldn't be reachable! @@ -119,7 +148,7 @@ int sample_size_rep(int n) { return -1; } -std::tuple critical_value_rep(int n, double sig) { +inline std::tuple critical_value_rep(int n, double sig) { int i = significance_rep(sig); auto override_sig = significance_levels[i]; int j = sample_size_rep(n); @@ -140,11 +169,25 @@ double critical_value_rep_mutator(TI &n, double &sig) { } +// Function to check the KS-Stat against crit values +template +std::pair ks_check_critval(const std::vector &cdf1, const std::vector &cdf2, double critical_value) { + assert(cdf1.size() == cdf2.size()); // Vectors must be of same size to perform test + + for (size_t i = 0; i < cdf1.size(); ++i) { + double diff = std::abs(cdf1[i] - cdf2[i]); + if (diff > critical_value) { + return {i, diff}; // the test failed. + } + } + return {-1, 0.0}; // interpret a negative return value as the test passing. +} + // // MARK: combinatorics // -double log_binomial_coefficient(int64_t n, int64_t k) { +inline double log_binomial_coefficient(int64_t n, int64_t k) { double result = 0.0; for (int64_t i = 1; i <= k; ++i) { result += std::log(static_cast(n - i + 1)) - std::log(static_cast(i)); @@ -165,8 +208,7 @@ double log_binomial_coefficient(int64_t n, int64_t k) { * This function returns the probability that the sample of D items will contain observed_k elements * from the distinguished set. */ -double hypergeometric_pmf(int64_t N, int64_t K, int64_t D, int64_t observed_k) -{ +inline double hypergeometric_pmf(int64_t N, int64_t K, int64_t D, int64_t observed_k) { randblas_require(0 <= K && K <= N); randblas_require(0 <= D && D <= N); randblas_require(0 <= observed_k && observed_k <= K); @@ -187,8 +229,8 @@ double hypergeometric_pmf(int64_t N, int64_t K, int64_t D, int64_t observed_k) return out; } -std::vector hypergeometric_pmf_arr(int64_t N, int64_t K, int64_t D) -{ +template +std::vector hypergeometric_pmf_arr(int64_t N, int64_t K, int64_t D) { randblas_require(0 <= K && K <= N); randblas_require(0 <= D && D <= N); @@ -199,14 +241,14 @@ std::vector hypergeometric_pmf_arr(int64_t N, int64_t K, int64_t D) return pmf; } -double hypergeometric_mean(int64_t N, int64_t K, int64_t D) { +inline double hypergeometric_mean(int64_t N, int64_t K, int64_t D) { double dN = (double) N; double dK = (double) K; double dD = (double) D; return dD * dK / dN; } -double hypergeometric_variance(int64_t N, int64_t K, int64_t D) { +inline double hypergeometric_variance(int64_t N, int64_t K, int64_t D) { double dN = (double) N; double dK = (double) K; double dD = (double) D; @@ -217,22 +259,21 @@ double hypergeometric_variance(int64_t N, int64_t K, int64_t D) { return dD * t1 * t2 * t3; } -// -// MARK Kolmogorov-Smirnov Calculations -// +// MARK: continuous distributions -// Function to check the KS-Stat against crit values -std::pair ks_check_critval(const std::vector &cdf1, const std::vector &cdf2, double critical_value) -{ - assert(cdf1.size() == cdf2.size()); // Vectors must be of same size to perform test +template +inline T standard_normal_cdf(T x) { + double dx = (double) x; + return (T) std::erfc(-dx / std::sqrt(2)) / 2; +} - for (size_t i = 0; i < cdf1.size(); ++i) { - double diff = std::abs(cdf1[i] - cdf2[i]); - if (diff > critical_value) { - 
return {i, diff}; // the test failed. - } - } - return {-1, 0.0}; // interpret a negative return value as the test passing. +template +inline T uniform_syminterval_cdf(T x, T radius) { + if (x <= -radius) + return 0; + if (x >= radius) + return 1; + return (x + radius) / (2*radius); } -} // end namespace RandBLAS_StatTests \ No newline at end of file +} // end namespace RandBLAS_StatTests diff --git a/test/test_basic_rng/test_continuous.cc b/test/test_basic_rng/test_continuous.cc new file mode 100644 index 00000000..03b5d87d --- /dev/null +++ b/test/test_basic_rng/test_continuous.cc @@ -0,0 +1,166 @@ +// Copyright, 2024. See LICENSE for copyright holder information. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// (1) Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// (2) Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// (3) Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// + +#include "RandBLAS/config.h" +#include "RandBLAS/base.hh" +#include "RandBLAS/util.hh" +#include "RandBLAS/dense_skops.hh" +using RandBLAS::RNGState; +using RandBLAS::DenseDistName; +#include "rng_common.hh" + +#include +#include +#include +#include +#include +#include +#include +#include + + + +class TestScalarDistributions : public ::testing::Test { + protected: + + // This is really for distributions whose CDFs "F" are continuous and strictly increasing + // on the interval [a, b] where F(a) = 0 and F(b) = 1. + + template + static void kolmogorov_smirnov_tester( + std::vector &samples, double critical_value, DenseDistName dn + ) { + auto F_true = [dn](T x) { + if (dn == DenseDistName::Gaussian) { + return RandBLAS_StatTests::standard_normal_cdf(x); + } else if (dn == DenseDistName::Uniform) { + return RandBLAS_StatTests::uniform_syminterval_cdf(x, (T) std::sqrt(3)); + } else { + std::string msg = "Unrecognized distributions name"; + throw std::runtime_error(msg); + } + }; + auto N = (int) samples.size(); + /** + * Let L(x) = |F_empirical(x) - F_true(x)|. The KS test testatistic is + * + * ts = sup_{all x} L(x). + * + * Now set s = sorted(samples), and partition the real line into + * + * I_0 = (-infty, s[0 ]), ... + * I_1 = [s[0 ], s[1 ]), ... + * I_2 = [s[1 ], s[2 ]), ... + * I_{N-1} = [s[N-2], s[N-1]), ... 
+ * I_N = [s[N-1], +infty). + * + * Then, provided F_true is continuous, we have + * + * sup{ L(x) : x in I_j } = max{ + * |F_true(inf(I_j)) - j/N|, |F_true(sup(I_j)) - j/N| + * } + * + * for j = 0, ..., N. + */ + samples.push_back( std::numeric_limits::infinity()); + samples.push_back(-std::numeric_limits::infinity()); + std::sort(samples.begin(), samples.end(), [](T a, T b) {return (a < b);}); + for (int64_t j = 0; j <= N; ++j) { + T temp1 = F_true(samples[j ]); + T temp2 = F_true(samples[j + 1]); + T empirical = ((T)j)/((T)N); + T val1 = std::abs(temp1 - empirical); + T val2 = std::abs(temp2 - empirical); + T supLx_on_Ij = std::max(val1, val2); + ASSERT_LE(supLx_on_Ij, critical_value) + << "\nj = " << j << " of N = " << N + << "\nF_true(inf(I_j)) = " << temp1 + << "\nF_true(sup(I_j)) = " << temp2 + << "\nI_j = [" << samples[j] << ", " << samples[j+1] << ")"; + } + return; + } + + template + static void run(double significance, int64_t num_samples, DenseDistName dn, uint32_t seed) { + using RandBLAS_StatTests::KolmogorovSmirnovConstants::critical_value_rep_mutator; + auto critical_value = critical_value_rep_mutator(num_samples, significance); + RNGState state(seed); + std::vector samples(num_samples, -1); + RandBLAS::fill_dense({num_samples, 1, dn, RandBLAS::MajorAxis::Long}, samples.data(), state); + kolmogorov_smirnov_tester(samples, critical_value, dn); + return; + } +}; + +TEST_F(TestScalarDistributions, uniform_ks_generous) { + double s = 1e-6; + for (uint32_t i = 999; i < 1011; ++i) { + run(s, 100000, DenseDistName::Uniform, i); + run(s, 10000, DenseDistName::Uniform, i*i); + run(s, 1000, DenseDistName::Uniform, i*i*i); + } +} + +TEST_F(TestScalarDistributions, uniform_ks_moderate) { + double s = 1e-4; + run(s, 100000, DenseDistName::Uniform, 0); + run(s, 10000, DenseDistName::Uniform, 0); + run(s, 1000, DenseDistName::Uniform, 0); +} + +TEST_F(TestScalarDistributions, uniform_ks_skeptical) { + double s = 1e-2; + run(s, 100000, DenseDistName::Uniform, 0); + run(s, 10000, DenseDistName::Uniform, 0); + run(s, 1000, DenseDistName::Uniform, 0); +} + +TEST_F(TestScalarDistributions, guassian_ks_generous) { + double s = 1e-6; + for (uint32_t i = 99; i < 103; ++i) { + run(s, 100000, DenseDistName::Gaussian, i); + run(s, 10000, DenseDistName::Gaussian, i*i); + run(s, 1000, DenseDistName::Gaussian, i*i*i); + } +} + +TEST_F(TestScalarDistributions, guassian_ks_moderate) { + double s = 1e-4; + run(s, 100000, DenseDistName::Gaussian, 0); + run(s, 10000, DenseDistName::Gaussian, 0); + run(s, 1000, DenseDistName::Gaussian, 0); +} + +TEST_F(TestScalarDistributions, guassian_ks_skeptical) { + double s = 1e-2; + run(s, 100000, DenseDistName::Gaussian, 0); + run(s, 10000, DenseDistName::Gaussian, 0); + run(s, 1000, DenseDistName::Gaussian, 0); +} diff --git a/test/test_basic_rng/test_sample_indices.cc b/test/test_basic_rng/test_discrete.cc similarity index 88% rename from test/test_basic_rng/test_sample_indices.cc rename to test/test_basic_rng/test_discrete.cc index 852d7692..92014d29 100644 --- a/test/test_basic_rng/test_sample_indices.cc +++ b/test/test_basic_rng/test_discrete.cc @@ -165,8 +165,7 @@ class TestSampleIndices : public ::testing::Test // Mark: Without Replacement Tests // - static std::vector fisher_yates_cdf(const std::vector &idxs_major, int64_t K, int64_t num_samples) - { + static std::vector fisher_yates_cdf(const std::vector &idxs_major, int64_t K, int64_t num_samples) { using RandBLAS::util::weights_to_cdf; std::vector empirical_cdf; @@ -212,61 +211,50 @@ class TestSampleIndices 
: public ::testing::Test } } - static void single_test_fisher_yates_kolmogorov_smirnov(int64_t N, int64_t K, double significance, int64_t num_samples, uint32_t seed) - { - using RandBLAS::repeated_fisher_yates; + static void single_test_fisher_yates_kolmogorov_smirnov(int64_t N, int64_t K, double significance, int64_t num_samples, uint32_t seed) { + using RandBLAS::sparse::repeated_fisher_yates; using RandBLAS_StatTests::hypergeometric_pmf_arr; using RandBLAS::util::weights_to_cdf; using RandBLAS_StatTests::KolmogorovSmirnovConstants::critical_value_rep_mutator; + auto critical_value = critical_value_rep_mutator(num_samples, significance); + // Initialize arguments for repeated_fisher_yates - std::vector idxs_major(K * num_samples); - std::vector idxs_minor(K * num_samples); - std::vector vals(K * num_samples); + std::vector indices(K * num_samples); RNGState state(seed); // Generate repeated Fisher-Yates in idxs_major - state = repeated_fisher_yates( - state, K, N, num_samples, // K=vec_nnz, N=dim_major, num_samples=dim_minor - idxs_major.data(), idxs_minor.data(), vals.data()); + state = repeated_fisher_yates(state, K, N, num_samples, indices.data()); - // Generate the true hypergeometric cdf - std::vector true_pmf = hypergeometric_pmf_arr(N, K, K); - weights_to_cdf(true_pmf.size(), true_pmf.data()); - std::vector true_cdf = true_pmf; // Rename for clarity + // Generate the true hypergeometric cdf (get the pdf first) + std::vector true_cdf = hypergeometric_pmf_arr(N, K, K); + weights_to_cdf(true_cdf.size(), true_cdf.data()); // Compute the critval and check against it - auto critical_value = critical_value_rep_mutator(num_samples, significance); - fisher_yates_kolmogorov_smirnov_tester(idxs_major, true_cdf, critical_value, N, K, num_samples); + fisher_yates_kolmogorov_smirnov_tester(indices, true_cdf, critical_value, N, K, num_samples); return; } - static int64_t special_increment_k(int64_t K, int64_t N, const std::vector &K_bounds = {9, 99}) - { - // - // Example evolution of K with N=1000: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 23, 39, 64, 100, - // - double exp_base = std::pow(10.0, 1.0 / 10.0); // Log base to give 10 steps for each order of magnitude - - if (K <= K_bounds[0]) { - // Step one-by-one up to K_bounds[0] (e.g., 9) - return K + 1; - } else if (K <= K_bounds[1]) { - // Step in square root scale up to K_bounds[1] (e.g., 99) - return static_cast(std::pow(std::sqrt(K) + 1, 2)); + static void incr_with_phase_transitions(int64_t &K, int64_t unit_bound = 10, int64_t sqrt_bound = 100) { + if (K < unit_bound) { + K += 1; + } else if (K < sqrt_bound) { + // Step in square root scale up to sqrt_bound + K += (int64_t) std::pow(std::sqrt(K) + 1, 2); } else { - // Step in log-scale after K_bounds[1] - return static_cast(K * exp_base); + // Step in log-scale after sqrt_bound + // Log base to give 5 steps for each order of magnitude + K *= std::pow(10.0, 0.2); } + return; } - static void test_fisher_yates_kolmogorov_smirnov(int64_t N, double significance, int64_t num_samples, uint32_t seed) - { + static void test_fisher_yates_kolmogorov_smirnov(int64_t N, double significance, int64_t num_samples, uint32_t seed) { int64_t K = 0; while (K <= N) { single_test_fisher_yates_kolmogorov_smirnov(N, K, significance, num_samples, seed); - K = special_increment_k(K, N); + incr_with_phase_transitions(K); } } diff --git a/test/test_basic_rng/test_distortion.cc b/test/test_basic_rng/test_distortion.cc new file mode 100644 index 00000000..dfbabedf --- /dev/null +++ 
b/test/test_basic_rng/test_distortion.cc @@ -0,0 +1,159 @@ +// Copyright, 2024. See LICENSE for copyright holder information. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// (1) Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// (2) Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// (3) Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// + +#include "RandBLAS/config.h" +#include "RandBLAS/base.hh" +#include "RandBLAS/util.hh" +#include "RandBLAS/dense_skops.hh" +using RandBLAS::DenseDist; +using RandBLAS::DenseDistName; +using RandBLAS::RNGState; + +#include "rng_common.hh" +#include "../handrolled_lapack.hh" + +#include +#include +#include + + +class TestSubspaceDistortion : public ::testing::Test { + protected: + + template + void run_general(DenseDistName name, T distortion, int64_t d, int64_t N, uint32_t key) { + auto layout = blas::Layout::ColMajor; + DenseDist D(d, N, name); + std::vector S(d*N); + std::cout << "(d, N) = ( " << d << ", " << N << " )\n"; + RandBLAS::RNGState state(key); + auto next_state = RandBLAS::fill_dense(D, S.data(), state); + T inv_stddev = (name == DenseDistName::Gaussian) ? (T) 1.0 : (T) 1.0; + blas::scal(d*N, inv_stddev / std::sqrt(d), S.data(), 1); + std::vector G(N*N, 0.0); + blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, N, d, (T)1.0, S.data(), d, (T)0.0, G.data(), N); + RandBLAS::util::symmetrize(layout, blas::Uplo::Upper, N, G.data(), N); + + std::vector eigvecs(2*N, 0.0); + std::vector subwork{}; + T powermethod_reltol = 1e-2; + T powermethod_failprob = 1e-6; + auto [lambda_max, lambda_min, ignore] = hr_lapack::exeigs_powermethod( + N, G.data(), eigvecs.data(), powermethod_reltol, powermethod_failprob, state, subwork + ); + T sigma_max = std::sqrt(lambda_max); + T sigma_min = std::sqrt(lambda_min); + ASSERT_LE(sigma_max, 1+distortion); + ASSERT_GE(sigma_min, 1-distortion); + return; + } + + template + void run_gaussian(T distortion, T tau, T p_fail_bound, uint32_t key) { + /** + * Generate a d-by-N random matrix, where d = gamma*N, + * gamma = ((1 + tau)/delta)^2, and N is the smallest integer where n > N implies + * n*(tau - 1/sqrt(n))^2 >= 2*log(1/p_fail_bound). 
+ * One can verify that this value for N is given as + * N = ceil( ([sqrt(2*log(1/p)) + 1]/ tau )^2 ) + * + * ---------------------- + * Temporary notes + * ---------------------- + * Find N = min{ n : exp(-t^2 gamma n ) <= p_fail_bound }, where + * t := delta - (gamma)^{-1/2}(1 + 1/sqrt(n)), and + * gamma := ((1+tau)/delta)^2. + * + * Choosing gamma of this form with tau > 0 ensures that no + * matter the value of delta in (0, 1) there always an N + * so that probability bound holds whenever n >= N. + * + */ + double val = std::sqrt(-2 * std::log(p_fail_bound)) + 1; + val /= tau; + val *= val; + int64_t N = (int64_t) std::ceil(val); + int64_t d = std::ceil( std::pow((1 + tau) / distortion, 2) * N ); + run_general(DenseDistName::Gaussian, distortion, d, N, key); + return; + } + + template + void run_uniform(T distortion, T rate, T p_fail_bound, uint32_t key) { + int64_t N = std::ceil(std::log((T)2 / p_fail_bound) / rate); + T c6 = 1.0; // definitely not high enough. + T epsnet_spectralnorm_factor = 1.0; // should be 4.0 + T theta = epsnet_spectralnorm_factor * c6 * (rate + std::log(9)); + int64_t d = std::ceil(N * theta * std::pow(distortion, -2)); + run_general(DenseDistName::Uniform, distortion, d, N, key); + return; + } +}; + +TEST_F(TestSubspaceDistortion, gaussian_rate_100_fail_0001) { + uint32_t key = 8673309; + float p_fail = 1e-3; + for (uint32_t i = 0; i < 3; ++i ) { + run_gaussian(0.50f, 1.0f, p_fail, key + i); + run_gaussian(0.25f, 1.0f, p_fail, key + i); + run_gaussian(0.10f, 1.0f, p_fail, key + i); + } +} + +TEST_F(TestSubspaceDistortion, gaussian_rate_004_fail_0001) { + uint32_t key = 8673309; + float p_fail = 1e-3; + float tau = 0.2f; // the convergence rate depends on tau^2. + for (uint32_t i = 0; i < 3; ++i ) { + run_gaussian(0.75f, tau, p_fail, key + i); + run_gaussian(0.50f, tau, p_fail, key + i); + run_gaussian(0.25f, tau, p_fail, key + i); + } +} + +TEST_F(TestSubspaceDistortion, uniform_rate_100_fail_0001) { + uint32_t key = 8673309; + float p_fail = 1e-3; + for (uint32_t i = 0; i < 3; ++i ) { + run_uniform(0.50f, 1.0f, p_fail, key + i); + run_uniform(0.25f, 1.0f, p_fail, key + i); + run_uniform(0.10f, 1.0f, p_fail, key + i); + } +} + +TEST_F(TestSubspaceDistortion, uniform_rate_004_fail_0001) { + uint32_t key = 8673309; + float p_fail = 1e-3; + float rate = 0.04; + for (uint32_t i = 0; i < 3; ++i ) { + run_uniform(0.50f, rate, p_fail, key + i); + run_uniform(0.25f, rate, p_fail, key + i); + run_uniform(0.10f, rate, p_fail, key + i); + } +} diff --git a/test/test_basic_rng/test_r123_kat.cc b/test/test_basic_rng/test_r123.cc similarity index 71% rename from test/test_basic_rng/test_r123_kat.cc rename to test/test_basic_rng/test_r123.cc index 14e6517e..da17842d 100644 --- a/test/test_basic_rng/test_r123_kat.cc +++ b/test/test_basic_rng/test_r123.cc @@ -31,10 +31,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include -#include -#include -#include -#include #include #include @@ -42,9 +38,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include -#include +#include #include #include #include @@ -52,21 +47,37 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include #include #include +#include +#include +#include -#define LINESIZE 1024 +#include +#include -#if R123_USE_AES_NI - int have_aesni = haveAESNI(); -#else - int have_aesni = 0; +#include +#include +#include +#include +#include +#include + + +#define LINESIZE 1024 +#ifdef _MSC_FULL_VER +#define strtoull _strtoui64 +// ^ Needed to define the strtou32 and strtou64 functions. +#pragma warning (disable : 4521) +// ^ Engines have multiple copy constructors, quite legal C++, disable MSVC complaint #endif + int verbose = 0; int debug = 0; +// MARK: I/O and conversions + /* strdup may or may not be in string.h, depending on the value of the pp-symbol _XOPEN_SOURCE and other arcana. Just do it ourselves. @@ -84,9 +95,6 @@ char *ntcsdup(const char *s){ // Specifically, they strip leading whitespace, and then they stop reading as // soon as they reach a non-numeric character. (Note that the "a" in 257a3673 // counts as a numeric character if we're reading in hexadecimal format.) -#ifdef _MSC_FULL_VER -#define strtoull _strtoui64 -#endif uint32_t strtou32(const char *p, char **endp, int base){ uint32_t ret; errno = 0; @@ -108,8 +116,6 @@ void prtu(std::ostream& os, T val) { os << std::hex << std::setw(std::numeric_limits::digits / 4) << std::setfill('0') << val; assert(!os.bad()); } - -// Specializations for uint32_t and uint64_t void prtu32(std::ostream& os, uint32_t v) { prtu(os, v); } void prtu64(std::ostream& os, uint64_t v) { prtu(os, v); } @@ -136,6 +142,38 @@ do { \ fflush(fp); \ } while(0) +// MARK: Base generator test +// +// There's a lot of code involved in this test. The code can roughly +// be broken down into three categories. +// +// Category 1: code generated from compiler directives +// +// This pattern is left over from our adaptation of Random123 tests, +// which have to compile whether interpreted as C or C++ source. It +// uses compiler directives to accomplish what something roughly +// equivalent to C++ templating and metaprogramming. +// +// The code specifically generates the following identifiers. +// +// method_e::NxW_e (enum members) +// NxW_kat (structs) +// kat_instance.NxW_data (members of type NxW_kat) +// read_NxW (functions) +// report_NxWerror (functions) +// +// Category 2: helper functions +// +// The base_rng_test_[arrange,act,assert] functions are slight adaptations +// of functions that appeared in Random123 testing infrastructure. Their +// names indicte their roles in the common "arrange, act, assert" pattern of +// writing unit tests. Their precise descriptions are complicated. +// +// Category 3: the main runner +// +// This manages all calls to the helper functions defined in Category 2. +// + enum method_e{ #define RNGNxW_TPL(base, N, W) base##N##x##W##_e, #include "r123_rngNxW.mm" @@ -144,56 +182,28 @@ enum method_e{ }; #define RNGNxW_TPL(base, N, W) \ - typedef struct { \ + struct base##N##x##W##_kat { \ base##N##x##W##_ctr_t ctr; \ base##N##x##W##_ukey_t ukey; \ base##N##x##W##_ctr_t expected; \ base##N##x##W##_ctr_t computed; \ - } base##N##x##W##_kat; + }; #include "r123_rngNxW.mm" #undef RNGNxW_TPL -typedef struct{ +struct kat_instance { enum method_e method; unsigned nrounds; union{ #define RNGNxW_TPL(base, N, W) base##N##x##W##_kat base##N##x##W##_data; #include "r123_rngNxW.mm" #undef RNGNxW_TPL - /* Sigh... For those platforms that lack uint64_t, carve - out 128 bytes for the counter, key, expected, and computed. */ + // Sigh... 
For those platforms that lack uint64_t, carve + // out 128 bytes for the counter, key, expected, and computed. char justbytes[128]; }u; -} kat_instance; - -void host_execute_tests(kat_instance *tests, unsigned ntests); - -/* Keep track of the test vectors that we don't know how to deal with: */ -#define MAXUNKNOWNS 20 - -struct UnknownKatTracker { - int num_unknowns = 0; - const char *unknown_names[MAXUNKNOWNS]; - int unknown_counts[MAXUNKNOWNS]; }; -void register_unknown(UnknownKatTracker &ukt, const char *name){ - int i; - for(i=0; i< ukt.num_unknowns; ++i){ - if( strcmp(name, ukt.unknown_names[i]) == 0 ){ - ukt.unknown_counts[i]++; - return; - } - } - if( i >= MAXUNKNOWNS ){ - FAIL() << "Too many unknown rng types. Bye.\n"; - } - ukt.num_unknowns++; - ukt.unknown_names[i] = ntcsdup(name); - ukt.unknown_counts[i] = 1; -} - -/* read_NxW */ #define RNGNxW_TPL(base, N, W) \ int read_##base##N##x##W(const char *line, kat_instance* tinst){ \ size_t i; \ @@ -223,29 +233,6 @@ int read_##base##N##x##W(const char *line, kat_instance* tinst){ \ #include "r123_rngNxW.mm" #undef RNGNxW_TPL - -/* readtest: dispatch to one of the read_NxW functions */ -static int readtest(UnknownKatTracker &ukt, const char *line, kat_instance* tinst){ - int nchar; - char name[LINESIZE]; - if( line[0] == '#') return 0; - sscanf(line, "%s%n", name, &nchar); - if(!have_aesni){ - /* skip any tests that require AESNI */ - if(strncmp(name, "aes", 3)==0 || - strncmp(name, "ars", 3)==0){ - register_unknown(ukt, name); - return 0; - } - } -#define RNGNxW_TPL(base, N, W) if(strcmp(name, #base #N "x" #W) == 0) return read_##base##N##x##W(line+nchar, tinst); -#include "r123_rngNxW.mm" -#undef RNGNxW_TPL - - register_unknown(ukt, name); - return 0; -} - #define RNGNxW_TPL(base, N, W) \ void report_##base##N##x##W##error(int &nfailed, const kat_instance *ti){ \ size_t i; \ @@ -282,134 +269,61 @@ void report_##base##N##x##W##error(int &nfailed, const kat_instance *ti){ \ #include "r123_rngNxW.mm" #undef RNGNxW_TPL -// dispatch to one of the report_NxW() functions -void analyze_tests(int &nfailed, const kat_instance *tests, unsigned ntests){ - unsigned i; - char zeros[512] = {0}; - for(i=0; iu.base##N##x##W##_data.expected.v, N*W/8)==0){ \ - FAIL() << "kat expected all zeros? Something is wrong with the test harness!\n"; \ - nfailed++; \ - } \ - if (memcmp(ti->u.base##N##x##W##_data.computed.v, ti->u.base##N##x##W##_data.expected.v, N*W/8)) \ - report_##base##N##x##W##error(nfailed, ti); \ - break; -#include "r123_rngNxW.mm" -#undef RNGNxW_TPL - case last: ; - } - } -} - -#define NTESTS 1000 - -void run_base_rng_kat() { - kat_instance *tests; - unsigned t, ntests = NTESTS; - char linebuf[LINESIZE]; - FILE *inpfile; - const char *p; - const char *inname; - int nfailed = 0; - - UnknownKatTracker ukt{}; - - inname = "./kat_vectors.txt"; - inpfile = fopen(inname, "r"); - if (inpfile == NULL) - FAIL() << "Error opening input file " << inname << " for reading. 
Received error code " << errno << "\n"; - - if ((p = getenv("KATC_VERBOSE")) != NULL) - verbose = atoi(p); - - if ((p = getenv("KATC_DEBUG")) != NULL) - debug = atoi(p); +struct UnknownKatTracker { + const static int MAXUNKNOWNS = 20; + int num_unknowns = 0; + const char *unknown_names[MAXUNKNOWNS]; + int unknown_counts[MAXUNKNOWNS]; +}; - tests = (kat_instance *) malloc(sizeof(tests[0])*ntests); - if (tests == NULL) { - FAIL() << "Could not allocate " << (unsigned long) ntests << " bytes for tests\n"; - } - t = 0; - while (fgets(linebuf, sizeof linebuf, inpfile) != NULL) { - if( t == ntests ) { - ntests *= 2; - tests = (kat_instance *)realloc(tests, sizeof(tests[0])*ntests); - if (tests == NULL) { - FAIL() << "Could not grow tests to " << (unsigned long) ntests << " bytes.\n"; - } +void register_unknown(UnknownKatTracker &ukt, const char *name){ + int i; + for(i=0; i< ukt.num_unknowns; ++i){ + if( strcmp(name, ukt.unknown_names[i]) == 0 ){ + ukt.unknown_counts[i]++; + return; } - if( readtest(ukt, linebuf, &tests[t]) ) - ++t; } - if(t==ntests){ - FAIL() << "No more space for tests? Recompile with a larger NTESTS\n"; + if( i >= ukt.MAXUNKNOWNS ){ + FAIL() << "Too many unknown rng types. Bye.\n"; } - tests[t].method = last; // N.B *not* t++ - the 'ntests' value passed to host_execute_tests does not count the 'last' one. + ukt.num_unknowns++; + ukt.unknown_names[i] = ntcsdup(name); + ukt.unknown_counts[i] = 1; +} - for(int i=0; i< ukt.num_unknowns; ++i){ - printf("%d test vectors of type %s skipped\n", ukt.unknown_counts[i], ukt.unknown_names[i]); +void base_rng_test_arrange(const char *line, kat_instance* tinst, UnknownKatTracker &ukt, bool &flag){ + int nchar; + char name[LINESIZE]; + if( line[0] == '#') { + flag = false; + return; + } + sscanf(line, "%s%n", name, &nchar); + /* skip any tests that require AESNI */ + if(strncmp(name, "aes", 3)==0 || strncmp(name, "ars", 3)==0){ + register_unknown(ukt, name); + flag = false; + return; } - printf("Perform %lu tests.\n", (unsigned long)t); - host_execute_tests(tests, t); +#define RNGNxW_TPL(base, N, W) \ + if(strcmp(name, #base #N "x" #W) == 0) { \ + flag = (bool) read_##base##N##x##W(line+nchar, tinst); \ + return; \ + } +#include "r123_rngNxW.mm" +#undef RNGNxW_TPL - analyze_tests(nfailed, tests, t); - free(tests); - if(nfailed != 0) - FAIL() << "Failed " << nfailed << " out of " << t << std::endl; + register_unknown(ukt, name); + flag = false; return; } - - -// With C++, it's a little trickier to create the mapping from -// method-name/round-count to functions -// because the round-counts are template arguments that have to be -// specified at compile-time. Thus, we can't just do #define RNGNxW_TPL -// and #include "r123_rngNxW.mm". We have to build a static map from: -// pair to functions that apply the right generator -// with the right number of rounds. - -#ifdef _MSC_FULL_VER -// Engines have multiple copy constructors, quite legal C++, disable MSVC complaint -#pragma warning (disable : 4521) -#endif - -#include -#include -#include -#include -#include -#include - -using namespace std; - -typedef map, void (*)(kat_instance *)> genmap_t; -genmap_t genmap; - -void dev_execute_tests(kat_instance *tests, unsigned ntests){ - unsigned i; - for(i=0; imethod, ti->nrounds)); - if(p == genmap.end()) - throw std::runtime_error("pair not in map. You probably need to add more genmap entries in kat_cpp.cpp"); - - p->second(ti); - // TODO: check that the corresponding Engine and MicroURNG - // return the same values. 
Note that we have ut_Engine and - // ut_MicroURNG, which check basic functionality, but they - // don't have the breadth of the kat_vectors. - } -} - static int murng_reported; static int engine_reported; template -void do_test(kat_instance* ti){ +void base_rng_test_act(kat_instance* ti){ GEN g; struct gdata{ typename GEN::ctr_type ctr; @@ -419,7 +333,7 @@ void do_test(kat_instance* ti){ }; gdata data; // use memcpy. A reinterpret_cast would violate strict aliasing. - memcpy(&data, &ti->u, sizeof(data)); + std::memcpy(&data, &ti->u, sizeof(data)); data.computed = g(data.ctr, data.ukey); // Before we return, let's make sure that MicroURNG and @@ -452,7 +366,7 @@ void do_test(kat_instance* ti){ errs++; } if(errs && (murng_reported++ == 0)) - cerr << "Error in MicroURNG, will appear as \"computed\" value of zero in error summary\n"; + std::cerr << "Error in MicroURNG, will appear as \"computed\" value of zero in error summary\n"; // Engine // N.B. exercising discard() arguably belongs in ut_Engine.cpp @@ -499,61 +413,145 @@ void do_test(kat_instance* ti){ value_type val = e(); size_t j = data.expected.size() - i - 1; if (data.expected[j] != val) { - cerr << hex; - cerr << "Engine check, j=" << j << " expected: " << data.expected[j] << " val: " << val << "\n"; + std::cerr << std::hex; + std::cerr << "Engine check, j=" << j << " expected: " << data.expected[j] << " val: " << val << "\n"; errs++; if(engine_reported++ == 0) - cerr << "Error in Engine, will appear as \"computed\" value of zero in error summary\n"; + std::cerr << "Error in Engine, will appear as \"computed\" value of zero in error summary\n"; } } // Signal an error to the caller by *not* copying back // the computed data object into the ti if(errs == 0) - memcpy(&ti->u, &data, sizeof(data)); + std::memcpy(&ti->u, &data, sizeof(data)); } -void host_execute_tests(kat_instance *tests, unsigned ntests){ - // In C++1x, this could be staticly declared with an initializer list. - genmap[make_pair(threefry2x32_e, 13u)] = do_test >; - genmap[make_pair(threefry2x32_e, 20u)] = do_test >; - genmap[make_pair(threefry2x32_e, 32u)] = do_test >; -#if R123_USE_64BIT - genmap[make_pair(threefry2x64_e, 13u)] = do_test >; - genmap[make_pair(threefry2x64_e, 20u)] = do_test >; - genmap[make_pair(threefry2x64_e, 32u)] = do_test >; -#endif +void base_rng_test_assert(int &nfailed, const kat_instance *tests, unsigned ntests){ + unsigned i; + char zeros[512] = {0}; + for(i=0; iu.base##N##x##W##_data.expected.v, N*W/8)==0){ \ + FAIL() << "kat expected all zeros? 
Something is wrong with the test harness!\n"; \ + nfailed++; \ + } \ + if (memcmp(ti->u.base##N##x##W##_data.computed.v, ti->u.base##N##x##W##_data.expected.v, N*W/8)) \ + report_##base##N##x##W##error(nfailed, ti); \ + break; +#include "r123_rngNxW.mm" +#undef RNGNxW_TPL + case last: ; + } + } +} - genmap[make_pair(threefry4x32_e, 13u)] = do_test >; - genmap[make_pair(threefry4x32_e, 20u)] = do_test >; - genmap[make_pair(threefry4x32_e, 72u)] = do_test >; -#if R123_USE_64BIT - genmap[make_pair(threefry4x64_e, 13u)] = do_test >; - genmap[make_pair(threefry4x64_e, 20u)] = do_test >; - genmap[make_pair(threefry4x64_e, 72u)] = do_test >; -#endif +void run_all_base_rng_kats() { + kat_instance *tests; + unsigned t, ntests = 1000; + char linebuf[LINESIZE]; + FILE *inpfile; + const char *p; + const char *inname; + int nfailed = 0; - genmap[make_pair(philox2x32_e, 7u)] = do_test >; - genmap[make_pair(philox2x32_e, 10u)] = do_test >; - genmap[make_pair(philox4x32_e, 7u)] = do_test >; - genmap[make_pair(philox4x32_e, 10u)] = do_test >; + UnknownKatTracker ukt{}; -#if R123_USE_PHILOX_64BIT - genmap[make_pair(philox2x64_e, 7u)] = do_test >; - genmap[make_pair(philox2x64_e, 10u)] = do_test >; - genmap[make_pair(philox4x64_e, 7u)] = do_test >; - genmap[make_pair(philox4x64_e, 10u)] = do_test >; -#endif + inname = "./r123_kat_vectors.txt"; + inpfile = fopen(inname, "r"); + if (inpfile == NULL) + FAIL() << "Error opening input file " << inname << " for reading. Received error code " << errno << "\n"; -#if R123_USE_AES_NI - genmap[make_pair(aesni4x32_e, 10u)] = do_test; - genmap[make_pair(ars4x32_e, 7u)] = do_test >; - genmap[make_pair(ars4x32_e, 10u)] = do_test >; -#endif + if ((p = getenv("KATC_VERBOSE")) != NULL) + verbose = atoi(p); + + if ((p = getenv("KATC_DEBUG")) != NULL) + debug = atoi(p); + + tests = (kat_instance *) malloc(sizeof(tests[0])*ntests); + if (tests == NULL) { + FAIL() << "Could not allocate " << (unsigned long) ntests << " bytes for tests\n"; + } + t = 0; + while (fgets(linebuf, sizeof linebuf, inpfile) != NULL) { + if( t == ntests ) { + ntests *= 2; + tests = (kat_instance *)realloc(tests, sizeof(tests[0])*ntests); + if (tests == NULL) { + FAIL() << "Could not grow tests to " << (unsigned long) ntests << " bytes.\n"; + } + } + bool flag = false; + base_rng_test_arrange(linebuf, &tests[t], ukt, flag); + if( flag ) + ++t; + } + if(t==ntests){ + FAIL() << "No more space for tests? Recompile with a larger ntests\n"; + } + tests[t].method = last; + + for(int i=0; i< ukt.num_unknowns; ++i){ + printf("%d test vectors of type %s skipped\n", ukt.unknown_counts[i], ukt.unknown_names[i]); + } + printf("Perform %lu tests.\n", (unsigned long)t); + using std::map; + using std::pair; + using std::make_pair; + typedef map, void (*)(kat_instance *)> genmap_t; + genmap_t genmap; + // In C++1x, this could be staticly declared with an initializer list. 
+ genmap[make_pair(threefry2x32_e, 13u)] = base_rng_test_act >; + genmap[make_pair(threefry2x32_e, 20u)] = base_rng_test_act >; + genmap[make_pair(threefry2x32_e, 32u)] = base_rng_test_act >; + + genmap[make_pair(threefry4x32_e, 13u)] = base_rng_test_act >; + genmap[make_pair(threefry4x32_e, 20u)] = base_rng_test_act >; + genmap[make_pair(threefry4x32_e, 72u)] = base_rng_test_act >; + + #if R123_USE_64BIT + genmap[make_pair(threefry2x64_e, 13u)] = base_rng_test_act >; + genmap[make_pair(threefry2x64_e, 20u)] = base_rng_test_act >; + genmap[make_pair(threefry2x64_e, 32u)] = base_rng_test_act >; + + genmap[make_pair(threefry4x64_e, 13u)] = base_rng_test_act >; + genmap[make_pair(threefry4x64_e, 20u)] = base_rng_test_act >; + genmap[make_pair(threefry4x64_e, 72u)] = base_rng_test_act >; + #endif + + genmap[make_pair(philox2x32_e, 7u)] = base_rng_test_act >; + genmap[make_pair(philox2x32_e, 10u)] = base_rng_test_act >; + genmap[make_pair(philox4x32_e, 7u)] = base_rng_test_act >; + genmap[make_pair(philox4x32_e, 10u)] = base_rng_test_act >; + + #if R123_USE_PHILOX_64BIT + genmap[make_pair(philox2x64_e, 7u)] = base_rng_test_act >; + genmap[make_pair(philox2x64_e, 10u)] = base_rng_test_act >; + genmap[make_pair(philox4x64_e, 7u)] = base_rng_test_act >; + genmap[make_pair(philox4x64_e, 10u)] = base_rng_test_act >; + #endif + + unsigned i; + for(i=0; imethod, ti->nrounds)); + if(p == genmap.end()) + throw std::runtime_error("pair not in map. You probably need to add more genmap entries."); - dev_execute_tests(tests, ntests); + p->second(ti); + // ^ That prepares the test data + } + + base_rng_test_assert(nfailed, tests, t); + free(tests); + if(nfailed != 0) + FAIL() << "Failed " << nfailed << " out of " << t << std::endl; + return; } +// MARK: histogram test using namespace r123; @@ -567,30 +565,6 @@ typename r123::make_signed::type S(T x){ return x; } chk(#u, #Rng, #Ftype, &u, _nfail_, _refhist_); \ }while(0) -std::map get_refmap(){ - using std::string; - std::map refmap{}; - refmap[string("u01 Threefry4x32 float")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); - refmap[string("u01 Threefry4x32 double")]= string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); - refmap[string("u01 Threefry4x32 long double")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); - refmap[string("u01 Threefry4x64 float")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); - refmap[string("u01 Threefry4x64 double")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); - refmap[string("u01 Threefry4x64 long double")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); - refmap[string("uneg11 Threefry4x32 float")] = string(" 156 139 148 146 159 148 159 168 142 160 156 161 153 143 158 150 180 174 152 163 157 129 166 151 140 142"); - refmap[string("uneg11 Threefry4x32 double")] = string(" 156 139 148 146 159 148 159 168 142 160 156 161 153 143 158 150 180 174 152 163 157 129 166 151 140 142"); - refmap[string("uneg11 Threefry4x32 long double")] = string( " 156 139 148 146 159 148 159 168 142 160 156 161 153 143 158 150 180 174 152 163 157 129 166 151 140 142"); - refmap[string("uneg11 Threefry4x64 float")] = string( " 159 141 148 184 162 142 155 137 173 187 153 140 135 164 144 146 149 151 171 152 148 137 179 146 145 152"); - refmap[string("uneg11 Threefry4x64 double")] = string( " 159 
141 148 184 162 142 155 137 173 187 153 140 135 164 144 146 149 151 171 152 148 137 179 146 145 152"); - refmap[string("uneg11 Threefry4x64 long double")] = string( " 159 141 148 184 162 142 155 137 173 187 153 140 135 164 144 146 149 151 171 152 148 137 179 146 145 152"); - refmap[string("u01fixedpt Threefry4x32 float")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); - refmap[string("u01fixedpt Threefry4x32 double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); - refmap[string("u01fixedpt Threefry4x32 long double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); - refmap[string("u01fixedpt Threefry4x64 float")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); - refmap[string("u01fixedpt Threefry4x64 double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); - refmap[string("u01fixedpt Threefry4x64 long double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); - return refmap; -} - template void chk(const std::string& fname, const std::string& rngname, const std::string& ftypename, Utype f, int &nfail, std::map &refmap){ std::string key = fname + " " + rngname + " " + ftypename; @@ -632,8 +606,32 @@ void chk(const std::string& fname, const std::string& rngname, const std::string } } +std::map get_ut_uniform_refmap(){ + using std::string; + std::map refmap{}; + refmap[string("u01 Threefry4x32 float")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); + refmap[string("u01 Threefry4x32 double")]= string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); + refmap[string("u01 Threefry4x32 long double")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); + refmap[string("u01 Threefry4x64 float")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); + refmap[string("u01 Threefry4x64 double")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); + refmap[string("u01 Threefry4x64 long double")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); + refmap[string("uneg11 Threefry4x32 float")] = string(" 156 139 148 146 159 148 159 168 142 160 156 161 153 143 158 150 180 174 152 163 157 129 166 151 140 142"); + refmap[string("uneg11 Threefry4x32 double")] = string(" 156 139 148 146 159 148 159 168 142 160 156 161 153 143 158 150 180 174 152 163 157 129 166 151 140 142"); + refmap[string("uneg11 Threefry4x32 long double")] = string( " 156 139 148 146 159 148 159 168 142 160 156 161 153 143 158 150 180 174 152 163 157 129 166 151 140 142"); + refmap[string("uneg11 Threefry4x64 float")] = string( " 159 141 148 184 162 142 155 137 173 187 153 140 135 164 144 146 149 151 171 152 148 137 179 146 145 152"); + refmap[string("uneg11 Threefry4x64 double")] = string( " 159 141 148 184 162 142 155 137 173 187 153 140 135 164 144 146 149 151 171 152 148 137 179 146 145 152"); + refmap[string("uneg11 Threefry4x64 long double")] = string( " 159 141 148 184 162 142 155 137 173 187 153 140 135 164 144 146 149 151 171 152 148 137 179 146 145 152"); + refmap[string("u01fixedpt Threefry4x32 float")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); + refmap[string("u01fixedpt 
Threefry4x32 double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); + refmap[string("u01fixedpt Threefry4x32 long double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); + refmap[string("u01fixedpt Threefry4x64 float")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); + refmap[string("u01fixedpt Threefry4x64 double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); + refmap[string("u01fixedpt Threefry4x64 long double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); + return refmap; +} + void run_ut_uniform(){ - auto refmap = get_refmap(); + auto refmap = get_ut_uniform_refmap(); int nfail = 0; // 18 tests: 3 functions (u01, uneg11, u01fixedpt) // x 2 input sizes (32 bit or 64 bit) @@ -672,12 +670,110 @@ void run_ut_uniform(){ return; } -class TestRandom123KnownAnswers : public ::testing::Test { }; +// MARK: my tests + Googletest -TEST_F(TestRandom123KnownAnswers, base_generators) { - run_base_rng_kat(); +class TestRandom123 : public ::testing::Test { + + protected: + + static void test_incr() { + using RNG = r123::Philox4x32; + RandBLAS::RNGState s(0); + // The "counter" array of s is a 4*32=128 bit unsigned integer. + // + // Each block is interpreted in the usual way (i.e., no need to consider differences + // between big-endian and little-endian representations). + // + // Looking across blocks, we read as as a little-endian number in base IMAX = 2^32 - 1. + // That is, if we initialize s.counter = {0,0,0,0} and then call s.counter.incr(IMAX), + // we should have s.counter = {IMAX, 0, 0, 0}, and if we make another call + // s.counter.incr(9), then we should see s.counter = {8, 1, 0, 0}. Put another way, + // if c = s.counter, then we have + // + // (128-bit integer) c == c[0] + 2^{32}*c[1] + 2^{64}*c[2] + 2^{96}*c[3] (mod 2^128 - 1) + // + // where 0 <= c[i] <= IMAX + // + uint64_t i32max = std::numeric_limits::max(); + auto c = s.counter; + ASSERT_EQ(c[0], 0); + ASSERT_EQ(c[1], 0); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + + c.incr(i32max); + ASSERT_EQ(c[0], i32max); + ASSERT_EQ(c[1], 0); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + + c.incr(1); + ASSERT_EQ(c[0], 0); + ASSERT_EQ(c[1], 1); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + + c.incr(3); + ASSERT_EQ(c[0], 3); + ASSERT_EQ(c[1], 1); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + + uint64_t two32 = ((uint64_t) 1) << 32; + + c = {0,0,0,0}; + c.incr(two32-1); + ASSERT_EQ(c[0], i32max); + ASSERT_EQ(c[1], 0); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + + c = {0,0,0,0}; + c.incr(two32); + ASSERT_EQ(c[0], 0); + ASSERT_EQ(c[1], 1); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + + // Let's construct 2^32 * (2^32 - 1), which is equal to (ctr_type) {0, (uint32_t) i32max, 0, 0}. + // + // Do this using the identity + // 2^32 * (2^32 - 1) == 2^64 - 2^32 + // == 2^63 + 2^63 - 2^32. 
+ // + // Then construct 2^64, using 2^64 = (2^63) + (2^63 - 2^32) + (2^32) + uint64_t two63 = ((uint64_t) 1) << 63; + c = {0,0,0,0}; + c.incr(two63); + c.incr(two63 - two32); + ASSERT_EQ(c[0], 0); + ASSERT_EQ(c[1], i32max); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + c.incr(two32); + ASSERT_EQ(c[0], 0); + ASSERT_EQ(c[1], 0); + ASSERT_EQ(c[2], 1); + ASSERT_EQ(c[3], 0); + + c = {(uint32_t) i32max, (uint32_t) i32max, (uint32_t) i32max, 0}; + c.incr(1); + ASSERT_EQ(c[0], 0); + ASSERT_EQ(c[1], 0); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 1); + return; + } +}; + +TEST_F(TestRandom123, base_generators) { + run_all_base_rng_kats(); } -TEST_F(TestRandom123KnownAnswers, uniform_histograms) { +TEST_F(TestRandom123, uniform_histograms) { run_ut_uniform(); } + +TEST_F(TestRandom123, big_incr) { + test_incr(); +} \ No newline at end of file diff --git a/test/test_datastructures/test_denseskop.cc b/test/test_datastructures/test_denseskop.cc index c7ae4544..884243be 100644 --- a/test/test_datastructures/test_denseskop.cc +++ b/test/test_datastructures/test_denseskop.cc @@ -116,7 +116,7 @@ class TestDenseMoments : public ::testing::Test { // Construct the sketching operator RandBLAS::DenseDist D(n_rows, n_cols, dn); auto state = RandBLAS::RNGState(key); - auto [layout, next_state] = RandBLAS::fill_dense(D, A.data(), state); + auto next_state = RandBLAS::fill_dense(D, A.data(), state); // Compute the entrywise empirical mean and standard deviation. T mean = std::accumulate(A.data(), A.data() + size, 0.0) /size; @@ -149,7 +149,7 @@ TEST_F(TestDenseMoments, Gaussian) TEST_F(TestDenseMoments, Uniform) { auto dn = RandBLAS::DenseDistName::Uniform; - double expect_stddev = 1.0 / sqrt(3.0); + double expect_stddev = 1.0; for (uint32_t key : {0, 1, 2}) { test_mean_stddev(key, 500, 500, dn, (float) expect_stddev); @@ -182,7 +182,6 @@ class TestSubmatGeneration : public ::testing::Test T* smat = new T[n_srows * n_scols]; fill_dense_rmat_trunc(mat, n_rows, n_cols, seed); int ind = 0; // used for indexing smat when comparing to rmat - T total_error = 0; for (int nptr = ptr; nptr < n_cols*(n_rows-n_srows-1); nptr += stride*n_cols) { // ^ Loop through various pointer locations.- goes down the random matrix by amount stride. RandBLAS::dense::fill_dense_submat_impl(n_cols, smat, n_srows, n_scols, nptr, seed); @@ -190,14 +189,13 @@ class TestSubmatGeneration : public ::testing::Test for (int i = 0; i @@ -214,7 +212,6 @@ class TestSubmatGeneration : public ::testing::Test T* smat = new T[n_srows * n_scols]; fill_dense_rmat_trunc(mat, n_rows, n_cols, seed); int ind = 0; // variable used for indexing smat when comparing to rmat - T total_error = 0; for (int nptr = ptr; nptr < (n_cols - n_scols - 1); nptr += stride) { // ^ Loop through various pointer locations.- goes across the random matrix by amount stride. 
RandBLAS::dense::fill_dense_submat_impl(n_cols, smat, n_srows, n_scols, nptr, seed); @@ -222,14 +219,13 @@ class TestSubmatGeneration : public ::testing::Test for (int i = 0; i @@ -242,7 +238,6 @@ class TestSubmatGeneration : public ::testing::Test T* smat = new T[n_rows * n_cols]{}; fill_dense_rmat_trunc(mat, n_rows, n_cols, seed); int ind = 0; - T total_error = 0; int64_t n_scols = 1; int64_t n_srows = 1; for (int ptr = 0; ptr + n_scols + n_cols*n_srows < n_cols*n_rows; ptr += n_rows+1) { // Loop through the diagonal of the matrix @@ -251,7 +246,7 @@ class TestSubmatGeneration : public ::testing::Test ind = 0; for (int i = 0; i(3, 5, RandBLAS::MajorAxis::Long); } -TEST_F(TestFillAxis, short_axis_3x5) { +TEST_F(TestFillAxis, autotranspose_short_axis_3x5) { auto_transpose(3, 5, RandBLAS::MajorAxis::Short); } -TEST_F(TestFillAxis, long_axis_4x8) { +TEST_F(TestFillAxis, autotranspose_long_axis_4x8) { auto_transpose(4, 8, RandBLAS::MajorAxis::Long); } -TEST_F(TestFillAxis, short_axis_4x8) { +TEST_F(TestFillAxis, autotranspose_short_axis_4x8) { auto_transpose(4, 8, RandBLAS::MajorAxis::Short); } -TEST_F(TestFillAxis, long_axis_2x4) { +TEST_F(TestFillAxis, autotranspose_long_axis_2x4) { auto_transpose(2, 4, RandBLAS::MajorAxis::Long); } -TEST_F(TestFillAxis, short_axis_2x4) { +TEST_F(TestFillAxis, autotranspose_short_axis_2x4) { auto_transpose(2, 4, RandBLAS::MajorAxis::Short); } - -class TestStateUpdate : public ::testing::Test +class TestDenseSkOpStates : public ::testing::Test { protected: - virtual void SetUp(){}; - - virtual void TearDown(){}; - - template - static void test_var_mat_gen( - uint32_t key, - int64_t n_rows, - int64_t n_cols, - RandBLAS::DenseDistName dn - ) { - // Allocate workspace - int64_t size = n_rows * n_cols; - std::vector A(size, 0.0); - std::vector B(size, 0.0); - - // Construct the sketching operator - RandBLAS::DenseDist D(n_rows, n_cols, dn); - - auto state = RandBLAS::RNGState(key); - auto [layout, next_state] = RandBLAS::fill_dense(D, A.data(), state); - RandBLAS::fill_dense(D, B.data(), next_state); - - ASSERT_TRUE(!(A == B)); - } - template - static void test_identity( + static void test_concatenate_along_columns( uint32_t key, int64_t n_rows, int64_t n_cols, RandBLAS::DenseDistName dn ) { - // Allocate workspace + randblas_require(n_rows > n_cols); + RandBLAS::DenseDist D1( n_rows, n_cols/2, dn, RandBLAS::MajorAxis::Long); + RandBLAS::DenseDist D2( n_rows, n_cols - n_cols/2, dn, RandBLAS::MajorAxis::Long); + RandBLAS::DenseDist Dfull( n_rows, n_cols, dn, RandBLAS::MajorAxis::Long); + RandBLAS::RNGState state(key); int64_t size = n_rows * n_cols; - std::vector A(size, 0.0); - std::vector B(size, 0.0); - - double total = 0.0; - - // Construct the sketching operator - RandBLAS::DenseDist D1( - n_rows, - n_cols / 2, - dn - ); - - RandBLAS::DenseDist D3( - n_rows, - n_cols - n_cols / 2, - dn - ); - - RandBLAS::DenseDist D2( - n_rows, - n_cols, - dn - ); - - auto state = RandBLAS::RNGState(key); - auto state1 = RandBLAS::RNGState(key); // Concatenates two matrices generated from state and next_state - auto [layout, next_state] = RandBLAS::fill_dense(D1, A.data(), state); - RandBLAS::fill_dense(D3, A.data() + (int64_t) (D1.n_rows * D1.n_cols), next_state); - - RandBLAS::fill_dense(D2, B.data(), state1); + std::vector A(size, 0.0); + RandBLAS::DenseSkOp S1(D1, state); + RandBLAS::DenseSkOp S2(D2, S1.next_state); + RandBLAS::fill_dense(S1); + RandBLAS::fill_dense(S2); + int64_t size_d1 = n_rows * D1.n_cols; + blas::copy(size_d1, S1.buff, 1, A.data(), 1); + int64_t 
size_d2 = n_rows * D2.n_cols; + blas::copy(size_d2, S2.buff, 1, A.data() + size_d1, 1); + + RandBLAS::DenseSkOp S_concat(Dfull, state); + RandBLAS::fill_dense(S_concat); for (int i = 0; i < size; i++) { - total += abs(A[i] - B[i]); + ASSERT_EQ(A[i], S_concat.buff[i]); } - - ASSERT_TRUE(total == 0.0); } template - static void test_finalstate( + static void test_compute_next_state( uint32_t key, int64_t n_rows, int64_t n_cols, RandBLAS::DenseDistName dn ) { - int total = 0; - int *buff = new int[n_rows*n_cols]; - auto state = RandBLAS::RNGState(key); - auto state_copy = RandBLAS::RNGState(key); + float *buff = new float[n_rows*n_cols]; + RandBLAS::RNGState state(key); RandBLAS::DenseDist D(n_rows, n_cols, dn); - typename RNG::ctr_type c_ref = state_copy.counter; - - auto [layout, final_state] = RandBLAS::fill_dense(D, buff, state); - auto c = final_state.counter; - int c_len = c.size(); + auto actual_final_state = RandBLAS::fill_dense(D, buff, state); + auto actual_c = actual_final_state.counter; - int64_t pad = 0; //Pad computed such that n_cols+pad is divisible by RNG::static_size - if (n_rows % RNG::ctr_type::static_size != 0) { - pad = RNG::ctr_type::static_size - n_rows % RNG::ctr_type::static_size; - } - int64_t last_ptr = n_rows-1 + (n_cols-1)*n_rows; - int64_t last_ptr_padded = last_ptr + last_ptr/n_rows * pad; - c_ref.incr(last_ptr_padded / RNG::ctr_type::static_size + 1); + auto expect_final_state = RandBLAS::dense::compute_next_state(D, state); + auto expect_c = expect_final_state.counter; - for (int i = 0; i < c_len; i++) { - total += c[i] - c_ref[i]; + for (int i = 0; i < RNG::ctr_type::static_size; i++) { + ASSERT_EQ(actual_c[i], expect_c[i]); } - ASSERT_TRUE(total == 0); delete [] buff; } }; -// For small matrix sizes, mean and stddev are not very close to desired vals. 
-TEST_F(TestStateUpdate, Gaussian_var_gen) -{ - for (uint32_t key : {0, 1, 2}) { - auto dn = RandBLAS::DenseDistName::Gaussian; - test_var_mat_gen(key, 100, 50, dn); - } -} - -TEST_F(TestStateUpdate, Gaussian_identity) -{ +TEST_F(TestDenseSkOpStates, concat_tall_with_long_major_axis) { for (uint32_t key : {0, 1, 2}) { auto dn = RandBLAS::DenseDistName::Gaussian; - test_identity(key, 13, 7, dn); - test_identity(key, 80, 40, dn); - test_identity(key, 83, 41, dn); - test_identity(key, 91, 43, dn); - test_identity(key, 97, 47, dn); + test_concatenate_along_columns(key, 13, 7, dn); + test_concatenate_along_columns(key, 80, 40, dn); + test_concatenate_along_columns(key, 83, 41, dn); + test_concatenate_along_columns(key, 91, 43, dn); + test_concatenate_along_columns(key, 97, 47, dn); } } -TEST_F(TestStateUpdate, Final_State) -{ +TEST_F(TestDenseSkOpStates, compare_skopless_fill_dense_to_compute_next_state) { for (uint32_t key : {0, 1, 2}) { auto dn = RandBLAS::DenseDistName::Gaussian; - test_finalstate(key, 13, 7, dn); - test_finalstate(key, 11, 5, dn); - test_finalstate(key, 131, 71, dn); - test_finalstate(key, 80, 40, dn); - test_finalstate(key, 91, 43, dn); + test_compute_next_state(key, 13, 7, dn); + test_compute_next_state(key, 11, 5, dn); + test_compute_next_state(key, 131, 71, dn); + test_compute_next_state(key, 80, 40, dn); + test_compute_next_state(key, 91, 43, dn); } } diff --git a/test/test_datastructures/test_sparseskop.cc b/test/test_datastructures/test_sparseskop.cc index 9843cb51..7709de52 100644 --- a/test/test_datastructures/test_sparseskop.cc +++ b/test/test_datastructures/test_sparseskop.cc @@ -77,7 +77,7 @@ class TestSparseSkOpConstruction : public ::testing::Test template void proper_saso_construction(int64_t d, int64_t m, int64_t key_index, int64_t nnz_index) { - using RNG = RandBLAS::SparseSkOp::RNG_t; + using RNG = RandBLAS::SparseSkOp::state_t::generator; RandBLAS::SparseSkOp S0( {d, m, vec_nnzs[nnz_index], RandBLAS::MajorAxis::Short}, keys[key_index] ); @@ -91,7 +91,7 @@ class TestSparseSkOpConstruction : public ::testing::Test template void proper_laso_construction(int64_t d, int64_t m, int64_t key_index, int64_t nnz_index) { - using RNG = RandBLAS::SparseSkOp::RNG_t; + using RNG = RandBLAS::SparseSkOp::state_t::generator; RandBLAS::SparseSkOp S0( {d, m, vec_nnzs[nnz_index], RandBLAS::MajorAxis::Long}, keys[key_index] ); diff --git a/test/test_datastructures/test_spmats/common.hh b/test/test_datastructures/test_spmats/common.hh index 8a7ffa37..c262822e 100644 --- a/test/test_datastructures/test_spmats/common.hh +++ b/test/test_datastructures/test_spmats/common.hh @@ -57,7 +57,7 @@ void iid_sparsify_random_dense( ) { auto spar = new T[n_rows * n_cols]; auto dist = RandBLAS::DenseDist(n_rows, n_cols, RandBLAS::DenseDistName::Uniform); - auto [unused, next_state] = RandBLAS::fill_dense(dist, spar, state); + auto next_state = RandBLAS::fill_dense(dist, spar, state); auto temp = new T[n_rows * n_cols]; auto D_mat = RandBLAS::DenseDist(n_rows, n_cols, RandBLAS::DenseDistName::Uniform); diff --git a/test/test_handrolled_lapack.cc b/test/test_handrolled_lapack.cc new file mode 100644 index 00000000..3e1aa2eb --- /dev/null +++ b/test/test_handrolled_lapack.cc @@ -0,0 +1,357 @@ + + +#include +#include + +#include "RandBLAS/util.hh" +#include "handrolled_lapack.hh" +#include "test_basic_rng/rng_common.hh" +#include "comparison.hh" +#include "RandBLAS/config.h" +#include "RandBLAS/base.hh" +#include "RandBLAS/util.hh" +#include "RandBLAS/dense_skops.hh" +using RandBLAS::DenseDist; 
+using RandBLAS::DenseDistName; +using RandBLAS::RNGState; + +#include +#include +#include +#include + +// MARK: Cholesky + +class TestHandrolledCholesky : public ::testing::Test { + protected: + + template + void run_factor_gram_matrix(int n, FUNC &cholfunc, uint32_t key) { + auto layout = blas::Layout::ColMajor; + int64_t m = 2*n; + DenseDist D(m, n); + std::vector A(n*n); + std::vector B(m*n); + T iso_scale = std::pow(RandBLAS::isometry_scale_factor(D), 2); + RNGState state(key); + RandBLAS::fill_dense(D, B.data(), state); + std::vector C(B); + + // define positive definite A + blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, n, m, iso_scale, B.data(), m, 0.0, A.data(), n); + RandBLAS::util::symmetrize(layout, blas::Uplo::Upper, n, A.data(), n); + // overwrite A by its upper-triangular cholesky factor + cholfunc(n, A.data()); + RandBLAS::util::overwrite_triangle(layout, blas::Uplo::Lower, n, 1, (T) 0.0, A.data(), n); + + // compute the gram matrix of A's cholesky factor + blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, n, n, 1.0, A.data(), n, 0.0, B.data(), n); + RandBLAS::util::symmetrize(layout, blas::Uplo::Upper, n, B.data(), n); + // recompute A + blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, n, m, iso_scale, C.data(), m, 0.0, A.data(), n); + RandBLAS::util::symmetrize(layout, blas::Uplo::Upper, n, A.data(), n); + + test::comparison::matrices_approx_equal(layout, blas::Op::NoTrans, n, n, B.data(), n, A.data(), n, + __PRETTY_FUNCTION__, __FILE__, __LINE__ + ); + } + + template + void run_sequential_factor_gram_matrix(int n, uint32_t key) { + auto cholfunc = [](int64_t n, T* A) { hr_lapack::potrf_upper_sequential(n, A, n); }; + run_factor_gram_matrix(n, cholfunc, key); + } + + template + void run_blocked_factor_gram_matrix(int n, int b, uint32_t key) { + auto cholfunc = [b](int64_t n, T* A) { hr_lapack::potrf_upper(n, A, n, b); }; + run_factor_gram_matrix(n, cholfunc, key); + } + + template + void run_blocked_factor_diagonal(int n, int b) { + auto layout = blas::Layout::ColMajor; + std::vector A(n*n, 0.0); + std::vector B(n*n); + for (int i = 0; i < n; ++i) { + A[i+i*n] = 1.0/((T) i + 1.0); + B[i+i*n] = std::sqrt(A[i+i*n]); + } + hr_lapack::potrf_upper(n, A.data(), n, b); + test::comparison::matrices_approx_equal(layout, blas::Op::NoTrans, n, n, B.data(), n, A.data(), n, + __PRETTY_FUNCTION__, __FILE__, __LINE__ + ); + } + +}; + +TEST_F(TestHandrolledCholesky, sequential_random_gram_matrix) { + for (int i = 1; i <= 64; ++i) { + run_sequential_factor_gram_matrix(i, 0); + } + for (int i = 1; i <= 64; ++i) { + run_sequential_factor_gram_matrix(i, 1); + } +} + +TEST_F(TestHandrolledCholesky, blocked_diagonal) { + for (int i = 2; i <= 64; i+=2) { + run_blocked_factor_diagonal(i, 2); + } + for (int i = 2; i <= 64; i+=2) { + run_blocked_factor_diagonal(i, 3); + } + for (int i = 2; i <= 256; i*=2) { + run_blocked_factor_diagonal(i, i/2); + } + for (int i = 16; i <= 256; i*=2) { + run_blocked_factor_diagonal(i, i-7); + } + for (int i = 16; i <= 256; i*=2) { + run_blocked_factor_diagonal(i-3, i/4); + } +} + +TEST_F(TestHandrolledCholesky, blocked_random_gram_matrix) { + for (int i = 2; i <= 16; i+=2) { + run_blocked_factor_gram_matrix(i, 1, 0); + } + for (int i = 2; i <= 64; i+=2) { + run_blocked_factor_gram_matrix(i, 2, 0); + } + for (int i = 2; i <= 96; i*=2) { + run_blocked_factor_gram_matrix(i, i/2, 0); + } + for (int i = 16; i <= 96; i*=2) { + run_blocked_factor_gram_matrix(i, i-5, 0); + } + for (int i = 16; i <= 96; i*=2) { + run_blocked_factor_gram_matrix(i-5, 7, 
0); + } +} + + + +template +void verify_product(int64_t m, int64_t n, int64_t k, const T* A, const T* B, const T* C) { + using std::vector; + int64_t lda = m; + int64_t ldb = k; + int64_t ldc = m; + vector C_work(m*n, 0.0); + blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::NoTrans, m, n, k, (T) 1.0, A, lda, B, ldb, (T) 0.0, C_work.data(), ldc); + + vector A_copy(m*k, 0.0); + vector B_copy(k*n, 0.0); + blas::copy(m*k, A, 1, A_copy.data(), 1); + blas::copy(k*n, B, 1, B_copy.data(), 1); + std::for_each(A_copy.begin(), A_copy.end(), [](T &val) {val = std::abs(val); }); + std::for_each(B_copy.begin(), B_copy.end(), [](T &val) {val = std::abs(val); }); + vector E(m*n, 0.0); + T err_alpha = 2 * k * std::numeric_limits::epsilon(); + blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::NoTrans, m, n, k, err_alpha, A_copy.data(), lda, B_copy.data(), ldb, (T)0.0, E.data(), m); + test::comparison::buffs_approx_equal(C_work.data(), C, E.data(), m*n, __PRETTY_FUNCTION__, __FILE__, __LINE__); + return; +} + +template +void verify_orthonormal_componentwise(int m, int n, T* Q, T max_cond = 10) { + std::vector I(n*n, 0.0); + for (int i = 0; i < n; ++i) + I[i + i*n] = 1.0; + std::vector QtQ(n*n, 0.0); + T tol = max_cond * std::sqrt(m) * std::numeric_limits::epsilon(); + blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans, n, n, m, (T) 1.0, Q, m, Q, m, (T) 0.0, QtQ.data(), n); + test::comparison::matrices_approx_equal(blas::Layout::ColMajor, blas::Op::NoTrans, n, n, QtQ.data(), n, I.data(), n, + __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol + ); + return; +} + +// MARK: QR + +class TestHandrolledQR : public ::testing::Test { + protected: + + template + void run_cholqr_gaussian(int m, int n, int b, uint32_t key) { + DenseDist D(m, n, DenseDistName::Gaussian); + std::vector A(m*n); + T iso_scale = RandBLAS::isometry_scale_factor(D); + RNGState state(key); + RandBLAS::fill_dense(D, A.data(), state); + blas::scal(m*n, iso_scale, A.data(), 1); + + std::vector Q(A); + std::vector R(2*n*n); + hr_lapack::chol_qr(m, n, Q.data(), R.data(), b, true); + verify_orthonormal_componentwise(m, n, Q.data()); + verify_product(m, n, n, Q.data(), R.data(), A.data()); + } + + template + void run_qr_blocked_cgs(int m, int n, int b, uint32_t key) { + DenseDist D(m, n, DenseDistName::Gaussian); + std::vector A(m*n); + T iso_scale = RandBLAS::isometry_scale_factor(D); + RNGState state(key); + RandBLAS::fill_dense(D, A.data(), state); + blas::scal(m*n, iso_scale, A.data(), 1); + + std::vector Q(A); + std::vector R(n*n); + std::vector work{}; + hr_lapack::qr_block_cgs2(m, n, Q.data(), R.data(), work, b); + verify_orthonormal_componentwise(m, n, Q.data()); + verify_product(m, n, n, Q.data(), R.data(), A.data()); + } + +}; + +TEST_F(TestHandrolledQR, cholqr_small) { + for (uint32_t key = 111; key < 112; ++key) { + run_cholqr_gaussian(10, 1, 5, key); + run_cholqr_gaussian(2, 2, 5, key); + run_cholqr_gaussian(10, 6, 5, key); + + run_cholqr_gaussian(10, 1, 2, key); + run_cholqr_gaussian(10, 2, 2, key); + run_cholqr_gaussian(10, 6, 2, key); + } +} + +TEST_F(TestHandrolledQR, cholqr_medium) { + for (uint32_t key = 111; key < 112; ++key) { + run_cholqr_gaussian(1000, 100, 5, key); + run_cholqr_gaussian(1024, 128, 64, key); + } +} + +TEST_F(TestHandrolledQR, blocked_cgs_small) { + for (uint32_t key = 111; key < 112; ++key) { + run_qr_blocked_cgs(10, 5, 2, key); + run_qr_blocked_cgs(10, 7, 2, key); + run_qr_blocked_cgs(10, 10, 2, key); + } +} + +TEST_F(TestHandrolledQR, blocked_cgs_medium) { + for 
(uint32_t key = 111; key < 112; ++key) { + run_qr_blocked_cgs(1000, 1000, 64, key); + run_qr_blocked_cgs(1024, 1024, 64, key); + } +} + +// MARK: Eigenvalues + +template +std::vector posdef_with_random_eigvecs(std::vector &eigvals, uint32_t key) { + int64_t n = eigvals.size(); + for (auto ev : eigvals) + randblas_require(ev > 0); + std::vector work0(n*n, 0.0); + T* work0_buff = work0.data(); + DenseDist distn(n, n, DenseDistName::Gaussian); + RNGState state(key); + RandBLAS::fill_dense(distn, work0_buff, state); + std::vector work1(n*n, 0.0); + std::vector work2{}; + hr_lapack::qr_block_cgs2(n, n, work0_buff, work1.data(), work2); + for (int i = 0; i < n; ++i) + blas::scal(n, std::sqrt(eigvals[i]), work0_buff + i*n, 1); + std::vector out(n*n, 0.0); + blas::syrk(blas::Layout::ColMajor, blas::Uplo::Upper, blas::Op::NoTrans, n, n, (T)1.0, work0_buff, n, (T)0.0, out.data(), n); + RandBLAS::util::symmetrize(blas::Layout::ColMajor, blas::Uplo::Upper, n, out.data(), n); + return out; +} + +class TestHandrolledEigvals : public ::testing::Test { + protected: + + template + void run_diag(int n, int b) { + std::vector A(n*n, 0.0); + std::vector eigvals_expect(n); + for (int i = 0; i < n; ++i) { + eigvals_expect[i] = 1.0 / (1 + (T)i); + A[i + i*n] = eigvals_expect[i]; + } + std::vector eigvals_actual(n); + int64_t iters = 1; + T tol = 1e-3; + auto iter = hr_lapack::posdef_eig_chol_iteration(n, A.data(), eigvals_actual.data(), tol, iters, b); + ASSERT_EQ(iter, 0); + test::comparison::buffs_approx_equal(eigvals_actual.data(), eigvals_expect.data(), n, __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); + return; + } + + template + void run_general_posdef(int n, int b, uint32_t key) { + std::vector eigvals_expect(n); + for (int i = 0; i < n; ++i) { + eigvals_expect[i] = 2.0 / (1 + (T)i); + } + auto A = posdef_with_random_eigvecs(eigvals_expect, key); + std::vector eigvals_actual(n); + int64_t iters = 1000; + T tol = 1e-3; + auto iter = hr_lapack::posdef_eig_chol_iteration(n, A.data(), eigvals_actual.data(), tol, iters, b); + std::cout << "Number of iterations : " << iter << std::endl; + T min_eig_actual = *std::min_element(eigvals_actual.begin(), eigvals_actual.end()); + T max_eig_actual = *std::max_element(eigvals_actual.begin(), eigvals_actual.end()); + std::cout << "min_comp / min_actual " << min_eig_actual / eigvals_expect[n-1] << std::endl; + std::cout << "max_comp / max_actual " << max_eig_actual / eigvals_expect[0] << std::endl; + test::comparison::approx_equal(min_eig_actual, eigvals_expect[n-1], __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); + test::comparison::approx_equal(max_eig_actual, eigvals_expect[0 ], __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); + return; + } + + template + void run_power_general_posdef(int n, uint32_t key) { + std::vector eigvals_expect(n); + for (int i = 0; i < n; ++i) { + eigvals_expect[i] = 2.0 / std::sqrt((1 + (T)i)); + } + auto A = posdef_with_random_eigvecs(eigvals_expect, key); + + std::vector ourwork(2*n, 0.0); + std::vector subwork{}; + T tol = 1e-3; + RNGState state(key + 1); + T p_fail = (n < 500) ? (T) 1e-6 : (T) 0.5; + // ^ This affects the number of iterations that will be used in the power method. That number is + // max{#iterations to succeed in expectation, #iterations to succeed with some probability}. + // Large values of p_fail (like p_fail = 0.5) just say "succeed in expectation." 
+ auto [lambda_max, lambda_min, ignore] = hr_lapack::exeigs_powermethod(n, A.data(), ourwork.data(), tol, p_fail, state, subwork); + + std::cout << "min_comp / min_actual = " << lambda_min / eigvals_expect[n-1] << std::endl; + std::cout << "max_comp / max_actual = " << lambda_max / eigvals_expect[0] << std::endl; + test::comparison::approx_equal(lambda_min, eigvals_expect[n-1], __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); + test::comparison::approx_equal(lambda_max, eigvals_expect[0 ], __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); + return; + } +}; + +TEST_F(TestHandrolledEigvals, power_smallish) { + run_power_general_posdef(10, 0); + run_power_general_posdef(50, 0); + run_power_general_posdef(100, 0); +} + +TEST_F(TestHandrolledEigvals, diag) { + run_diag(5, 1); + run_diag(10, 1); + run_diag(100, 1); +} + +TEST_F(TestHandrolledEigvals, power_medium) { + run_power_general_posdef( 512, 0 ); + run_power_general_posdef( 512, 1 ); +} + +TEST_F(TestHandrolledEigvals, power_largeish) { + run_power_general_posdef( 1024, 0 ); + run_power_general_posdef( 1024, 1 ); +} + + diff --git a/test/test_matmul_cores/linop_common.hh b/test/test_matmul_cores/linop_common.hh index e07c1888..82def271 100644 --- a/test/test_matmul_cores/linop_common.hh +++ b/test/test_matmul_cores/linop_common.hh @@ -72,8 +72,8 @@ template auto random_matrix(int64_t m, int64_t n, RNGState s) { std::vector A(m * n); DenseDist DA(m, n); - auto [layout, next_state] = RandBLAS::fill_dense(DA, A.data(), s); - std::tuple, Layout, RNGState> t{A, layout, next_state}; + auto next_state = RandBLAS::fill_dense(DA, A.data(), s); + std::tuple, Layout, RNGState> t{A, RandBLAS::dist_to_layout(DA), next_state}; return t; } diff --git a/test/test_matmul_cores/test_lskge3.cc b/test/test_matmul_cores/test_lskge3.cc index 921dad05..8384ad21 100644 --- a/test/test_matmul_cores/test_lskge3.cc +++ b/test/test_matmul_cores/test_lskge3.cc @@ -51,7 +51,7 @@ class TestLSKGE3 : public ::testing::Test blas::Layout layout ) { DenseDist D(d, m); - DenseSkOp S0(D, seed, nullptr); + DenseSkOp S0(D, seed); if (preallocate) RandBLAS::fill_dense(S0); test_left_apply_submatrix_to_eye(1.0, S0, d, m, 0, 0, layout, 0.0); @@ -65,7 +65,7 @@ class TestLSKGE3 : public ::testing::Test blas::Layout layout ) { DenseDist Dt(m, d); - DenseSkOp S0(Dt, seed, nullptr); + DenseSkOp S0(Dt, seed); RandBLAS::fill_dense(S0); test_left_apply_transpose_to_eye(S0, layout); } @@ -103,7 +103,7 @@ class TestLSKGE3 : public ::testing::Test randblas_require(m0 > m); randblas_require(n0 > n); DenseDist D(d, m); - DenseSkOp S0(D, seed_S0, nullptr); + DenseSkOp S0(D, seed_S0); test_left_apply_to_submatrix(S0, n, m0, n0, A_ro, A_co, layout); } diff --git a/test/test_matmul_cores/test_rskge3.cc b/test/test_matmul_cores/test_rskge3.cc index 2bd8f949..118a0276 100644 --- a/test/test_matmul_cores/test_rskge3.cc +++ b/test/test_matmul_cores/test_rskge3.cc @@ -52,7 +52,7 @@ class TestRSKGE3 : public ::testing::Test Layout layout ) { DenseDist D(m, d); - DenseSkOp S0(D, seed, nullptr); + DenseSkOp S0(D, seed); if (preallocate) RandBLAS::fill_dense(S0); test_right_apply_submatrix_to_eye(1.0, S0, m, d, 0, 0, layout, 0.0, 0); @@ -66,7 +66,7 @@ class TestRSKGE3 : public ::testing::Test Layout layout ) { DenseDist Dt(d, m); - DenseSkOp S0(Dt, seed, nullptr); + DenseSkOp S0(Dt, seed); test_right_apply_tranpose_to_eye(S0, layout); } @@ -99,7 +99,7 @@ class TestRSKGE3 : public ::testing::Test Layout layout ) { DenseDist D(n, d); - DenseSkOp S0(D, seed_S0, nullptr); + DenseSkOp S0(D, seed_S0); 
test_right_apply_to_submatrix(S0, m, m0, n0, A_ro, A_co, layout); } diff --git a/test/test_matmul_wrappers/test_sketch_symmetric.cc b/test/test_matmul_wrappers/test_sketch_symmetric.cc index 0b48fc01..47594f2a 100644 --- a/test/test_matmul_wrappers/test_sketch_symmetric.cc +++ b/test/test_matmul_wrappers/test_sketch_symmetric.cc @@ -53,8 +53,8 @@ void random_symmetric_mat(int64_t n, T* A, int64_t lda, STATE s) { // This function can be interpreted as first generating a random lda-by-lda symmetric matrix // whose entries in the upper triangle are iid, then symmetrizing that matrix, then // zeroing out all entries outside the leading principal submatrix of order n. - RandBLAS::fill_dense({lda, lda}, n, n, 0, 0, A, s); - RandBLAS::util::symmetrize(Layout::ColMajor, Uplo::Upper, A, n, lda); + RandBLAS::fill_dense(Layout::ColMajor, {lda, lda}, n, n, 0, 0, A, s); + RandBLAS::util::symmetrize(Layout::ColMajor, Uplo::Upper, n, A, lda); return; }