From 81931a91ab2b35f93f64837d6f1c682779b5a7ab Mon Sep 17 00:00:00 2001
From: Riley Murray
Date: Fri, 26 Jul 2024 09:01:31 -0400
Subject: [PATCH 01/37] explicitly remove support for CBRNGs based on AES or ARS. Refactor of test_r123_kat.cc to be readable.

---
 RandBLAS/base.hh                     |   3 +-
 RandBLAS/random_gen.hh               |   3 +-
 test/DevNotes.md                     |  17 +-
 test/test_basic_rng/kat_vectors.txt  |   2 +-
 test/test_basic_rng/r123_rngNxW.mm   |   4 -
 test/test_basic_rng/test_r123_kat.cc | 492 +++++++++++++--------------
 6 files changed, 256 insertions(+), 265 deletions(-)

diff --git a/RandBLAS/base.hh b/RandBLAS/base.hh
index a2382e80..e372d002 100644
--- a/RandBLAS/base.hh
+++ b/RandBLAS/base.hh
@@ -135,8 +135,7 @@ enum class MajorAxis : char {
  * (typically of length 2 or 4), and can be distinct from one another.
  *
  * @tparam RNG A CBRNG type defined in Random123. We've found that Philox-based
- * CBRNGs work best for our purposes. Strictly speaking, we allow all Random123 CBRNGs
- * besides those based on AES.
+ * CBRNGs work best for our purposes, but we also support Threefry-based CBRNGs.
  */
 template
 struct RNGState
diff --git a/RandBLAS/random_gen.hh b/RandBLAS/random_gen.hh
index d7dcf127..714da89c 100644
--- a/RandBLAS/random_gen.hh
+++ b/RandBLAS/random_gen.hh
@@ -73,8 +73,7 @@ static inline void sincospi(double x, double *s, double *c) {
 #include
 #include
 #include
-#include
-// NOTE: we do not support Random123's AES generator.
+// NOTE: we do not support Random123's AES or ARS generators.
 #include
 #include
diff --git a/test/DevNotes.md b/test/DevNotes.md
index 6a2a1753..ed1ff934 100644
--- a/test/DevNotes.md
+++ b/test/DevNotes.md
@@ -4,7 +4,7 @@ This document doesn't defend previous design decisions. It just
 explains how things work right now. That's easier for me (Riley) to write, and it's more useful to others.
-(Plus, a good description of what we do will make pros and cons of the current approach self-evident.)
+(Plus, it helps make the pros and cons of the current approach self-evident.)
 
 None of our testing infrastructure is considered part of the public API.
 
@@ -28,10 +28,19 @@ None of our testing infrastructure is considered part of the public API.
   I suspect that the tests for rskgex and right_spmm hit code paths that are currently untested,
   but I haven't actually verified this.
 
-### next_folder ...
+### test_basic_rng
 
-## Next topic ....
+ * test_r123_kat.cc has deterministic tests for Random123. The tests compare generated values
+   to reference values computed ahead of time. The tests are __extremely__ messy, since they're
+   adapted from tests in the official Random123 repository, and Random123 needs to handle a far wider
+   range of compilers and languages than we assume for RandBLAS.
+ * test_sample_indices.cc includes statistical tests for sampling from an index set with or without
+   replacement.
+
+ * rng_common.hh includes data for statistical tables (e.g., for Kolmogorov-Smirnov tests) and helper
+   functions to compute quantities associated with certain probability distributions (e.g., mean
+   and variance of the hypergeometric distribution).
 
 # OLD
 
@@ -65,7 +74,7 @@ Specifics:
   easy enough to have it reduce directly to GEMM, and reducing directly to GEMM
   had the advantage of improved readability.
   We don't test all possible combinations of flags (we omit when both arguments
-  are transposed) but the combinationed we leave untested are unrelated
+  are transposed) but the combinations we leave untested are unrelated
   to flow-of-control.
RSKGES reduces to right_spmm, which does indeed fall back on diff --git a/test/test_basic_rng/kat_vectors.txt b/test/test_basic_rng/kat_vectors.txt index 21c453dc..9398f32c 100644 --- a/test/test_basic_rng/kat_vectors.txt +++ b/test/test_basic_rng/kat_vectors.txt @@ -68,7 +68,7 @@ threefry4x64 13 0000000000000000 0000000000000000 0000000000000000 0000000000000 threefry4x64 13 ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff 7eaed935479722b5 90994358c429f31c 496381083e07a75b 627ed0d746821121 threefry4x64 13 243f6a8885a308d3 13198a2e03707344 a4093822299f31d0 082efa98ec4e6c89 452821e638d01377 be5466cf34e90c6c c0ac29b7c97c50dd 3f84d5b5b5470917 4361288ef9c1900c 8717291521782833 0d19db18c20cf47e a0b41d63ac8581e5 threefry4x64 20 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 09218ebde6c85537 55941f5266d86105 4bd25e16282434dc ee29ec846bd2e40b - threefry4x64 20 ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff 29c24097942bba1b 0371bbfb0f6f4e11 3c231ffa33f83a1c cd29113fde32d168 +threefry4x64 20 ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff 29c24097942bba1b 0371bbfb0f6f4e11 3c231ffa33f83a1c cd29113fde32d168 threefry4x64 20 243f6a8885a308d3 13198a2e03707344 a4093822299f31d0 082efa98ec4e6c89 452821e638d01377 be5466cf34e90c6c be5466cf34e90c6c c0ac29b7c97c50dd a7e8fde591651bd9 baafd0c30138319b 84a5c1a729e685b9 901d406ccebc1ba4 threefry4x64 72 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 94eeea8b1f2ada84 adf103313eae6670 952419a1f4b16d53 d83f13e63c9f6b11 threefry4x64 72 ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff 11518c034bc1ff4c 193f10b8bcdcc9f7 d024229cb58f20d8 563ed6e48e05183f diff --git a/test/test_basic_rng/r123_rngNxW.mm b/test/test_basic_rng/r123_rngNxW.mm index 5062522d..9112b11d 100644 --- a/test/test_basic_rng/r123_rngNxW.mm +++ b/test/test_basic_rng/r123_rngNxW.mm @@ -51,7 +51,3 @@ RNGNxW_TPL(threefry, 2, 64) RNGNxW_TPL(threefry, 4, 64) #endif -#if R123_USE_AES_NI -RNGNxW_TPL(ars, 4, 32) -RNGNxW_TPL(aesni, 4, 32) -#endif diff --git a/test/test_basic_rng/test_r123_kat.cc b/test/test_basic_rng/test_r123_kat.cc index 14e6517e..4487f732 100644 --- a/test/test_basic_rng/test_r123_kat.cc +++ b/test/test_basic_rng/test_r123_kat.cc @@ -31,10 +31,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include -#include -#include -#include -#include #include #include @@ -42,9 +38,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include -#include +#include #include #include #include @@ -52,21 +47,34 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include #include +#include +#include +#include + +#include +#include +#include +#include +#include +#include -#define LINESIZE 1024 -#if R123_USE_AES_NI - int have_aesni = haveAESNI(); -#else - int have_aesni = 0; +#define LINESIZE 1024 +#ifdef _MSC_FULL_VER +#define strtoull _strtoui64 +// ^ Needed to define the strtou32 and strtou64 functions. 
+#pragma warning (disable : 4521) +// ^ Engines have multiple copy constructors, quite legal C++, disable MSVC complaint #endif + int verbose = 0; int debug = 0; +// MARK: I/O and conversions + /* strdup may or may not be in string.h, depending on the value of the pp-symbol _XOPEN_SOURCE and other arcana. Just do it ourselves. @@ -84,9 +92,6 @@ char *ntcsdup(const char *s){ // Specifically, they strip leading whitespace, and then they stop reading as // soon as they reach a non-numeric character. (Note that the "a" in 257a3673 // counts as a numeric character if we're reading in hexadecimal format.) -#ifdef _MSC_FULL_VER -#define strtoull _strtoui64 -#endif uint32_t strtou32(const char *p, char **endp, int base){ uint32_t ret; errno = 0; @@ -108,8 +113,6 @@ void prtu(std::ostream& os, T val) { os << std::hex << std::setw(std::numeric_limits::digits / 4) << std::setfill('0') << val; assert(!os.bad()); } - -// Specializations for uint32_t and uint64_t void prtu32(std::ostream& os, uint32_t v) { prtu(os, v); } void prtu64(std::ostream& os, uint64_t v) { prtu(os, v); } @@ -136,6 +139,23 @@ do { \ fflush(fp); \ } while(0) +// MARK: code generation +// +// The code from here until the next MARK uses a compiler directive +// metaprogramming technique to define a bunch of similarly named +// identifiers. The idenfiers are ... +// +// method_e::NxW_e (enum members) +// NxW_kat (structs) +// kat_instance.NxW_data (members of type NxW_kat) +// read_NxW (functions) +// report_NxWerror (functions) +// +// This code pattern was needed when the tests had to compile as +// valid C programs. We've changed style very slightly to reflect +// our specialization to C++. +// + enum method_e{ #define RNGNxW_TPL(base, N, W) base##N##x##W##_e, #include "r123_rngNxW.mm" @@ -144,56 +164,28 @@ enum method_e{ }; #define RNGNxW_TPL(base, N, W) \ - typedef struct { \ + struct base##N##x##W##_kat { \ base##N##x##W##_ctr_t ctr; \ base##N##x##W##_ukey_t ukey; \ base##N##x##W##_ctr_t expected; \ base##N##x##W##_ctr_t computed; \ - } base##N##x##W##_kat; + }; #include "r123_rngNxW.mm" #undef RNGNxW_TPL -typedef struct{ +struct kat_instance { enum method_e method; unsigned nrounds; union{ #define RNGNxW_TPL(base, N, W) base##N##x##W##_kat base##N##x##W##_data; #include "r123_rngNxW.mm" #undef RNGNxW_TPL - /* Sigh... For those platforms that lack uint64_t, carve - out 128 bytes for the counter, key, expected, and computed. */ + // Sigh... For those platforms that lack uint64_t, carve + // out 128 bytes for the counter, key, expected, and computed. char justbytes[128]; }u; -} kat_instance; - -void host_execute_tests(kat_instance *tests, unsigned ntests); - -/* Keep track of the test vectors that we don't know how to deal with: */ -#define MAXUNKNOWNS 20 - -struct UnknownKatTracker { - int num_unknowns = 0; - const char *unknown_names[MAXUNKNOWNS]; - int unknown_counts[MAXUNKNOWNS]; }; -void register_unknown(UnknownKatTracker &ukt, const char *name){ - int i; - for(i=0; i< ukt.num_unknowns; ++i){ - if( strcmp(name, ukt.unknown_names[i]) == 0 ){ - ukt.unknown_counts[i]++; - return; - } - } - if( i >= MAXUNKNOWNS ){ - FAIL() << "Too many unknown rng types. 
Bye.\n"; - } - ukt.num_unknowns++; - ukt.unknown_names[i] = ntcsdup(name); - ukt.unknown_counts[i] = 1; -} - -/* read_NxW */ #define RNGNxW_TPL(base, N, W) \ int read_##base##N##x##W(const char *line, kat_instance* tinst){ \ size_t i; \ @@ -223,29 +215,6 @@ int read_##base##N##x##W(const char *line, kat_instance* tinst){ \ #include "r123_rngNxW.mm" #undef RNGNxW_TPL - -/* readtest: dispatch to one of the read_NxW functions */ -static int readtest(UnknownKatTracker &ukt, const char *line, kat_instance* tinst){ - int nchar; - char name[LINESIZE]; - if( line[0] == '#') return 0; - sscanf(line, "%s%n", name, &nchar); - if(!have_aesni){ - /* skip any tests that require AESNI */ - if(strncmp(name, "aes", 3)==0 || - strncmp(name, "ars", 3)==0){ - register_unknown(ukt, name); - return 0; - } - } -#define RNGNxW_TPL(base, N, W) if(strcmp(name, #base #N "x" #W) == 0) return read_##base##N##x##W(line+nchar, tinst); -#include "r123_rngNxW.mm" -#undef RNGNxW_TPL - - register_unknown(ukt, name); - return 0; -} - #define RNGNxW_TPL(base, N, W) \ void report_##base##N##x##W##error(int &nfailed, const kat_instance *ti){ \ size_t i; \ @@ -282,134 +251,67 @@ void report_##base##N##x##W##error(int &nfailed, const kat_instance *ti){ \ #include "r123_rngNxW.mm" #undef RNGNxW_TPL -// dispatch to one of the report_NxW() functions -void analyze_tests(int &nfailed, const kat_instance *tests, unsigned ntests){ - unsigned i; - char zeros[512] = {0}; - for(i=0; iu.base##N##x##W##_data.expected.v, N*W/8)==0){ \ - FAIL() << "kat expected all zeros? Something is wrong with the test harness!\n"; \ - nfailed++; \ - } \ - if (memcmp(ti->u.base##N##x##W##_data.computed.v, ti->u.base##N##x##W##_data.expected.v, N*W/8)) \ - report_##base##N##x##W##error(nfailed, ti); \ - break; -#include "r123_rngNxW.mm" -#undef RNGNxW_TPL - case last: ; - } - } -} - -#define NTESTS 1000 - -void run_base_rng_kat() { - kat_instance *tests; - unsigned t, ntests = NTESTS; - char linebuf[LINESIZE]; - FILE *inpfile; - const char *p; - const char *inname; - int nfailed = 0; - - UnknownKatTracker ukt{}; - - inname = "./kat_vectors.txt"; - inpfile = fopen(inname, "r"); - if (inpfile == NULL) - FAIL() << "Error opening input file " << inname << " for reading. Received error code " << errno << "\n"; +// MARK: Base generator test +// +// Helper functions here are modified from the original Random123 tests +// follow the "arrange, act, assert" pattern of unit testing. +// - if ((p = getenv("KATC_VERBOSE")) != NULL) - verbose = atoi(p); - - if ((p = getenv("KATC_DEBUG")) != NULL) - debug = atoi(p); +struct UnknownKatTracker { + const static int MAXUNKNOWNS = 20; + int num_unknowns = 0; + const char *unknown_names[MAXUNKNOWNS]; + int unknown_counts[MAXUNKNOWNS]; +}; - tests = (kat_instance *) malloc(sizeof(tests[0])*ntests); - if (tests == NULL) { - FAIL() << "Could not allocate " << (unsigned long) ntests << " bytes for tests\n"; - } - t = 0; - while (fgets(linebuf, sizeof linebuf, inpfile) != NULL) { - if( t == ntests ) { - ntests *= 2; - tests = (kat_instance *)realloc(tests, sizeof(tests[0])*ntests); - if (tests == NULL) { - FAIL() << "Could not grow tests to " << (unsigned long) ntests << " bytes.\n"; - } +void register_unknown(UnknownKatTracker &ukt, const char *name){ + int i; + for(i=0; i< ukt.num_unknowns; ++i){ + if( strcmp(name, ukt.unknown_names[i]) == 0 ){ + ukt.unknown_counts[i]++; + return; } - if( readtest(ukt, linebuf, &tests[t]) ) - ++t; } - if(t==ntests){ - FAIL() << "No more space for tests? 
Recompile with a larger NTESTS\n"; + if( i >= ukt.MAXUNKNOWNS ){ + FAIL() << "Too many unknown rng types. Bye.\n"; } - tests[t].method = last; // N.B *not* t++ - the 'ntests' value passed to host_execute_tests does not count the 'last' one. + ukt.num_unknowns++; + ukt.unknown_names[i] = ntcsdup(name); + ukt.unknown_counts[i] = 1; +} - for(int i=0; i< ukt.num_unknowns; ++i){ - printf("%d test vectors of type %s skipped\n", ukt.unknown_counts[i], ukt.unknown_names[i]); +void base_rng_test_arrange(const char *line, kat_instance* tinst, UnknownKatTracker &ukt, bool &flag){ + int nchar; + char name[LINESIZE]; + if( line[0] == '#') { + flag = false; + return; + } + sscanf(line, "%s%n", name, &nchar); + /* skip any tests that require AESNI */ + if(strncmp(name, "aes", 3)==0 || strncmp(name, "ars", 3)==0){ + register_unknown(ukt, name); + flag = false; + return; } - printf("Perform %lu tests.\n", (unsigned long)t); - host_execute_tests(tests, t); +#define RNGNxW_TPL(base, N, W) \ + if(strcmp(name, #base #N "x" #W) == 0) { \ + flag = (bool) read_##base##N##x##W(line+nchar, tinst); \ + return; \ + } +#include "r123_rngNxW.mm" +#undef RNGNxW_TPL - analyze_tests(nfailed, tests, t); - free(tests); - if(nfailed != 0) - FAIL() << "Failed " << nfailed << " out of " << t << std::endl; + register_unknown(ukt, name); + flag = false; return; } - - -// With C++, it's a little trickier to create the mapping from -// method-name/round-count to functions -// because the round-counts are template arguments that have to be -// specified at compile-time. Thus, we can't just do #define RNGNxW_TPL -// and #include "r123_rngNxW.mm". We have to build a static map from: -// pair to functions that apply the right generator -// with the right number of rounds. - -#ifdef _MSC_FULL_VER -// Engines have multiple copy constructors, quite legal C++, disable MSVC complaint -#pragma warning (disable : 4521) -#endif - -#include -#include -#include -#include -#include -#include - -using namespace std; - -typedef map, void (*)(kat_instance *)> genmap_t; -genmap_t genmap; - -void dev_execute_tests(kat_instance *tests, unsigned ntests){ - unsigned i; - for(i=0; imethod, ti->nrounds)); - if(p == genmap.end()) - throw std::runtime_error("pair not in map. You probably need to add more genmap entries in kat_cpp.cpp"); - - p->second(ti); - // TODO: check that the corresponding Engine and MicroURNG - // return the same values. Note that we have ut_Engine and - // ut_MicroURNG, which check basic functionality, but they - // don't have the breadth of the kat_vectors. - } -} - static int murng_reported; static int engine_reported; template -void do_test(kat_instance* ti){ +void base_rng_test_act(kat_instance* ti){ GEN g; struct gdata{ typename GEN::ctr_type ctr; @@ -419,7 +321,7 @@ void do_test(kat_instance* ti){ }; gdata data; // use memcpy. A reinterpret_cast would violate strict aliasing. - memcpy(&data, &ti->u, sizeof(data)); + std::memcpy(&data, &ti->u, sizeof(data)); data.computed = g(data.ctr, data.ukey); // Before we return, let's make sure that MicroURNG and @@ -452,7 +354,7 @@ void do_test(kat_instance* ti){ errs++; } if(errs && (murng_reported++ == 0)) - cerr << "Error in MicroURNG, will appear as \"computed\" value of zero in error summary\n"; + std::cerr << "Error in MicroURNG, will appear as \"computed\" value of zero in error summary\n"; // Engine // N.B. 
exercising discard() arguably belongs in ut_Engine.cpp @@ -499,61 +401,145 @@ void do_test(kat_instance* ti){ value_type val = e(); size_t j = data.expected.size() - i - 1; if (data.expected[j] != val) { - cerr << hex; - cerr << "Engine check, j=" << j << " expected: " << data.expected[j] << " val: " << val << "\n"; + std::cerr << std::hex; + std::cerr << "Engine check, j=" << j << " expected: " << data.expected[j] << " val: " << val << "\n"; errs++; if(engine_reported++ == 0) - cerr << "Error in Engine, will appear as \"computed\" value of zero in error summary\n"; + std::cerr << "Error in Engine, will appear as \"computed\" value of zero in error summary\n"; } } // Signal an error to the caller by *not* copying back // the computed data object into the ti if(errs == 0) - memcpy(&ti->u, &data, sizeof(data)); + std::memcpy(&ti->u, &data, sizeof(data)); } -void host_execute_tests(kat_instance *tests, unsigned ntests){ - // In C++1x, this could be staticly declared with an initializer list. - genmap[make_pair(threefry2x32_e, 13u)] = do_test >; - genmap[make_pair(threefry2x32_e, 20u)] = do_test >; - genmap[make_pair(threefry2x32_e, 32u)] = do_test >; -#if R123_USE_64BIT - genmap[make_pair(threefry2x64_e, 13u)] = do_test >; - genmap[make_pair(threefry2x64_e, 20u)] = do_test >; - genmap[make_pair(threefry2x64_e, 32u)] = do_test >; -#endif +void base_rng_test_assert(int &nfailed, const kat_instance *tests, unsigned ntests){ + unsigned i; + char zeros[512] = {0}; + for(i=0; iu.base##N##x##W##_data.expected.v, N*W/8)==0){ \ + FAIL() << "kat expected all zeros? Something is wrong with the test harness!\n"; \ + nfailed++; \ + } \ + if (memcmp(ti->u.base##N##x##W##_data.computed.v, ti->u.base##N##x##W##_data.expected.v, N*W/8)) \ + report_##base##N##x##W##error(nfailed, ti); \ + break; +#include "r123_rngNxW.mm" +#undef RNGNxW_TPL + case last: ; + } + } +} - genmap[make_pair(threefry4x32_e, 13u)] = do_test >; - genmap[make_pair(threefry4x32_e, 20u)] = do_test >; - genmap[make_pair(threefry4x32_e, 72u)] = do_test >; -#if R123_USE_64BIT - genmap[make_pair(threefry4x64_e, 13u)] = do_test >; - genmap[make_pair(threefry4x64_e, 20u)] = do_test >; - genmap[make_pair(threefry4x64_e, 72u)] = do_test >; -#endif +void run_all_base_rng_kats() { + kat_instance *tests; + unsigned t, ntests = 1000; + char linebuf[LINESIZE]; + FILE *inpfile; + const char *p; + const char *inname; + int nfailed = 0; - genmap[make_pair(philox2x32_e, 7u)] = do_test >; - genmap[make_pair(philox2x32_e, 10u)] = do_test >; - genmap[make_pair(philox4x32_e, 7u)] = do_test >; - genmap[make_pair(philox4x32_e, 10u)] = do_test >; + UnknownKatTracker ukt{}; -#if R123_USE_PHILOX_64BIT - genmap[make_pair(philox2x64_e, 7u)] = do_test >; - genmap[make_pair(philox2x64_e, 10u)] = do_test >; - genmap[make_pair(philox4x64_e, 7u)] = do_test >; - genmap[make_pair(philox4x64_e, 10u)] = do_test >; -#endif + inname = "./kat_vectors.txt"; + inpfile = fopen(inname, "r"); + if (inpfile == NULL) + FAIL() << "Error opening input file " << inname << " for reading. 
Received error code " << errno << "\n"; -#if R123_USE_AES_NI - genmap[make_pair(aesni4x32_e, 10u)] = do_test; - genmap[make_pair(ars4x32_e, 7u)] = do_test >; - genmap[make_pair(ars4x32_e, 10u)] = do_test >; -#endif + if ((p = getenv("KATC_VERBOSE")) != NULL) + verbose = atoi(p); + + if ((p = getenv("KATC_DEBUG")) != NULL) + debug = atoi(p); + + tests = (kat_instance *) malloc(sizeof(tests[0])*ntests); + if (tests == NULL) { + FAIL() << "Could not allocate " << (unsigned long) ntests << " bytes for tests\n"; + } + t = 0; + while (fgets(linebuf, sizeof linebuf, inpfile) != NULL) { + if( t == ntests ) { + ntests *= 2; + tests = (kat_instance *)realloc(tests, sizeof(tests[0])*ntests); + if (tests == NULL) { + FAIL() << "Could not grow tests to " << (unsigned long) ntests << " bytes.\n"; + } + } + bool flag = false; + base_rng_test_arrange(linebuf, &tests[t], ukt, flag); + if( flag ) + ++t; + } + if(t==ntests){ + FAIL() << "No more space for tests? Recompile with a larger ntests\n"; + } + tests[t].method = last; + + for(int i=0; i< ukt.num_unknowns; ++i){ + printf("%d test vectors of type %s skipped\n", ukt.unknown_counts[i], ukt.unknown_names[i]); + } + printf("Perform %lu tests.\n", (unsigned long)t); + using std::map; + using std::pair; + using std::make_pair; + typedef map, void (*)(kat_instance *)> genmap_t; + genmap_t genmap; + // In C++1x, this could be staticly declared with an initializer list. + genmap[make_pair(threefry2x32_e, 13u)] = base_rng_test_act >; + genmap[make_pair(threefry2x32_e, 20u)] = base_rng_test_act >; + genmap[make_pair(threefry2x32_e, 32u)] = base_rng_test_act >; + + genmap[make_pair(threefry4x32_e, 13u)] = base_rng_test_act >; + genmap[make_pair(threefry4x32_e, 20u)] = base_rng_test_act >; + genmap[make_pair(threefry4x32_e, 72u)] = base_rng_test_act >; + + #if R123_USE_64BIT + genmap[make_pair(threefry2x64_e, 13u)] = base_rng_test_act >; + genmap[make_pair(threefry2x64_e, 20u)] = base_rng_test_act >; + genmap[make_pair(threefry2x64_e, 32u)] = base_rng_test_act >; + + genmap[make_pair(threefry4x64_e, 13u)] = base_rng_test_act >; + genmap[make_pair(threefry4x64_e, 20u)] = base_rng_test_act >; + genmap[make_pair(threefry4x64_e, 72u)] = base_rng_test_act >; + #endif + + genmap[make_pair(philox2x32_e, 7u)] = base_rng_test_act >; + genmap[make_pair(philox2x32_e, 10u)] = base_rng_test_act >; + genmap[make_pair(philox4x32_e, 7u)] = base_rng_test_act >; + genmap[make_pair(philox4x32_e, 10u)] = base_rng_test_act >; + + #if R123_USE_PHILOX_64BIT + genmap[make_pair(philox2x64_e, 7u)] = base_rng_test_act >; + genmap[make_pair(philox2x64_e, 10u)] = base_rng_test_act >; + genmap[make_pair(philox4x64_e, 7u)] = base_rng_test_act >; + genmap[make_pair(philox4x64_e, 10u)] = base_rng_test_act >; + #endif + + unsigned i; + for(i=0; imethod, ti->nrounds)); + if(p == genmap.end()) + throw std::runtime_error("pair not in map. 
You probably need to add more genmap entries."); + + p->second(ti); + // ^ That prepares the test data + } - dev_execute_tests(tests, ntests); + base_rng_test_assert(nfailed, tests, t); + free(tests); + if(nfailed != 0) + FAIL() << "Failed " << nfailed << " out of " << t << std::endl; + return; } +// MARK: histogram test using namespace r123; @@ -567,30 +553,6 @@ typename r123::make_signed::type S(T x){ return x; } chk(#u, #Rng, #Ftype, &u, _nfail_, _refhist_); \ }while(0) -std::map get_refmap(){ - using std::string; - std::map refmap{}; - refmap[string("u01 Threefry4x32 float")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); - refmap[string("u01 Threefry4x32 double")]= string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); - refmap[string("u01 Threefry4x32 long double")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); - refmap[string("u01 Threefry4x64 float")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); - refmap[string("u01 Threefry4x64 double")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); - refmap[string("u01 Threefry4x64 long double")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); - refmap[string("uneg11 Threefry4x32 float")] = string(" 156 139 148 146 159 148 159 168 142 160 156 161 153 143 158 150 180 174 152 163 157 129 166 151 140 142"); - refmap[string("uneg11 Threefry4x32 double")] = string(" 156 139 148 146 159 148 159 168 142 160 156 161 153 143 158 150 180 174 152 163 157 129 166 151 140 142"); - refmap[string("uneg11 Threefry4x32 long double")] = string( " 156 139 148 146 159 148 159 168 142 160 156 161 153 143 158 150 180 174 152 163 157 129 166 151 140 142"); - refmap[string("uneg11 Threefry4x64 float")] = string( " 159 141 148 184 162 142 155 137 173 187 153 140 135 164 144 146 149 151 171 152 148 137 179 146 145 152"); - refmap[string("uneg11 Threefry4x64 double")] = string( " 159 141 148 184 162 142 155 137 173 187 153 140 135 164 144 146 149 151 171 152 148 137 179 146 145 152"); - refmap[string("uneg11 Threefry4x64 long double")] = string( " 159 141 148 184 162 142 155 137 173 187 153 140 135 164 144 146 149 151 171 152 148 137 179 146 145 152"); - refmap[string("u01fixedpt Threefry4x32 float")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); - refmap[string("u01fixedpt Threefry4x32 double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); - refmap[string("u01fixedpt Threefry4x32 long double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); - refmap[string("u01fixedpt Threefry4x64 float")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); - refmap[string("u01fixedpt Threefry4x64 double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); - refmap[string("u01fixedpt Threefry4x64 long double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); - return refmap; -} - template void chk(const std::string& fname, const std::string& rngname, const std::string& ftypename, Utype f, int &nfail, std::map &refmap){ std::string key = fname + " " + rngname + " " + ftypename; @@ -632,8 +594,32 @@ void chk(const std::string& fname, const std::string& rngname, const 
std::string } } +std::map get_ut_uniform_refmap(){ + using std::string; + std::map refmap{}; + refmap[string("u01 Threefry4x32 float")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); + refmap[string("u01 Threefry4x32 double")]= string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); + refmap[string("u01 Threefry4x32 long double")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); + refmap[string("u01 Threefry4x64 float")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); + refmap[string("u01 Threefry4x64 double")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); + refmap[string("u01 Threefry4x64 long double")] = string(" 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); + refmap[string("uneg11 Threefry4x32 float")] = string(" 156 139 148 146 159 148 159 168 142 160 156 161 153 143 158 150 180 174 152 163 157 129 166 151 140 142"); + refmap[string("uneg11 Threefry4x32 double")] = string(" 156 139 148 146 159 148 159 168 142 160 156 161 153 143 158 150 180 174 152 163 157 129 166 151 140 142"); + refmap[string("uneg11 Threefry4x32 long double")] = string( " 156 139 148 146 159 148 159 168 142 160 156 161 153 143 158 150 180 174 152 163 157 129 166 151 140 142"); + refmap[string("uneg11 Threefry4x64 float")] = string( " 159 141 148 184 162 142 155 137 173 187 153 140 135 164 144 146 149 151 171 152 148 137 179 146 145 152"); + refmap[string("uneg11 Threefry4x64 double")] = string( " 159 141 148 184 162 142 155 137 173 187 153 140 135 164 144 146 149 151 171 152 148 137 179 146 145 152"); + refmap[string("uneg11 Threefry4x64 long double")] = string( " 159 141 148 184 162 142 155 137 173 187 153 140 135 164 144 146 149 151 171 152 148 137 179 146 145 152"); + refmap[string("u01fixedpt Threefry4x32 float")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); + refmap[string("u01fixedpt Threefry4x32 double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); + refmap[string("u01fixedpt Threefry4x32 long double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 301 330 326 320 295 291 298 287 305 307 310 316 314"); + refmap[string("u01fixedpt Threefry4x64 float")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); + refmap[string("u01fixedpt Threefry4x64 double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); + refmap[string("u01fixedpt Threefry4x64 long double")] = string( " 0 0 0 0 0 0 0 0 0 0 0 0 0 308 295 322 300 316 291 311 289 346 297 310 340 275"); + return refmap; +} + void run_ut_uniform(){ - auto refmap = get_refmap(); + auto refmap = get_ut_uniform_refmap(); int nfail = 0; // 18 tests: 3 functions (u01, uneg11, u01fixedpt) // x 2 input sizes (32 bit or 64 bit) @@ -672,10 +658,12 @@ void run_ut_uniform(){ return; } +// MARK: Test class and fixtures + class TestRandom123KnownAnswers : public ::testing::Test { }; TEST_F(TestRandom123KnownAnswers, base_generators) { - run_base_rng_kat(); + run_all_base_rng_kats(); } TEST_F(TestRandom123KnownAnswers, uniform_histograms) { From f209a07f1da3edde2abfb8c63c1e48db1c794431 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 26 Jul 2024 09:22:19 -0400 Subject: [PATCH 02/37] some renaming, tweak comments, clean up CMakeLists.txt --- test/CMakeLists.txt | 
16 +++----- test/DevNotes.md | 6 +-- .../{kat_vectors.txt => r123_kat_vectors.txt} | 0 ...est_sample_indices.cc => test_discrete.cc} | 0 .../{test_r123_kat.cc => test_r123.cc} | 39 ++++++++++++------- 5 files changed, 32 insertions(+), 29 deletions(-) rename test/test_basic_rng/{kat_vectors.txt => r123_kat_vectors.txt} (100%) rename test/test_basic_rng/{test_sample_indices.cc => test_discrete.cc} (100%) rename test/test_basic_rng/{test_r123_kat.cc => test_r123.cc} (96%) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c1c39ed7..1ccf04e1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -12,11 +12,9 @@ if (GTest_FOUND) ##################################################################### add_executable(RandBLAS_tests - comparison.hh test_datastructures/test_denseskop.cc test_datastructures/test_sparseskop.cc - test_matmul_cores/linop_common.hh test_matmul_cores/test_lskge3.cc test_matmul_cores/test_rskge3.cc test_matmul_cores/test_lskges.cc @@ -35,15 +33,11 @@ if (GTest_FOUND) ##################################################################### add_executable(SparseRandBLAS_tests - comparison.hh - - test_datastructures/test_spmats/common.hh test_datastructures/test_spmats/test_csc.cc test_datastructures/test_spmats/test_csr.cc test_datastructures/test_spmats/test_coo.cc test_datastructures/test_spmats/test_conversions.cc - test_matmul_cores/test_spmm/spmm_test_helpers.hh test_matmul_cores/test_spmm/test_spmm_csc.cc test_matmul_cores/test_spmm/test_spmm_csr.cc test_matmul_cores/test_spmm/test_spmm_coo.cc @@ -58,14 +52,14 @@ if (GTest_FOUND) ##################################################################### add_executable(RandBLAS_stats - test_basic_rng/rng_common.hh - test_basic_rng/test_sample_indices.cc - test_basic_rng/test_r123_kat.cc + test_basic_rng/test_continuous.cc + test_basic_rng/test_discrete.cc + test_basic_rng/test_r123.cc ) target_link_libraries(RandBLAS_stats RandBLAS GTest::GTest GTest::Main) gtest_discover_tests(RandBLAS_stats) - file(COPY test_basic_rng/kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/../test/) - file(COPY test_basic_rng/kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/) + file(COPY test_basic_rng/r123_kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/../test/) + file(COPY test_basic_rng/r123_kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/) endif() message(STATUS "Checking for regression tests ... ${tmp}") diff --git a/test/DevNotes.md b/test/DevNotes.md index ed1ff934..5e5b1dd2 100644 --- a/test/DevNotes.md +++ b/test/DevNotes.md @@ -30,13 +30,13 @@ but I haven't actually verified this. ### test_basic_rng - * test_r123_kat.cc has deterministic tests for Random123. The tests comapre generated values + * test_r123.cc has deterministic tests for Random123. The tests comapre generated values to reference values computed ahead of time. The tests are __extremely__ messy, since they're adapted from tests in the official Random123 repository, and Random123 needs to handle a far wider range of compilers and languages than we assume for RandBLAS. - * test_sample_indices.cc includes statistical tests for sampling from an index set with or without - replacement. + * test_discrete.cc includes statistical tests for sampling from an index set with or without + replacement. 
 * rng_common.hh includes data for statistical tables (e.g., for Kolmogorov-Smirnov tests) and helper
   functions to compute quantities associated with certain probability distributions (e.g., mean
   and variance of the hypergeometric distribution).

diff --git a/test/test_basic_rng/kat_vectors.txt b/test/test_basic_rng/r123_kat_vectors.txt
similarity index 100%
rename from test/test_basic_rng/kat_vectors.txt
rename to test/test_basic_rng/r123_kat_vectors.txt
diff --git a/test/test_basic_rng/test_sample_indices.cc b/test/test_basic_rng/test_discrete.cc
similarity index 100%
rename from test/test_basic_rng/test_sample_indices.cc
rename to test/test_basic_rng/test_discrete.cc
diff --git a/test/test_basic_rng/test_r123_kat.cc b/test/test_basic_rng/test_r123.cc
similarity index 96%
rename from test/test_basic_rng/test_r123_kat.cc
rename to test/test_basic_rng/test_r123.cc
index 4487f732..74132778 100644
--- a/test/test_basic_rng/test_r123_kat.cc
+++ b/test/test_basic_rng/test_r123.cc
@@ -139,11 +139,19 @@ do { \
     fflush(fp); \
 } while(0)
 
-// MARK: code generation
+// MARK: Base generator test
+//
+// There's a lot of code involved in this test. The code can roughly
+// be broken down into three categories.
+//
+// Category 1: code generated from compiler directives
 //
-// The code from here until the next MARK uses a compiler directive
-// metaprogramming technique to define a bunch of similarly named
-// identifiers. The idenfiers are ...
+// This pattern is left over from our adaptation of Random123 tests,
+// which have to compile whether interpreted as C or C++ source. It
+// uses compiler directives to accomplish something roughly
+// equivalent to C++ templating and metaprogramming.
+//
+// The code specifically generates the following identifiers.
 //
 //      method_e::NxW_e (enum members)
 //      NxW_kat (structs)
@@ -151,9 +159,16 @@ do { \
 //      read_NxW (functions)
 //      report_NxWerror (functions)
 //
-// This code pattern was needed when the tests had to compile as
-// valid C programs. We've changed style very slightly to reflect
-// our specialization to C++.
+// Category 2: helper functions
+//
+// The base_rng_test_[arrange,act,assert] functions are slight adaptations
+// of functions that appeared in Random123 testing infrastructure. Their
+// names indicate their roles in the common "arrange, act, assert" pattern of
+// writing unit tests. Their precise descriptions are complicated.
+//
+// Category 3: the main runner
+//
+// This manages all calls to the helper functions defined in Category 2.
 //
 
 enum method_e{
 #define RNGNxW_TPL(base, N, W) base##N##x##W##_e,
 #include "r123_rngNxW.mm"
@@ -251,12 +266,6 @@ void report_##base##N##x##W##error(int &nfailed, const kat_instance *ti){ \
 #include "r123_rngNxW.mm"
 #undef RNGNxW_TPL
 
-// MARK: Base generator test
-//
-// Helper functions here are modified from the original Random123 tests
-// follow the "arrange, act, assert" pattern of unit testing.
-//
-
 struct UnknownKatTracker {
     const static int MAXUNKNOWNS = 20;
     int num_unknowns = 0;
@@ -447,7 +456,7 @@ void run_all_base_rng_kats() {
 
     UnknownKatTracker ukt{};
 
-    inname = "./kat_vectors.txt";
+    inname = "./r123_kat_vectors.txt";
     inpfile = fopen(inname, "r");
     if (inpfile == NULL)
         FAIL() << "Error opening input file " << inname << " for reading.
Received error code " << errno << "\n"; @@ -658,7 +667,7 @@ void run_ut_uniform(){ return; } -// MARK: Test class and fixtures +// MARK: Googletest stuff class TestRandom123KnownAnswers : public ::testing::Test { }; From 1d210a02cdefeda6498af0b1d8024a9f55e7e317 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Sat, 27 Jul 2024 11:51:16 -0400 Subject: [PATCH 03/37] inline non-templated functions in rng_common.hh. Add a test for incrementing random123 Nx32 arrays by 64 bit integers. --- test/test_basic_rng/rng_common.hh | 57 ++++++++++++---- test/test_basic_rng/test_r123.cc | 107 ++++++++++++++++++++++++++++-- 2 files changed, 146 insertions(+), 18 deletions(-) diff --git a/test/test_basic_rng/rng_common.hh b/test/test_basic_rng/rng_common.hh index f1103b4b..e344d93c 100644 --- a/test/test_basic_rng/rng_common.hh +++ b/test/test_basic_rng/rng_common.hh @@ -1,3 +1,32 @@ +// Copyright, 2024. See LICENSE for copyright holder information. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// (1) Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// (2) Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// (3) Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// + #pragma once #include "RandBLAS.hh" @@ -15,24 +44,24 @@ namespace KolmogorovSmirnovConstants { /*** From scipy.stats: critical_value = kstwo.ppf(1-significance, sample_size) */ -const int SMALLEST_SAMPLE = 8; -const int LARGEST_SAMPLE = 16777216; +inline const int SMALLEST_SAMPLE = 8; +inline const int LARGEST_SAMPLE = 16777216; -const std::vector sample_sizes { +inline const std::vector sample_sizes { 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216 }; -const double WEAKEST_SIGNIFICANCE = 0.05; -const double STRONGEST_SIGNIFICANCE = 1e-6; +inline const double WEAKEST_SIGNIFICANCE = 0.05; +inline const double STRONGEST_SIGNIFICANCE = 1e-6; -const std::vector significance_levels { +inline const std::vector significance_levels { 0.05, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6 }; -const std::array, 6> critical_values {{{ +inline const std::array, 6> critical_values {{{ // significance of 0.05 4.54266591e-01, 3.27333470e-01, 2.34240860e-01, 1.66933746e-01, 1.18658276e-01, 8.42018587e-02, 5.96844982e-02, 4.22742678e-02, @@ -89,7 +118,7 @@ const std::array, 6> critical_values {{{ * The correctness of this function depends on significance_levels being sorted * in decreasing order (which corresponds to weakest to strongest significances). */ -int significance_rep(double sig) { +inline int significance_rep(double sig) { randblas_require(STRONGEST_SIGNIFICANCE <= sig && sig <= WEAKEST_SIGNIFICANCE); int num_siglevels = (int) significance_levels.size(); for (int i = 0; i < num_siglevels; ++i) { @@ -107,7 +136,7 @@ int significance_rep(double sig) { * The correctness of this function depends on sample_sizes being sorted in * increasing order. */ -int sample_size_rep(int n) { +inline int sample_size_rep(int n) { randblas_require(SMALLEST_SAMPLE <= n && n <= LARGEST_SAMPLE); int num_sample_sizes = (int) sample_sizes.size(); for (int i = 0; i < num_sample_sizes; ++i) { @@ -119,7 +148,7 @@ int sample_size_rep(int n) { return -1; } -std::tuple critical_value_rep(int n, double sig) { +inline std::tuple critical_value_rep(int n, double sig) { int i = significance_rep(sig); auto override_sig = significance_levels[i]; int j = sample_size_rep(n); @@ -144,7 +173,7 @@ double critical_value_rep_mutator(TI &n, double &sig) { // MARK: combinatorics // -double log_binomial_coefficient(int64_t n, int64_t k) { +inline double log_binomial_coefficient(int64_t n, int64_t k) { double result = 0.0; for (int64_t i = 1; i <= k; ++i) { result += std::log(static_cast(n - i + 1)) - std::log(static_cast(i)); @@ -165,7 +194,7 @@ double log_binomial_coefficient(int64_t n, int64_t k) { * This function returns the probability that the sample of D items will contain observed_k elements * from the distinguished set. 
*/ -double hypergeometric_pmf(int64_t N, int64_t K, int64_t D, int64_t observed_k) { +inline double hypergeometric_pmf(int64_t N, int64_t K, int64_t D, int64_t observed_k) { randblas_require(0 <= K && K <= N); randblas_require(0 <= D && D <= N); randblas_require(0 <= observed_k && observed_k <= K); @@ -176,14 +205,14 @@ double hypergeometric_pmf(int64_t N, int64_t K, int64_t D, int64_t observed_k) { return out; } -double hypergeometric_mean(int64_t N, int64_t K, int64_t D) { +inline double hypergeometric_mean(int64_t N, int64_t K, int64_t D) { double dN = (double) N; double dK = (double) K; double dD = (double) D; return dD * dK / dN; } -double hypergeometric_variance(int64_t N, int64_t K, int64_t D) { +inline double hypergeometric_variance(int64_t N, int64_t K, int64_t D) { double dN = (double) N; double dK = (double) K; double dD = (double) D; diff --git a/test/test_basic_rng/test_r123.cc b/test/test_basic_rng/test_r123.cc index 74132778..da17842d 100644 --- a/test/test_basic_rng/test_r123.cc +++ b/test/test_basic_rng/test_r123.cc @@ -54,6 +54,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include +#include + #include #include #include @@ -667,14 +670,110 @@ void run_ut_uniform(){ return; } -// MARK: Googletest stuff +// MARK: my tests + Googletest -class TestRandom123KnownAnswers : public ::testing::Test { }; +class TestRandom123 : public ::testing::Test { -TEST_F(TestRandom123KnownAnswers, base_generators) { + protected: + + static void test_incr() { + using RNG = r123::Philox4x32; + RandBLAS::RNGState s(0); + // The "counter" array of s is a 4*32=128 bit unsigned integer. + // + // Each block is interpreted in the usual way (i.e., no need to consider differences + // between big-endian and little-endian representations). + // + // Looking across blocks, we read as as a little-endian number in base IMAX = 2^32 - 1. + // That is, if we initialize s.counter = {0,0,0,0} and then call s.counter.incr(IMAX), + // we should have s.counter = {IMAX, 0, 0, 0}, and if we make another call + // s.counter.incr(9), then we should see s.counter = {8, 1, 0, 0}. Put another way, + // if c = s.counter, then we have + // + // (128-bit integer) c == c[0] + 2^{32}*c[1] + 2^{64}*c[2] + 2^{96}*c[3] (mod 2^128 - 1) + // + // where 0 <= c[i] <= IMAX + // + uint64_t i32max = std::numeric_limits::max(); + auto c = s.counter; + ASSERT_EQ(c[0], 0); + ASSERT_EQ(c[1], 0); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + + c.incr(i32max); + ASSERT_EQ(c[0], i32max); + ASSERT_EQ(c[1], 0); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + + c.incr(1); + ASSERT_EQ(c[0], 0); + ASSERT_EQ(c[1], 1); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + + c.incr(3); + ASSERT_EQ(c[0], 3); + ASSERT_EQ(c[1], 1); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + + uint64_t two32 = ((uint64_t) 1) << 32; + + c = {0,0,0,0}; + c.incr(two32-1); + ASSERT_EQ(c[0], i32max); + ASSERT_EQ(c[1], 0); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + + c = {0,0,0,0}; + c.incr(two32); + ASSERT_EQ(c[0], 0); + ASSERT_EQ(c[1], 1); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + + // Let's construct 2^32 * (2^32 - 1), which is equal to (ctr_type) {0, (uint32_t) i32max, 0, 0}. + // + // Do this using the identity + // 2^32 * (2^32 - 1) == 2^64 - 2^32 + // == 2^63 + 2^63 - 2^32. 
+ // + // Then construct 2^64, using 2^64 = (2^63) + (2^63 - 2^32) + (2^32) + uint64_t two63 = ((uint64_t) 1) << 63; + c = {0,0,0,0}; + c.incr(two63); + c.incr(two63 - two32); + ASSERT_EQ(c[0], 0); + ASSERT_EQ(c[1], i32max); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 0); + c.incr(two32); + ASSERT_EQ(c[0], 0); + ASSERT_EQ(c[1], 0); + ASSERT_EQ(c[2], 1); + ASSERT_EQ(c[3], 0); + + c = {(uint32_t) i32max, (uint32_t) i32max, (uint32_t) i32max, 0}; + c.incr(1); + ASSERT_EQ(c[0], 0); + ASSERT_EQ(c[1], 0); + ASSERT_EQ(c[2], 0); + ASSERT_EQ(c[3], 1); + return; + } +}; + +TEST_F(TestRandom123, base_generators) { run_all_base_rng_kats(); } -TEST_F(TestRandom123KnownAnswers, uniform_histograms) { +TEST_F(TestRandom123, uniform_histograms) { run_ut_uniform(); } + +TEST_F(TestRandom123, big_incr) { + test_incr(); +} \ No newline at end of file From d92eb501a0f6e127886b9d40a9f4d0d234b34ce8 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Sat, 27 Jul 2024 11:56:49 -0400 Subject: [PATCH 04/37] Add safe_signed_int_product function (to check for overflows when indexing into an implicit array by a potentially very-large number). Moved SparseSkOp and DenseSkOp constructor and destructor implementations into class definitions, rather than only declaring function signatures in the class definitions. Changed next_state for DenseSkOp and SparseSkOp to be const. Add shell/unimplemented compute_next_state functions for DenseSkOp and SparseSkOp for computing the correct value for next_state when a sketching operator is constructed. Add a fill_dense overload that lets you specify a required buffer layout (this function allocates workspace for an out-of-place transpose if needed). Add an omatcopy function for out-of-place stride changes that doesnt do any bound checking. --- RandBLAS/base.hh | 23 +++- RandBLAS/dense_skops.hh | 210 ++++++++++++++++++++++++-------- RandBLAS/skge.hh | 20 +-- RandBLAS/sparse_skops.hh | 257 ++++++++++++++++++++------------------- RandBLAS/util.hh | 27 ++-- 5 files changed, 343 insertions(+), 194 deletions(-) diff --git a/RandBLAS/base.hh b/RandBLAS/base.hh index e372d002..d4c00a30 100644 --- a/RandBLAS/base.hh +++ b/RandBLAS/base.hh @@ -118,6 +118,23 @@ template concept SignedInteger = (std::numeric_limits::is_signed && std::numeric_limits::is_integer); +template +inline TO safe_signed_int_product(TI a, TI b) { + if (a == 0 || b == 0) { + return 0; + } + TO c = a * b; + TO b_check = c / a; + TO a_check = c / b; + if ((a_check != a) || (b_check != b)) { + std::stringstream s; + s << "Overflow when multiplying a (=" << a << ") and b(=" << b << "), which resulted in " << c << ".\n"; + throw std::overflow_error(s.str()); + } + return c; +} + + enum class MajorAxis : char { // --------------------------------------------------------------------------- /// short-axis vectors (cols of a wide matrix, rows of a tall matrix) @@ -125,7 +142,11 @@ enum class MajorAxis : char { // --------------------------------------------------------------------------- /// long-axis vectors (rows of a wide matrix, cols of a tall matrix) - Long = 'L' + Long = 'L', + + // --------------------------------------------------------------------------- + /// Undefined (used when row-major vs column-major must be explicit) + Undefined = 'U' }; diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index ac92ecbd..c7478ac3 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -31,6 +31,7 @@ #include "RandBLAS/base.hh" #include "RandBLAS/exceptions.hh" #include "RandBLAS/random_gen.hh" +#include 
"RandBLAS/util.hh" #include @@ -60,6 +61,12 @@ bool compare_ctr(typename RNG::ctr_type c1, typename RNG::ctr_type c2) { return false; } +template +static inline RNGState compute_next_state(DD dist, RNGState state) { + // Need logic that depends on DenseDistName. + return RNGState(0); +} + /** * Fill buff with random values so it gives a row-major representation of an n_srows \math{\times} n_scols * submatrix of some implicitly defined parent matrix. @@ -165,15 +172,16 @@ static RNGState fill_dense_submat_impl( int64_t thrd = 0; #pragma omp for - for (int row = 0; row < n_srows; row++) { + for (int64_t row = 0; row < n_srows; row++) { #if defined(RandBLAS_HAS_OpenMP) thrd = omp_get_thread_num(); #endif ind = 0; - r0 = r0_padded + ctr_gap*row; - r1 = r1_padded + ctr_gap*row; + int64_t __r01_offset = safe_signed_int_product(ctr_gap, row); + r0 = r0_padded + __r01_offset; + r1 = r1_padded + __r01_offset; cc.incr(r0 - prev); prev = r0; @@ -314,17 +322,20 @@ struct DenseDist { int64_t n_rows, int64_t n_cols, DenseDistName dn = DenseDistName::Uniform - ) : n_rows(n_rows), n_cols(n_cols), family(dn), major_axis(MajorAxis::Long) { - randblas_require(dn != DenseDistName::BlackBox); - }; + ) : n_rows(n_rows), n_cols(n_cols), family(dn), major_axis( (dn == DenseDistName::BlackBox) ? MajorAxis::Undefined : MajorAxis::Long) { } - // Only use with struct initializer. DenseDist( int64_t n_rows, int64_t n_cols, DenseDistName dn, MajorAxis ma - ) : n_rows(n_rows), n_cols(n_cols), family(dn), major_axis(ma) { }; + ) : n_rows(n_rows), n_cols(n_cols), family(dn), major_axis(ma) { + if (dn == DenseDistName::BlackBox) { + randblas_require(ma == MajorAxis::Undefined); + } else { + randblas_require(ma != MajorAxis::Undefined); + } + } }; @@ -332,6 +343,7 @@ struct DenseDist { inline blas::Layout dist_to_layout( DenseDist D ) { + randblas_require(D.major_axis != MajorAxis::Undefined); bool is_wide = D.n_rows < D.n_cols; bool fa_long = D.major_axis == MajorAxis::Long; if (is_wide && fa_long) { @@ -348,6 +360,7 @@ inline blas::Layout dist_to_layout( inline int64_t major_axis_length( DenseDist D ) { + randblas_require(D.major_axis != MajorAxis::Undefined); return (D.major_axis == MajorAxis::Long) ? std::max(D.n_rows, D.n_cols) : std::min(D.n_rows, D.n_cols); } @@ -379,7 +392,7 @@ struct DenseSkOp { // --------------------------------------------------------------------------- /// The state that should be used by the next call to an RNG *after* the /// full sketching operator has been sampled. 
- RNGState next_state; + const RNGState next_state; T *buff = nullptr; // memory const blas::Layout layout; // matrix storage order @@ -391,12 +404,28 @@ struct DenseSkOp { // ///////////////////////////////////////////////////////////////////// - // Elementary constructor: needs an implementation DenseSkOp( DenseDist dist, RNGState const &state, - T *buff - ); + T *buff, + blas::Layout forced_layout + ) : // variable definitions + n_rows(dist.n_rows), + n_cols(dist.n_cols), + dist(dist), + seed_state(state), + next_state(dense::compute_next_state(dist, state)), + buff(buff), + layout(forced_layout) + { // sanity checks + randblas_require(this->dist.n_rows > 0); + randblas_require(this->dist.n_cols > 0); + if (dist.family == DenseDistName::BlackBox) + randblas_require(this->buff != nullptr); + } + + DenseSkOp(DenseDist dist, RNGState const &state, T *buff + ) : DenseSkOp(dist, state, buff, dist_to_layout(dist)) {} ///--------------------------------------------------------------------------- /// The preferred constructor for DenseSkOp objects. There are other @@ -435,35 +464,12 @@ struct DenseSkOp { ) : DenseSkOp(DenseDist{n_rows, n_cols, family, ma}, RNGState(key), buff) {}; // Destructor - ~DenseSkOp(); + ~DenseSkOp() { + if (this->del_buff_on_destruct) + delete [] this->buff; + } }; -template -DenseSkOp::DenseSkOp( - DenseDist dist, - RNGState const &state, - T *buff -) : // variable definitions - n_rows(dist.n_rows), - n_cols(dist.n_cols), - dist(dist), - seed_state(state), - next_state{}, - buff(buff), - layout(dist_to_layout(dist)) -{ // sanity checks - randblas_require(this->dist.n_rows > 0); - randblas_require(this->dist.n_cols > 0); - if (dist.family == DenseDistName::BlackBox) - randblas_require(this->buff != nullptr); -} - -template -DenseSkOp::~DenseSkOp() { - if (this->del_buff_on_destruct) { - delete [] this->buff; - } -} // ============================================================================= /// @verbatim embed:rst:leading-slashes @@ -519,8 +525,10 @@ DenseSkOp::~DenseSkOp() { /// - \math{\buff} must be read in "layout" order /// to recover \math{\mat(\buff)}. This layout is determined /// from \math{\D} and cannot be controlled directly. -/// - If this function returns a layout that is undesirable then it is -/// the caller's responsibility to perform a transpose as needed. +/// - If you want to control the layout (rather than be at the mercy +/// of \math{\D}) then you can call the overload of this function +/// that takes a layout as its first argument. That overload will +/// allocate memory internally to perform an out-of-place transpose. 
/// template std::pair> fill_dense( @@ -542,11 +550,11 @@ std::pair> fill_dense( // operate on the transpose in row-major n_rows_ = n_cols; n_cols_ = n_rows; - ptr = ro_s + co_s * ma_len; + ptr = ro_s + safe_signed_int_product(co_s, ma_len); } else { n_rows_ = n_rows; n_cols_ = n_cols; - ptr = ro_s * ma_len + co_s; + ptr = safe_signed_int_product(ro_s, ma_len) + co_s; } switch (D.family) { case DenseDistName::Gaussian: { @@ -558,13 +566,99 @@ std::pair> fill_dense( return std::make_pair(layout, next_state_u); } case DenseDistName::BlackBox: { - throw std::invalid_argument(std::string("fill_buff cannot be called with the BlackBox distribution.")); + throw std::invalid_argument(std::string("fill_dense cannot be called with the BlackBox distribution.")); } default: { throw std::runtime_error(std::string("Unrecognized distribution.")); } } } + + +// ============================================================================= +/// @verbatim embed:rst:leading-slashes +/// +/// .. |mat| mathmacro:: \operatorname{mat} +/// .. |buff| mathmacro:: \mathtt{buff} +/// .. |D| mathmacro:: \mathcal{D} +/// .. |nrows| mathmacro:: \mathtt{n\_rows} +/// .. |ncols| mathmacro:: \mathtt{n\_cols} +/// .. |ioff| mathmacro:: \mathtt{i\_off} +/// .. |joff| mathmacro:: \mathtt{j\_off} +/// +/// @endverbatim +/// Fill \math{\buff} so that (1) \math{\mat(\buff)} is a submatrix of +/// an _implicit_ random sample from \math{\D}, and (2) \math{\mat(\buff)} +/// is determined by reading from \math{\buff} in "layout" order. +/// +/// If we denote the implicit sample from \math{\D} by \math{S}, then we have +/// @verbatim embed:rst:leading-slashes +/// .. math:: +/// \mat(\buff) = S[\ioff:(\ioff + \nrows),\, \joff:(\joff + \ncols)] +/// @endverbatim +/// on exit. If the specified layout is different from the natural layout +/// associated with \math{\D} (via its MajorAxis and dimensions) then we will +/// allocate extra workspace needed for an out-of-place transposition. +/// +/// Here's an example use-case. +/// +/// Suppose you have a DenseSkOp object "S", and you want to fill a buffer called "work" +/// in "layout" order, so that its contents match the submatrix of S with dimensions +/// (rows_subsmat_S, cols_submat_S) whose upper-left corner is offset by (ro_s, co_s) +/// from the upper-left corner of S. This could be done with the following function call: +/// +/// fill_dense(S.layout, S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, work, S.seed_state); +/// +/// @param[in] layout +/// blas::Layout::RowMajor or blas::Layout::ColMajor +/// - The storage order for \math{\mat(\buff)} on exit. +/// @param[in] D +/// A DenseDist object. +/// - A distribution over random matrices of shape (D.n_rows, D.n_cols). +/// @param[in] n_rows +/// A positive integer. +/// - The number of rows in \math{\mat(\buff)}. +/// @param[in] n_cols +/// A positive integer. +/// - The number of columns in \math{\mat(\buff)}. +/// @param[in] ro_s +/// A nonnegative integer. +/// - The row offset for \math{\mat(\buff)} as a submatrix of \math{S}. +/// - We require that \math{\ioff + \nrows} is at most D.n_rows. +/// @param[in] co_s +/// A nonnegative integer. +/// - The column offset for \math{\mat(\buff)} as a submatrix of \math{S}. +/// - We require that \math{\joff + \ncols} is at most D.n_cols. +/// @param[in] buff +/// Buffer of type T. +/// - Length must be at least \math{\nrows \cdot \ncols}. +/// @param[in] seed +/// A CBRNG state +/// - Used to define \math{S} as a sample from \math{\D}. 
+/// +template +RandBLAS::RNGState fill_dense( + blas::Layout required_layout, + const DenseDist &D, + int64_t n_rows, + int64_t n_cols, + int64_t ro_s, + int64_t co_s, + T* buff, + const RNGState &seed +) { + int64_t size_mat = n_rows * n_cols; + auto [natural_layout, next_state] = fill_dense(D, n_rows, n_cols, ro_s, co_s, buff, seed); + if (required_layout != natural_layout) { + T* flip_work = new T[size_mat]; + blas::copy(size_mat, buff, 1, flip_work, 1); + auto [irs_nat, ics_nat] = layout_to_strides(natural_layout, n_rows, n_cols); + auto [irs_req, ics_req] = layout_to_strides(required_layout, n_rows, n_cols); + util::omatcopy(n_rows, n_cols, flip_work, irs_nat, ics_nat, buff, irs_req, ics_req); + delete [] flip_work; + } + return next_state; +} // ============================================================================= /// @verbatim embed:rst:leading-slashes @@ -625,16 +719,32 @@ std::pair> fill_dense( /// algorithm. /// template -RNGState fill_dense( +void fill_dense( DenseSkOp &S ) { - randblas_require(!S.buff); + randblas_require(S.buff == nullptr); randblas_require(S.dist.family != DenseDistName::BlackBox); - S.buff = new T[S.dist.n_rows * S.dist.n_cols]; - auto [layout, next_state] = fill_dense(S.dist, S.buff, S.seed_state); - S.next_state = next_state; + int64_t m = S.dist.n_rows; + int64_t n = S.dist.n_cols; + int64_t size_S = m * n; + blas::Layout nat_layout = dist_to_layout(S.dist); + blas::Layout req_layout = S.layout; + // ^ Natural and requested layouts + S.buff = new T[size_S]; + + if (req_layout != nat_layout) { + T* work = new T[size_S]; + fill_dense(S.dist, work, S.seed_state); + auto [irs_nat, ics_nat] = layout_to_strides(nat_layout, m, n); + auto [irs_req, ics_req] = layout_to_strides(req_layout, m, n); + RandBLAS::util::omatcopy(m, n, work, irs_nat, ics_nat, S.buff, irs_req, ics_req); + delete [] work; + } else { + fill_dense(S.dist, S.buff, S.seed_state); + } S.del_buff_on_destruct = true; - return next_state; + + return; } } // end namespace RandBLAS diff --git a/RandBLAS/skge.hh b/RandBLAS/skge.hh index 61590105..943dba14 100644 --- a/RandBLAS/skge.hh +++ b/RandBLAS/skge.hh @@ -192,12 +192,12 @@ void lskge3( if (!S.buff) { // We'll make a shallow copy of the sketching operator, take responsibility for filling the memory // of that sketching operator, and then call LSKGE3 with that new object. - T *buff = new T[rows_submat_S * cols_submat_S]; - fill_dense(S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, buff, S.seed_state); - DenseDist D{rows_submat_S, cols_submat_S, DenseDistName::BlackBox, S.dist.major_axis}; - DenseSkOp S_(D, S.seed_state, buff); + T *work = new T[rows_submat_S * cols_submat_S]; + fill_dense(S.layout, S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, work, S.seed_state); + DenseDist D_{rows_submat_S, cols_submat_S, DenseDistName::BlackBox, MajorAxis::Undefined}; + DenseSkOp S_(D_, S.seed_state, work, S.layout); lskge3(layout, opS, opA, d, n, m, alpha, S_, 0, 0, A, lda, beta, B, ldb); - delete [] buff; + delete [] work; return; } randblas_require( S.dist.n_rows >= rows_submat_S + ro_s ); @@ -345,12 +345,12 @@ void rskge3( if (!S.buff) { // We'll make a shallow copy of the sketching operator, take responsibility for filling the memory // of that sketching operator, and then call RSKGE3 with that new object. 
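To make the calling convention of the layout-aware fill_dense overload concrete, here is a minimal usage sketch. The umbrella header RandBLAS.hh, the element type, the dimensions, the offsets, and the buffer name "work" are assumptions for illustration, not taken from this patch:

    #include <RandBLAS.hh>
    #include <vector>

    int main() {
        // An implicit 1000 x 50 Gaussian operator. We materialize only a 20 x 30 block whose
        // upper-left corner sits at row 40, column 10 of the implicit operator, and we ask
        // for the block in row-major order regardless of the distribution's natural layout.
        RandBLAS::DenseDist D(1000, 50, RandBLAS::DenseDistName::Gaussian);
        RandBLAS::RNGState seed(42);
        std::vector<double> work(20 * 30);
        auto next = RandBLAS::fill_dense(blas::Layout::RowMajor, D, 20, 30, 40, 10, work.data(), seed);
        // "next" is the state to use for any randomness generated after this call.
        (void) next;
        return 0;
    }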
- T *buff = new T[rows_submat_S * cols_submat_S]; - fill_dense(S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, buff, S.seed_state); - DenseDist D{rows_submat_S, cols_submat_S, DenseDistName::BlackBox, S.dist.major_axis}; - DenseSkOp S_(D, S.seed_state, buff); + T *work = new T[rows_submat_S * cols_submat_S]; + fill_dense(S.layout, S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, work, S.seed_state); + DenseDist D_{rows_submat_S, cols_submat_S, DenseDistName::BlackBox, MajorAxis::Undefined}; + DenseSkOp S_(D_, S.seed_state, work, S.layout); rskge3(layout, opA, opS, m, d, n, alpha, A, lda, S_, 0, 0, beta, B, ldb); - delete [] buff; + delete [] work; return; } randblas_require( S.dist.n_rows >= rows_submat_S + ro_s ); diff --git a/RandBLAS/sparse_skops.hh b/RandBLAS/sparse_skops.hh index fe7c8aab..cd2ad3cc 100644 --- a/RandBLAS/sparse_skops.hh +++ b/RandBLAS/sparse_skops.hh @@ -44,6 +44,69 @@ #define MAX(a, b) (((a) < (b)) ? (b) : (a)) #define MIN(a, b) (((a) < (b)) ? (a) : (b)) +namespace RandBLAS::sparse { + +template +static RNGState compute_next_state(SD dist, RNGState seed_state) { + return RNGState(0); +} + +// ============================================================================= +/// WARNING: this function is not part of the public API. +/// +template +static RNGState repeated_fisher_yates( + const RNGState &state, + int64_t vec_nnz, + int64_t dim_major, + int64_t dim_minor, + sint_t *idxs_major, + sint_t *idxs_minor, + T *vals +) { + randblas_error_if(vec_nnz > dim_major); + std::vector vec_work(dim_major); + for (sint_t j = 0; j < dim_major; ++j) + vec_work[j] = j; + std::vector pivots(vec_nnz); + RNG gen; + auto [ctr, key] = state; + for (sint_t i = 0; i < dim_minor; ++i) { + sint_t offset = i * vec_nnz; + auto ctri = ctr; + ctri.incr(offset); + for (sint_t j = 0; j < vec_nnz; ++j) { + // one step of Fisher-Yates shuffling + auto rv = gen(ctri, key); + sint_t ell = j + rv[0] % (dim_major - j); + pivots[j] = ell; + sint_t swap = vec_work[ell]; + vec_work[ell] = vec_work[j]; + vec_work[j] = swap; + // update (rows, cols, vals) + idxs_major[j + offset] = (sint_t) swap; + vals[j + offset] = (rv[1] % 2 == 0) ? 1.0 : -1.0; + idxs_minor[j + offset] = (sint_t) i; + // increment counter + ctri.incr(); + } + // Restore vec_work for next iteration of Fisher-Yates. + // This isn't necessary from a statistical perspective, + // but it makes it easier to generate submatrices of + // a given SparseSkOp. + for (sint_t j = 1; j <= vec_nnz; ++j) { + sint_t jj = vec_nnz - j; + sint_t swap = idxs_major[jj + offset]; + sint_t ell = pivots[jj]; + vec_work[jj] = vec_work[ell]; + vec_work[ell] = swap; + } + ctr = ctri; + } + return RNGState {ctr, key}; +} +} + namespace RandBLAS { // ============================================================================= /// A distribution over sparse matrices. @@ -110,7 +173,7 @@ struct SparseSkOp { // --------------------------------------------------------------------------- /// The state that should be used by the next call to an RNG *after* the /// full sketching operator has been sampled. 
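As a rough sketch of what one call to the repeated_fisher_yates routine defined above produces (all sizes here are illustrative, and T, RNG, and the index type are deduced from the arguments):

    #include <RandBLAS.hh>
    #include <vector>
    #include <cstdint>

    int main() {
        // Each minor-axis index i in {0, 1, 2} receives vec_nnz = 2 distinct major-axis
        // indices drawn from {0, ..., 5}, paired with values of +1 or -1, stored as
        // COO-style triplets (idxs_major[k], idxs_minor[k], vals[k]).
        int64_t vec_nnz = 2, dim_major = 6, dim_minor = 3;
        std::vector<int64_t> idxs_major(vec_nnz * dim_minor), idxs_minor(vec_nnz * dim_minor);
        std::vector<double>  vals(vec_nnz * dim_minor);
        uint32_t key = 0;
        RandBLAS::RNGState state(key);
        auto next = RandBLAS::sparse::repeated_fisher_yates(
            state, vec_nnz, dim_major, dim_minor,
            idxs_major.data(), idxs_minor.data(), vals.data()
        );
        // On exit: idxs_minor is {0, 0, 1, 1, 2, 2}, each pair in idxs_major holds two
        // distinct indices, vals holds +/- 1, and "next" is the returned CBRNG state.
        (void) next;
        return 0;
    }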
- RNGState next_state; + const RNGState next_state; // --------------------------------------------------------------------------- /// We need workspace to store a representation of the sampled sketching @@ -178,7 +241,42 @@ struct SparseSkOp { sint_t *cols, T *vals, bool known_filled = true - ); + ) : // variable definitions + n_rows(dist.n_rows), + n_cols(dist.n_cols), + dist(dist), + seed_state(state), + own_memory(false), + next_state(sparse::compute_next_state(dist, seed_state)) + { // sanity checks + randblas_require(this->dist.n_rows > 0); + randblas_require(this->dist.n_cols > 0); + randblas_require(this->dist.vec_nnz > 0); + // actual work + this->rows = rows; + this->cols = cols; + this->vals = vals; + this->known_filled = known_filled; + }; + + // Useful for shallow copies (possibly with transposition) + SparseSkOp( + SparseDist dist, + const RNGState &seed_state, + sint_t *rows, + sint_t *cols, + T *vals, + const RNGState &next_state + ) : n_rows(dist.n_rows), n_cols(dist.n_cols), dist(dist), seed_state(seed_state), next_state(next_state), own_memory(false) { + randblas_require(this->dist.n_rows > 0); + randblas_require(this->dist.n_cols > 0); + randblas_require(this->dist.vec_nnz > 0); + // actual work + this->rows = rows; + this->cols = cols; + this->vals = vals; + this->known_filled = known_filled; + } SparseSkOp( SparseDist dist, @@ -207,7 +305,29 @@ struct SparseSkOp { SparseSkOp( SparseDist dist, const RNGState &state - ); + ) : // variable definitions + n_rows(dist.n_rows), + n_cols(dist.n_cols), + dist(dist), + seed_state(state), + next_state(sparse::compute_next_state(dist, seed_state)), + own_memory(true) + { // sanity checks + randblas_require(this->dist.n_rows > 0); + randblas_require(this->dist.n_cols > 0); + randblas_require(this->dist.vec_nnz > 0); + // actual work + int64_t minor_ax_len; + if (this->dist.major_axis == MajorAxis::Short) { + minor_ax_len = MAX(this->dist.n_rows, this->dist.n_cols); + } else { + minor_ax_len = MIN(this->dist.n_rows, this->dist.n_cols); + } + int64_t nnz = this->dist.vec_nnz * minor_ax_len; + this->rows = new sint_t[nnz]; + this->cols = new sint_t[nnz]; + this->vals = new T[nnz]; + } SparseSkOp( SparseDist dist, @@ -216,68 +336,12 @@ struct SparseSkOp { // Destructor - ~SparseSkOp(); -}; - - -template -SparseSkOp::SparseSkOp( - SparseDist dist, - const RNGState &state -) : // variable definitions - n_rows(dist.n_rows), - n_cols(dist.n_cols), - dist(dist), - seed_state(state), - own_memory(true) -{ // sanity checks - randblas_require(this->dist.n_rows > 0); - randblas_require(this->dist.n_cols > 0); - randblas_require(this->dist.vec_nnz > 0); - // actual work - int64_t minor_ax_len; - if (this->dist.major_axis == MajorAxis::Short) { - minor_ax_len = MAX(this->dist.n_rows, this->dist.n_cols); - } else { - minor_ax_len = MIN(this->dist.n_rows, this->dist.n_cols); - } - int64_t nnz = this->dist.vec_nnz * minor_ax_len; - this->rows = new sint_t[nnz]; - this->cols = new sint_t[nnz]; - this->vals = new T[nnz]; -}; - -template -SparseSkOp::SparseSkOp( - SparseDist dist, - const RNGState &state, - sint_t *rows, - sint_t *cols, - T *vals, - bool known_filled -) : // variable definitions - n_rows(dist.n_rows), - n_cols(dist.n_cols), - dist(dist), - seed_state(state), - own_memory(false) -{ // sanity checks - randblas_require(this->dist.n_rows > 0); - randblas_require(this->dist.n_cols > 0); - randblas_require(this->dist.vec_nnz > 0); - // actual work - this->rows = rows; - this->cols = cols; - this->vals = vals; - this->known_filled = 
known_filled; -}; - -template -SparseSkOp::~SparseSkOp() { - if (this->own_memory) { - delete [] this->rows; - delete [] this->cols; - delete [] this->vals; + ~SparseSkOp() { + if (this->own_memory) { + delete [] this->rows; + delete [] this->cols; + delete [] this->vals; + } } }; @@ -297,7 +361,7 @@ SparseSkOp::~SparseSkOp() { /// algorithm. /// template -RNGState fill_sparse( +void fill_sparse( SparseSkOp & S ) { int64_t long_ax_len = MAX(S.dist.n_rows, S.dist.n_cols); @@ -308,18 +372,19 @@ RNGState fill_sparse( sint_t *long_ax_idxs = (is_wide) ? S.cols : S.rows; if (S.dist.major_axis == MajorAxis::Short) { - S.next_state = repeated_fisher_yates( + sparse::repeated_fisher_yates( S.seed_state, S.dist.vec_nnz, short_ax_len, long_ax_len, short_ax_idxs, long_ax_idxs, S.vals ); } else { - S.next_state = repeated_fisher_yates( + sparse::repeated_fisher_yates( S.seed_state, S.dist.vec_nnz, long_ax_len, short_ax_len, long_ax_idxs, short_ax_idxs, S.vals ); } + // TODO: add check that S.next_state == output from the repeated_fisher_yates function. S.known_filled = true; - return S.next_state; + return; } template @@ -352,60 +417,6 @@ void print_sparse(SKOP const &S0) { std::cout << std::endl; } -// ============================================================================= -/// WARNING: this function is not part of the public API. -/// -template -static auto repeated_fisher_yates( - const RNGState &state, - int64_t vec_nnz, - int64_t dim_major, - int64_t dim_minor, - sint_t *idxs_major, - sint_t *idxs_minor, - T *vals -) { - randblas_error_if(vec_nnz > dim_major); - std::vector vec_work(dim_major); - for (sint_t j = 0; j < dim_major; ++j) - vec_work[j] = j; - std::vector pivots(vec_nnz); - RNG gen; - auto [ctr, key] = state; - for (sint_t i = 0; i < dim_minor; ++i) { - sint_t offset = i * vec_nnz; - auto ctri = ctr; - ctri.incr(offset); - for (sint_t j = 0; j < vec_nnz; ++j) { - // one step of Fisher-Yates shuffling - auto rv = gen(ctri, key); - sint_t ell = j + rv[0] % (dim_major - j); - pivots[j] = ell; - sint_t swap = vec_work[ell]; - vec_work[ell] = vec_work[j]; - vec_work[j] = swap; - // update (rows, cols, vals) - idxs_major[j + offset] = (sint_t) swap; - vals[j + offset] = (rv[1] % 2 == 0) ? 1.0 : -1.0; - idxs_minor[j + offset] = (sint_t) i; - // increment counter - ctri.incr(); - } - // Restore vec_work for next iteration of Fisher-Yates. - // This isn't necessary from a statistical perspective, - // but it makes it easier to generate submatrices of - // a given SparseSkOp. - for (sint_t j = 1; j <= vec_nnz; ++j) { - sint_t jj = vec_nnz - j; - sint_t swap = idxs_major[jj + offset]; - sint_t ell = pivots[jj]; - vec_work[jj] = vec_work[ell]; - vec_work[ell] = swap; - } - ctr = ctri; - } - return RNGState {ctr, key}; -} } // end namespace RandBLAS diff --git a/RandBLAS/util.hh b/RandBLAS/util.hh index c912b159..eaaf8d74 100644 --- a/RandBLAS/util.hh +++ b/RandBLAS/util.hh @@ -179,6 +179,22 @@ void transpose_square(T* A, int64_t n, int64_t lda) { return; } +template +void omatcopy(int64_t m, int64_t n, const T* A, int64_t irs_a, int64_t ics_a, T* B, int64_t irs_b, int64_t ics_b) { + // TODO: + // 1. Order the loops with consideration to cache efficiency. + // 2. Vectorize one of the loops with blas::copy or std::memcpy. 
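+    // Stride semantics: entry (i, j) of A lives at A[i*irs_a + j*ics_a], and entry (i, j)
+    // of B lives at B[i*irs_b + j*ics_b]. Strides (1, lda) describe a column-major buffer
+    // with leading dimension lda; strides (lda, 1) describe a row-major buffer.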
+ #define MAT_A(_i, _j) A[(_i)*irs_a + (_j)*ics_a] + #define MAT_B(_i, _j) B[(_i)*irs_b + (_j)*ics_b] + for (int64_t i = 0; i < m; ++i) { + for (int64_t j = 0; j < n; ++j) { + MAT_B(i,j) = MAT_A(i,j); + } + } + #undef MAT_A + #undef MAT_B + return; +} template void flip_layout(blas::Layout layout_in, int64_t m, int64_t n, std::vector &A, int64_t lda_in, int64_t lda_out) { @@ -207,18 +223,9 @@ void flip_layout(blas::Layout layout_in, int64_t m, int64_t n, std::vector &A std::vector A_in(A); T* A_buff_in = A_in.data(); T* A_buff_out = A.data(); - - #define A_IN(_i, _j) A_buff_in[(_i)*irs_in + (_j)*ics_in] - #define A_OUT(_i, _j) A_buff_out[(_i)*irs_out + (_j)*ics_out] - for (int64_t i = 0; i < m; ++i) { - for (int64_t j = 0; j < n; ++j) { - A_OUT(i,j) = A_IN(i,j); - } - } + omatcopy(m, n, A_buff_in, irs_in, ics_in, A_buff_out, irs_out, ics_out); A.erase(A.begin() + len_buff_A_out, A.end()); A.resize(len_buff_A_out); - #undef A_IN - #undef A_OUT return; } From 1946a1c98193e4938eabc99e53a88c1c1d8947fc Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Sat, 27 Jul 2024 13:28:45 -0400 Subject: [PATCH 05/37] cleanup of fill_dense_submat_impl --- RandBLAS/dense_skops.hh | 81 ++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index c7478ac3..8a250de0 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -67,6 +67,13 @@ static inline RNGState compute_next_state(DD dist, RNGState state) { return RNGState(0); } +template +inline void copy_promote(int n, const T_IN &a, T_OUT* b) { + for (int i = 0; i < n; ++i) + b[i] = static_cast(a[i]); + return; +} + /** * Fill buff with random values so it gives a row-major representation of an n_srows \math{\times} n_scols * submatrix of some implicitly defined parent matrix. 
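The next hunks rework the counter arithmetic inside fill_dense_submat_impl. As a worked instance of that arithmetic (numbers chosen purely for illustration), take a parent matrix with n_cols = 10, a counter type with static_size = 4, and a submatrix row of n_scols = 6 entries whose first entry has flattened index ptr = 23 in the parent:

    #include <cstdint>

    constexpr int64_t n_cols = 10, ctr_size = 4, ptr = 23, n_scols = 6;
    constexpr int64_t pad = (ctr_size - n_cols % ctr_size) % ctr_size;               // 2
    constexpr int64_t ptr_padded = ptr + ptr / n_cols * pad;                         // 23 + 2*2 = 27
    constexpr int64_t ctr_mat_start     = ptr_padded / ctr_size;                     // 6
    constexpr int64_t first_block_start = ptr_padded % ctr_size;                     // 3
    constexpr int64_t ctr_mat_row_end   = (ptr_padded + n_scols - 1) / ctr_size;     // 32 / 4 = 8
    constexpr int64_t last_block_stop   = (ptr_padded + n_scols - 1) % ctr_size + 1; // 1
    constexpr int64_t ctr_inter_row_stride = (n_cols + pad) / ctr_size;              // 3
    // This row touches counters {6, 7, 8}: entries [3, 4) of counter 6's output, all four
    // entries of counter 7's output, and entry [0, 1) of counter 8's output, i.e.
    // 1 + 4 + 1 = 6 = n_scols values. The next submatrix row starts 3 counters later.
    static_assert(pad == 2 && ptr_padded == 27 && ctr_mat_start == 6, "");
    static_assert(first_block_start == 3 && ctr_mat_row_end == 8 && last_block_stop == 1, "");
    static_assert(ctr_inter_row_stride == 3, "");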
@@ -127,25 +134,28 @@ static RNGState fill_dense_submat_impl( using KEY_t = typename RNG::key_type; CTR_t c = seed.counter; KEY_t k = seed.key; + int64_t ctr_size = CTR_t::static_size; int64_t pad = 0; // ^ computed such that n_cols+pad is divisible by RNG::static_size - if (n_cols % CTR_t::static_size != 0) { - pad = CTR_t::static_size - n_cols % CTR_t::static_size; + if (n_cols % ctr_size != 0) { + pad = ctr_size - n_cols % ctr_size; } + int64_t n_cols_padded = n_cols + pad; - // ^ smallest number of columns, greater than or equal to n_cols, that would be divisible by CTR_t::static_size + // ^ smallest number of columns, greater than or equal to n_cols, that would be divisible by ctr_size int64_t ptr_padded = ptr + ptr / n_cols * pad; // ^ ptr corresponding to the padded matrix - int64_t r0_padded = ptr_padded / CTR_t::static_size; + int64_t ctr_mat_start = ptr_padded / ctr_size; // ^ starting counter corresponding to ptr_padded - int64_t r1_padded = (ptr_padded + n_scols - 1) / CTR_t::static_size; + int64_t ctr_mat_row_end = (ptr_padded + n_scols - 1) / ctr_size; // ^ ending counter corresponding to ptr of the last element of the row - int64_t ctr_gap = n_cols_padded / CTR_t::static_size; + int64_t ctr_inter_row_stride = n_cols_padded / ctr_size; // ^ number of counters between the first counter of the row to the first counter of the next row; - int64_t s0 = ptr_padded % CTR_t::static_size; - int64_t e1 = (ptr_padded + n_scols - 1) % CTR_t::static_size; + int64_t s0 = ptr_padded % ctr_size; + int64_t e1 = 1 + (ptr_padded + n_scols - 1) % ctr_size; + int64_t multiple_incrs_per_row = (ctr_mat_start < ctr_mat_row_end) ; int64_t num_thrds = 1; #if defined(RandBLAS_HAS_OpenMP) @@ -154,20 +164,14 @@ static RNGState fill_dense_submat_impl( num_thrds = omp_get_num_threads(); } #endif - - //Instead of using thrd_arr just initialize ctr_arr to be zero counters; - CTR_t *ctr_arr = new CTR_t[num_thrds]; - for (int i = 0; i < num_thrds; i++) { - ctr_arr[i] = c; - } + CTR_t *ctr_arr = new CTR_t[num_thrds]{c}; #pragma omp parallel firstprivate(c, k) { auto cc = c; int64_t prev = 0; - int64_t i; - int64_t r0, r1; + int64_t row_ctr_start, row_ctr_end; int64_t ind; int64_t thrd = 0; @@ -175,44 +179,39 @@ static RNGState fill_dense_submat_impl( for (int64_t row = 0; row < n_srows; row++) { #if defined(RandBLAS_HAS_OpenMP) - thrd = omp_get_thread_num(); + thrd = omp_get_thread_num(); #endif - ind = 0; - int64_t __r01_offset = safe_signed_int_product(ctr_gap, row); - r0 = r0_padded + __r01_offset; - r1 = r1_padded + __r01_offset; - - cc.incr(r0 - prev); - prev = r0; + int64_t __r01_offset = safe_signed_int_product(ctr_inter_row_stride, row); + row_ctr_start = ctr_mat_start + __r01_offset; + row_ctr_end = ctr_mat_row_end + __r01_offset; + + T* smat_curr_row = smat + row*lda; + cc.incr(row_ctr_start - prev); + prev = row_ctr_start; auto rv = OP::generate(rng, cc, k); - int64_t range = (r1 > r0)? CTR_t::static_size - 1 : e1; - for (i = s0; i <= range; i++) { - smat[ind + row * lda] = rv[i]; - ind++; + int64_t first_block_len = ((multiple_incrs_per_row) ? 
ctr_size : e1) - s0; + // beginning + for (int i = 0; i < first_block_len; i++) { + smat_curr_row[i] = rv[i+s0]; } + ind = first_block_len; // middle - int64_t tmp = r0; - while( tmp < r1 - 1) { + int64_t implicit_ctr = row_ctr_start; + while( implicit_ctr < row_ctr_end - 1) { cc.incr(); prev++; rv = OP::generate(rng, cc, k); - for (i = 0; i < CTR_t::static_size; i++) { - smat[ind + row * lda] = rv[i]; - ind++; - } - tmp++; + copy_promote(ctr_size, rv, smat_curr_row + ind); + ind = ind + ctr_size; + implicit_ctr++; } - // end - if ( r1 > r0 ) { + if ( row_ctr_start < row_ctr_end ) { cc.incr(); prev++; rv = OP::generate(rng, cc, k); - for (i = 0; i <= e1; i++) { - smat[ind + row * lda] = rv[i]; - ind++; - } + copy_promote(e1, rv, smat_curr_row + ind); } ctr_arr[thrd] = cc; } From 4dae5a0839bf9068a0927e15faf028fdeaf3bf26 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Sat, 27 Jul 2024 14:22:13 -0400 Subject: [PATCH 06/37] clean up fill_dense_submat_impl --- RandBLAS/dense_skops.hh | 67 ++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index 8a250de0..5d5dd912 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -137,34 +137,35 @@ static RNGState fill_dense_submat_impl( int64_t ctr_size = CTR_t::static_size; int64_t pad = 0; - // ^ computed such that n_cols+pad is divisible by RNG::static_size + // ^ computed such that n_cols+pad is divisible by ctr_size if (n_cols % ctr_size != 0) { pad = ctr_size - n_cols % ctr_size; } int64_t n_cols_padded = n_cols + pad; - // ^ smallest number of columns, greater than or equal to n_cols, that would be divisible by ctr_size + // ^ pad as necessary in order to be divisible by ctr_size int64_t ptr_padded = ptr + ptr / n_cols * pad; // ^ ptr corresponding to the padded matrix - int64_t ctr_mat_start = ptr_padded / ctr_size; - // ^ starting counter corresponding to ptr_padded - int64_t ctr_mat_row_end = (ptr_padded + n_scols - 1) / ctr_size; - // ^ ending counter corresponding to ptr of the last element of the row + int64_t ctr_mat_start = ptr_padded / ctr_size; + int64_t first_block_start = ptr_padded % ctr_size; + // ^ counter and [position within the counter's array] for index "ptr_padded". + int64_t ctr_mat_row_end = (ptr_padded + n_scols - 1) / ctr_size; + int64_t last_block_stop = ((ptr_padded + n_scols - 1) % ctr_size) + 1; + // ^ counter and [1 + position within the counter's array] for index "(ptr_padded + n_scols - 1)". int64_t ctr_inter_row_stride = n_cols_padded / ctr_size; - // ^ number of counters between the first counter of the row to the first counter of the next row; - int64_t s0 = ptr_padded % ctr_size; - int64_t e1 = 1 + (ptr_padded + n_scols - 1) % ctr_size; - int64_t multiple_incrs_per_row = (ctr_mat_start < ctr_mat_row_end) ; + // ^ number of counters between the first counter of a given row to the first counter of the next row; + bool one_block_per_row = ctr_mat_start == ctr_mat_row_end; + int64_t first_block_len = ((one_block_per_row) ? 
last_block_stop : ctr_size) - first_block_start; - int64_t num_thrds = 1; + int64_t num_threads = 1; #if defined(RandBLAS_HAS_OpenMP) #pragma omp parallel { - num_thrds = omp_get_num_threads(); + num_threads = omp_get_num_threads(); } #endif - CTR_t *ctr_arr = new CTR_t[num_thrds]{c}; + CTR_t *ctr_arr = new CTR_t[num_threads]{c}; #pragma omp parallel firstprivate(c, k) { @@ -172,55 +173,53 @@ static RNGState fill_dense_submat_impl( auto cc = c; int64_t prev = 0; int64_t row_ctr_start, row_ctr_end; - int64_t ind; - int64_t thrd = 0; + int64_t thread = 0; #pragma omp for for (int64_t row = 0; row < n_srows; row++) { #if defined(RandBLAS_HAS_OpenMP) - thrd = omp_get_thread_num(); + thread = omp_get_thread_num(); #endif int64_t __r01_offset = safe_signed_int_product(ctr_inter_row_stride, row); row_ctr_start = ctr_mat_start + __r01_offset; row_ctr_end = ctr_mat_row_end + __r01_offset; - T* smat_curr_row = smat + row*lda; cc.incr(row_ctr_start - prev); prev = row_ctr_start; auto rv = OP::generate(rng, cc, k); - int64_t first_block_len = ((multiple_incrs_per_row) ? ctr_size : e1) - s0; - // beginning + + T* smat_row = smat + row*lda; for (int i = 0; i < first_block_len; i++) { - smat_curr_row[i] = rv[i+s0]; + smat_row[i] = rv[i+first_block_start]; } - ind = first_block_len; - // middle + if ( one_block_per_row ) + continue; + + // middle blocks + int64_t ind = first_block_len; int64_t implicit_ctr = row_ctr_start; while( implicit_ctr < row_ctr_end - 1) { cc.incr(); prev++; rv = OP::generate(rng, cc, k); - copy_promote(ctr_size, rv, smat_curr_row + ind); + copy_promote(ctr_size, rv, smat_row + ind); ind = ind + ctr_size; implicit_ctr++; } - // end - if ( row_ctr_start < row_ctr_end ) { - cc.incr(); - prev++; - rv = OP::generate(rng, cc, k); - copy_promote(e1, rv, smat_curr_row + ind); - } - ctr_arr[thrd] = cc; + // last block + cc.incr(); + prev++; + rv = OP::generate(rng, cc, k); + copy_promote(last_block_stop, rv, smat_row + ind); + ctr_arr[thread] = cc; } - } - //finds the largest counter in the counter array + // find the largest counter in the counter array CTR_t max_c = ctr_arr[0]; - for (int i = 1; i < num_thrds; i++) { + for (int i = 1; i < num_threads; i++) { if (compare_ctr(ctr_arr[i], max_c)) { max_c = ctr_arr[i]; } From 421197c69c1e45464447d66b000b4a3f04a9cd21 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Sat, 27 Jul 2024 17:57:37 -0400 Subject: [PATCH 07/37] heavily revise fill_dense_submat_impl --- RandBLAS/dense_skops.hh | 106 ++++++++++++---------------------------- 1 file changed, 30 insertions(+), 76 deletions(-) diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index 5d5dd912..2bb18869 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -47,19 +47,6 @@ namespace RandBLAS::dense { -template -bool compare_ctr(typename RNG::ctr_type c1, typename RNG::ctr_type c2) { - int len = c1.size(); - - for (int ind = len - 1; ind >= 0; ind--) { - if (c1[ind] > c2[ind]) { - return true; - } else if (c1[ind] < c2[ind]) { - return false; - } - } - return false; -} template static inline RNGState compute_next_state(DD dist, RNGState state) { @@ -132,101 +119,68 @@ static RNGState fill_dense_submat_impl( RNG rng; using CTR_t = typename RNG::ctr_type; using KEY_t = typename RNG::key_type; - CTR_t c = seed.counter; - KEY_t k = seed.key; - int64_t ctr_size = CTR_t::static_size; + const int64_t ctr_size = CTR_t::static_size; int64_t pad = 0; - // ^ computed such that n_cols+pad is divisible by ctr_size + // ^ computed such that n_cols+pad is divisible by ctr_size if 
(n_cols % ctr_size != 0) { pad = ctr_size - n_cols % ctr_size; } - - int64_t n_cols_padded = n_cols + pad; - // ^ pad as necessary in order to be divisible by ctr_size - int64_t ptr_padded = ptr + ptr / n_cols * pad; + const int64_t ptr_padded = ptr + ptr / n_cols * pad; // ^ ptr corresponding to the padded matrix - int64_t ctr_mat_start = ptr_padded / ctr_size; - int64_t first_block_start = ptr_padded % ctr_size; + const int64_t ctr_mat_start = ptr_padded / ctr_size; + const int64_t first_block_start = ptr_padded % ctr_size; // ^ counter and [position within the counter's array] for index "ptr_padded". - int64_t ctr_mat_row_end = (ptr_padded + n_scols - 1) / ctr_size; - int64_t last_block_stop = ((ptr_padded + n_scols - 1) % ctr_size) + 1; + const int64_t ctr_mat_row_end = (ptr_padded + n_scols - 1) / ctr_size; + const int64_t last_block_stop = ((ptr_padded + n_scols - 1) % ctr_size) + 1; // ^ counter and [1 + position within the counter's array] for index "(ptr_padded + n_scols - 1)". - int64_t ctr_inter_row_stride = n_cols_padded / ctr_size; + const int64_t ctr_inter_row_stride = (n_cols + pad) / ctr_size; // ^ number of counters between the first counter of a given row to the first counter of the next row; - bool one_block_per_row = ctr_mat_start == ctr_mat_row_end; - int64_t first_block_len = ((one_block_per_row) ? last_block_stop : ctr_size) - first_block_start; + const bool one_block_per_row = ctr_mat_start == ctr_mat_row_end; + const int64_t first_block_len = ((one_block_per_row) ? last_block_stop : ctr_size) - first_block_start; - int64_t num_threads = 1; - #if defined(RandBLAS_HAS_OpenMP) - #pragma omp parallel - { - num_threads = omp_get_num_threads(); - } - #endif - CTR_t *ctr_arr = new CTR_t[num_threads]{c}; + CTR_t temp_c = seed.counter; + temp_c.incr(ctr_mat_start); + const CTR_t c = temp_c; + const KEY_t k = seed.key; - #pragma omp parallel firstprivate(c, k) + #pragma omp parallel { - - auto cc = c; - int64_t prev = 0; - int64_t row_ctr_start, row_ctr_end; - int64_t thread = 0; - - #pragma omp for + #pragma omp for schedule(static) for (int64_t row = 0; row < n_srows; row++) { - - #if defined(RandBLAS_HAS_OpenMP) - thread = omp_get_thread_num(); - #endif - - int64_t __r01_offset = safe_signed_int_product(ctr_inter_row_stride, row); - row_ctr_start = ctr_mat_start + __r01_offset; - row_ctr_end = ctr_mat_row_end + __r01_offset; + + int64_t incr_from_c = safe_signed_int_product(ctr_inter_row_stride, row); - cc.incr(row_ctr_start - prev); - prev = row_ctr_start; - auto rv = OP::generate(rng, cc, k); + auto c_row = c; + c_row.incr(incr_from_c); + auto rv = OP::generate(rng, c_row, k); T* smat_row = smat + row*lda; for (int i = 0; i < first_block_len; i++) { smat_row[i] = rv[i+first_block_start]; } - if ( one_block_per_row ) + if ( one_block_per_row ) { continue; - + } // middle blocks int64_t ind = first_block_len; - int64_t implicit_ctr = row_ctr_start; - while( implicit_ctr < row_ctr_end - 1) { - cc.incr(); - prev++; - rv = OP::generate(rng, cc, k); + for (int i = 0; i < (ctr_mat_row_end - ctr_mat_start - 1); ++i) { + c_row.incr(); + rv = OP::generate(rng, c_row, k); copy_promote(ctr_size, rv, smat_row + ind); ind = ind + ctr_size; - implicit_ctr++; } // last block - cc.incr(); - prev++; - rv = OP::generate(rng, cc, k); + c_row.incr(); + rv = OP::generate(rng, c_row, k); copy_promote(last_block_stop, rv, smat_row + ind); - ctr_arr[thread] = cc; } } // find the largest counter in the counter array - CTR_t max_c = ctr_arr[0]; - for (int i = 1; i < num_threads; i++) { - if 
(compare_ctr(ctr_arr[i], max_c)) { - max_c = ctr_arr[i]; - } - } - delete [] ctr_arr; - - max_c.incr(); + CTR_t max_c = c; + max_c.incr(n_srows * ctr_inter_row_stride); return RNGState {max_c, k}; } From ae624fc020130640c9acb0721329b9ae4b3dac83 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Sat, 27 Jul 2024 18:44:57 -0400 Subject: [PATCH 08/37] implemented compute_next_state for DenseSkOp and SparseSkOp --- RandBLAS/dense_skops.hh | 25 +++++++++++++++++++------ RandBLAS/sparse_skops.hh | 27 ++++++++++++++++++--------- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index 2bb18869..5d1ce130 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -48,12 +48,6 @@ namespace RandBLAS::dense { -template -static inline RNGState compute_next_state(DD dist, RNGState state) { - // Need logic that depends on DenseDistName. - return RNGState(0); -} - template inline void copy_promote(int n, const T_IN &a, T_OUT* b) { for (int i = 0; i < n; ++i) @@ -184,6 +178,25 @@ static RNGState fill_dense_submat_impl( return RNGState {max_c, k}; } +template +RNGState compute_next_state(DD dist, RNGState state) { + if (dist.major_axis == MajorAxis::Undefined) { + // implies dist.family = DenseDistName::BlackBox + return state; + } + int64_t major_len = major_axis_length(dist); + int64_t minor_len = dist.n_rows + (dist.n_cols - major_len); + int64_t ctr_size = RNG::ctr_type::static_size; + int64_t pad = 0; + if (major_len % ctr_size != 0) { + pad = ctr_size - major_len % ctr_size; + } + int64_t ctr_major_axis_stride = (major_len + pad) / ctr_size; + int64_t full_incr = safe_signed_int_product(ctr_major_axis_stride, minor_len); + state.counter.incr(full_incr); + return state; +} + } // end namespace RandBLAS::dense diff --git a/RandBLAS/sparse_skops.hh b/RandBLAS/sparse_skops.hh index cd2ad3cc..47e27919 100644 --- a/RandBLAS/sparse_skops.hh +++ b/RandBLAS/sparse_skops.hh @@ -46,10 +46,6 @@ namespace RandBLAS::sparse { -template -static RNGState compute_next_state(SD dist, RNGState seed_state) { - return RNGState(0); -} // ============================================================================= /// WARNING: this function is not part of the public API. @@ -73,11 +69,11 @@ static RNGState repeated_fisher_yates( auto [ctr, key] = state; for (sint_t i = 0; i < dim_minor; ++i) { sint_t offset = i * vec_nnz; - auto ctri = ctr; - ctri.incr(offset); + auto ctr_work = ctr; + ctr_work.incr(offset); for (sint_t j = 0; j < vec_nnz; ++j) { // one step of Fisher-Yates shuffling - auto rv = gen(ctri, key); + auto rv = gen(ctr_work, key); sint_t ell = j + rv[0] % (dim_major - j); pivots[j] = ell; sint_t swap = vec_work[ell]; @@ -88,7 +84,7 @@ static RNGState repeated_fisher_yates( vals[j + offset] = (rv[1] % 2 == 0) ? 1.0 : -1.0; idxs_minor[j + offset] = (sint_t) i; // increment counter - ctri.incr(); + ctr_work.incr(); } // Restore vec_work for next iteration of Fisher-Yates. 
// This isn't necessary from a statistical perspective, @@ -101,10 +97,23 @@ static RNGState repeated_fisher_yates( vec_work[jj] = vec_work[ell]; vec_work[ell] = swap; } - ctr = ctri; } return RNGState {ctr, key}; } + +template +static RNGState compute_next_state(SD dist, RNGState state) { + int64_t minor_len; + if (dist.major_axis == MajorAxis::Short) { + minor_len = std::min(dist.n_rows, dist.n_cols); + } else { + minor_len = std::max(dist.n_rows, dist.n_cols); + } + int64_t full_incr = minor_len * dist.vec_nnz; + state.counter.incr(full_incr); + return state; +} + } namespace RandBLAS { From d7312902b49b88e67d2507c6381a95f68feca8a7 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Sat, 27 Jul 2024 18:59:51 -0400 Subject: [PATCH 09/37] left out --- test/CMakeLists.txt | 4 +- test/test_basic_rng/test_continuous.cc | 82 ++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 test/test_basic_rng/test_continuous.cc diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1ccf04e1..954689cb 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -52,9 +52,9 @@ if (GTest_FOUND) ##################################################################### add_executable(RandBLAS_stats - test_basic_rng/test_continuous.cc - test_basic_rng/test_discrete.cc test_basic_rng/test_r123.cc + test_basic_rng/test_discrete.cc + test_basic_rng/test_continuous.cc ) target_link_libraries(RandBLAS_stats RandBLAS GTest::GTest GTest::Main) gtest_discover_tests(RandBLAS_stats) diff --git a/test/test_basic_rng/test_continuous.cc b/test/test_basic_rng/test_continuous.cc new file mode 100644 index 00000000..d1a32839 --- /dev/null +++ b/test/test_basic_rng/test_continuous.cc @@ -0,0 +1,82 @@ +// Copyright, 2024. See LICENSE for copyright holder information. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// (1) Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// (2) Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// (3) Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// + +#include "RandBLAS/config.h" +#include "RandBLAS/base.hh" +#include "RandBLAS/util.hh" +using RandBLAS::RNGState; +#include "rng_common.hh" + +#include +#include +#include +#include +#include +#include +#include + + +class TestContinuous : public ::testing::Test +{ + protected: + + virtual void SetUp(){}; + + virtual void TearDown(){}; + + static void kolmogorov_smirnov_tester( + // std::vector &samples, std::vector &true_cdf, double critical_value + ) { + // TODO: WRITE ME + + // auto num_samples = (int) samples.size(); + // auto N = (int64_t) true_cdf.size(); + // std::vector sample_cdf(N, 0.0); + // for (int64_t s : samples) + // sample_cdf[s] += 1; + // RandBLAS::util::weights_to_cdf(N, sample_cdf.data()); + + // for (int i = 0; i < num_samples; ++i) { + // auto diff = (double) std::abs(sample_cdf[i] - true_cdf[i]); + // ASSERT_LT(diff, critical_value); + // } + return; + } + + template + static void run_uneg11() { + + } + + template + static void run_gaussian() { + + } +}; + From d7b9970624765e4d44ccc3440d830e0ed5981d03 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 2 Aug 2024 09:17:01 -0400 Subject: [PATCH 10/37] Resolve issue #105. Remove unused "convenience" constructors for DenseSkOp. Breaking API change that removes fill_dense(dist, n_rows, n_cols, ro_s, co_s, buff, seed) --- which returned a tuple of Layout and RNGState --- and replaces it with fill_dense(layout, dist, n_rows, n_cols, ro_s, co_s, buff, seed) --- which returns an RNGState. --- RandBLAS/dense_skops.hh | 248 ++++++------------ RandBLAS/random_gen.hh | 6 +- RandBLAS/skge.hh | 18 +- RandBLAS/sparse_data/sksp.hh | 16 +- RandBLAS/sparse_skops.hh | 5 - rtd/source/index.rst | 12 +- rtd/source/tutorial/sampling_skops.rst | 30 +-- test/test_datastructures/test_denseskop.cc | 8 +- .../test_datastructures/test_spmats/common.hh | 2 +- test/test_matmul_cores/linop_common.hh | 4 +- .../test_sketch_symmetric.cc | 2 +- 11 files changed, 107 insertions(+), 244 deletions(-) diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index 5d1ce130..82528036 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -184,6 +184,8 @@ RNGState compute_next_state(DD dist, RNGState state) { // implies dist.family = DenseDistName::BlackBox return state; } + // ^ This is the only place where MajorAxis is actually used to some + // productive end. int64_t major_len = major_axis_length(dist); int64_t minor_len = dist.n_rows + (dist.n_cols - major_len); int64_t ctr_size = RNG::ctr_type::static_size; @@ -240,14 +242,10 @@ struct DenseDist { const DenseDistName family; // --------------------------------------------------------------------------- - /// This member indirectly sets the storage order of buffers of - /// sketching operators that are sampled from this distribution. - /// - /// We note that the storage order of a DenseSkOp's underlying buffer does not - /// affect whether the operator can be applied to row-major or column-major data. - /// Mismatched data layouts are resolved automatically and - /// with zero copies inside RandBLAS::sketch_general. Therefore most users need - /// not spend any brain power thinking about how this value should be set. + /// This member affects whether samples from this distribution have their + /// entries filled row-wise or column-wise. While there is no statistical + /// difference between these two filling orders, there are situations + /// where one order or the other might be preferred. /// /// @verbatim embed:rst:leading-slashes /// .. 
dropdown:: *Notes for experts* @@ -258,10 +256,9 @@ struct DenseDist { /// a larger sketching operator and (2) one of larger operator's dimensions /// cannot be known before the iterative process starts. /// - /// Essentially, column-major storage order lets us - /// stack operators horizontally in a consistent way, while row-major storage order - /// lets us stack operators vertically in a consistent way. The mapping from - /// major_axis to storage order is given in the table below. + /// Essentially, a column-wise fill order lets us stack operators horizontally + /// in a consistent way, while row-wise fill order lets us stack vertically + /// in a consistent way. The mapping from major_axis to fill order is given below. /// /// .. list-table:: /// :widths: 34 33 33 @@ -271,11 +268,11 @@ struct DenseDist { /// - :math:`\texttt{major_axis} = \texttt{Long}` /// - :math:`\texttt{major_axis} = \texttt{Short}` /// * - :math:`\texttt{n_rows} > \texttt{n_cols}` - /// - column major - /// - row major + /// - column-wise + /// - row-wise /// * - :math:`\texttt{n_rows} \leq \texttt{n_cols}` - /// - row major - /// - column major + /// - row-wise + /// - column-wise /// @endverbatim const MajorAxis major_axis; @@ -306,7 +303,7 @@ struct DenseDist { inline blas::Layout dist_to_layout( - DenseDist D + const DenseDist &D ) { randblas_require(D.major_axis != MajorAxis::Undefined); bool is_wide = D.n_rows < D.n_cols; @@ -323,7 +320,7 @@ inline blas::Layout dist_to_layout( } inline int64_t major_axis_length( - DenseDist D + const DenseDist &D ) { randblas_require(D.major_axis != MajorAxis::Undefined); return (D.major_axis == MajorAxis::Long) ? @@ -359,9 +356,11 @@ struct DenseSkOp { /// full sketching operator has been sampled. const RNGState next_state; - T *buff = nullptr; // memory - const blas::Layout layout; // matrix storage order - bool del_buff_on_destruct = false; // only applies if fill_dense(S) has been called. + + T *buff = nullptr; // memory + blas::Layout layout; // matrix storage order + bool del_buff_on_destruct = false; // only applies if fill_dense(S) has been called.å + ///////////////////////////////////////////////////////////////////// // @@ -372,8 +371,7 @@ struct DenseSkOp { DenseSkOp( DenseDist dist, RNGState const &state, - T *buff, - blas::Layout forced_layout + T *buff ) : // variable definitions n_rows(dist.n_rows), n_cols(dist.n_cols), @@ -381,7 +379,7 @@ struct DenseSkOp { seed_state(state), next_state(dense::compute_next_state(dist, state)), buff(buff), - layout(forced_layout) + layout(dist_to_layout(dist)) { // sanity checks randblas_require(this->dist.n_rows > 0); randblas_require(this->dist.n_cols > 0); @@ -389,45 +387,38 @@ struct DenseSkOp { randblas_require(this->buff != nullptr); } - DenseSkOp(DenseDist dist, RNGState const &state, T *buff - ) : DenseSkOp(dist, state, buff, dist_to_layout(dist)) {} + DenseSkOp( + int64_t n_rows, + int64_t n_cols, + DenseDist dist, + RNGState const &seed_state, + RNGState const &next_state, + T *buff, + blas::Layout layout, + bool del_buff_on_destruct + ) : + n_rows(n_rows), n_cols(n_cols), dist(dist), + seed_state(seed_state), next_state(next_state), + buff(buff), layout(layout), del_buff_on_destruct(del_buff_on_destruct) { } ///--------------------------------------------------------------------------- - /// The preferred constructor for DenseSkOp objects. There are other - /// constructors, but they don't appear in the web documentation. + /// Construct a DenseSkOp object, \math{S}. 
/// /// @param[in] dist - /// A DenseDist object. - /// - Defines the number of rows and columns in this sketching operator. - /// - Defines the (scalar-valued) distribution of each entry in this sketching operator. + /// DenseDist. + /// - Specifies the dimensions of \math{S}. + /// - Specifies the (scalar) distribution of \math{S}'s entries. /// /// @param[in] state - /// An RNGState object. + /// RNGState. /// - The RNG will use this as the starting point to generate all - /// random numbers needed for this sketching operator. + /// random numbers needed for \math{S}. /// DenseSkOp( DenseDist dist, RNGState const &state ) : DenseSkOp(dist, state, nullptr) {}; - // Convenience constructor (a wrapper) - DenseSkOp( - DenseDist dist, - uint32_t key, - T *buff = nullptr - ) : DenseSkOp(dist, RNGState(key), buff) {}; - - // Convenience constructor (a wrapper) - DenseSkOp( - DenseDistName family, - int64_t n_rows, - int64_t n_cols, - uint32_t key, - T *buff = nullptr, - MajorAxis ma = MajorAxis::Long - ) : DenseSkOp(DenseDist{n_rows, n_cols, family, ma}, RNGState(key), buff) {}; - // Destructor ~DenseSkOp() { if (this->del_buff_on_destruct) @@ -448,16 +439,22 @@ struct DenseSkOp { /// .. |joff| mathmacro:: \mathtt{j\_off} /// /// @endverbatim -/// Fill \math{\buff} so that \math{\mat(\buff)} is a submatrix of -/// an _implicit_ random sample from \math{\D}. +/// Fill \math{\buff} so that (1) \math{\mat(\buff)} is a submatrix of +/// an _implicit_ random sample from \math{\D}, and (2) \math{\mat(\buff)} +/// is determined by reading from \math{\buff} in "layout" order. /// /// If we denote the implicit sample from \math{\D} by \math{S}, then we have /// @verbatim embed:rst:leading-slashes /// .. math:: /// \mat(\buff) = S[\ioff:(\ioff + \nrows),\, \joff:(\joff + \ncols)] /// @endverbatim -/// on exit. +/// on exit. If the specified layout is different from the natural layout +/// associated with \math{\D} (via its MajorAxis and dimensions) then we will +/// allocate extra workspace needed for an out-of-place transposition. /// +/// @param[in] layout +/// blas::Layout::RowMajor or blas::Layout::ColMajor +/// - The storage order for \math{\mat(\buff)} on exit. /// @param[in] D /// A DenseDist object. /// - A distribution over random matrices of shape (D.n_rows, D.n_cols). @@ -478,25 +475,13 @@ struct DenseSkOp { /// @param[in] buff /// Buffer of type T. /// - Length must be at least \math{\nrows \cdot \ncols}. -/// - The leading dimension of \math{\mat(\buff)} when reading from \math{\buff} -/// is either \math{\nrows} or \math{\ncols}, depending on the return value of this function -/// that indicates row-major or column-major layout. /// @param[in] seed /// A CBRNG state /// - Used to define \math{S} as a sample from \math{\D}. -/// -/// @returns -/// A std::pair consisting of "layout" and "next_state". -/// - \math{\buff} must be read in "layout" order -/// to recover \math{\mat(\buff)}. This layout is determined -/// from \math{\D} and cannot be controlled directly. -/// - If you want to control the layout (rather than be at the mercy -/// of \math{\D}) then you can call the overload of this function -/// that takes a layout as its first argument. That overload will -/// allocate memory internally to perform an out-of-place transpose. 
/// template -std::pair> fill_dense( +RNGState fill_dense( + blas::Layout required_layout, const DenseDist &D, int64_t n_rows, int64_t n_cols, @@ -508,10 +493,10 @@ std::pair> fill_dense( using RandBLAS::dense::fill_dense_submat_impl; randblas_require(D.n_rows >= n_rows + ro_s); randblas_require(D.n_cols >= n_cols + co_s); - blas::Layout layout = dist_to_layout(D); + blas::Layout natural_layout = dist_to_layout(D); int64_t ma_len = major_axis_length(D); int64_t n_rows_, n_cols_, ptr; - if (layout == blas::Layout::ColMajor) { + if (natural_layout == blas::Layout::ColMajor) { // operate on the transpose in row-major n_rows_ = n_cols; n_cols_ = n_rows; @@ -521,14 +506,15 @@ std::pair> fill_dense( n_cols_ = n_cols; ptr = safe_signed_int_product(ro_s, ma_len) + co_s; } + RNGState next_state{}; switch (D.family) { case DenseDistName::Gaussian: { - auto next_state_g = fill_dense_submat_impl(ma_len, buff, n_rows_, n_cols_, ptr, seed); - return std::make_pair(layout, next_state_g); + next_state = fill_dense_submat_impl(ma_len, buff, n_rows_, n_cols_, ptr, seed); + break; } case DenseDistName::Uniform: { - auto next_state_u = fill_dense_submat_impl(ma_len, buff, n_rows_, n_cols_, ptr, seed); - return std::make_pair(layout, next_state_u); + next_state = fill_dense_submat_impl(ma_len, buff, n_rows_, n_cols_, ptr, seed); + break; } case DenseDistName::BlackBox: { throw std::invalid_argument(std::string("fill_dense cannot be called with the BlackBox distribution.")); @@ -537,83 +523,7 @@ std::pair> fill_dense( throw std::runtime_error(std::string("Unrecognized distribution.")); } } -} - - -// ============================================================================= -/// @verbatim embed:rst:leading-slashes -/// -/// .. |mat| mathmacro:: \operatorname{mat} -/// .. |buff| mathmacro:: \mathtt{buff} -/// .. |D| mathmacro:: \mathcal{D} -/// .. |nrows| mathmacro:: \mathtt{n\_rows} -/// .. |ncols| mathmacro:: \mathtt{n\_cols} -/// .. |ioff| mathmacro:: \mathtt{i\_off} -/// .. |joff| mathmacro:: \mathtt{j\_off} -/// -/// @endverbatim -/// Fill \math{\buff} so that (1) \math{\mat(\buff)} is a submatrix of -/// an _implicit_ random sample from \math{\D}, and (2) \math{\mat(\buff)} -/// is determined by reading from \math{\buff} in "layout" order. -/// -/// If we denote the implicit sample from \math{\D} by \math{S}, then we have -/// @verbatim embed:rst:leading-slashes -/// .. math:: -/// \mat(\buff) = S[\ioff:(\ioff + \nrows),\, \joff:(\joff + \ncols)] -/// @endverbatim -/// on exit. If the specified layout is different from the natural layout -/// associated with \math{\D} (via its MajorAxis and dimensions) then we will -/// allocate extra workspace needed for an out-of-place transposition. -/// -/// Here's an example use-case. -/// -/// Suppose you have a DenseSkOp object "S", and you want to fill a buffer called "work" -/// in "layout" order, so that its contents match the submatrix of S with dimensions -/// (rows_subsmat_S, cols_submat_S) whose upper-left corner is offset by (ro_s, co_s) -/// from the upper-left corner of S. This could be done with the following function call: -/// -/// fill_dense(S.layout, S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, work, S.seed_state); -/// -/// @param[in] layout -/// blas::Layout::RowMajor or blas::Layout::ColMajor -/// - The storage order for \math{\mat(\buff)} on exit. -/// @param[in] D -/// A DenseDist object. -/// - A distribution over random matrices of shape (D.n_rows, D.n_cols). -/// @param[in] n_rows -/// A positive integer. 
-/// - The number of rows in \math{\mat(\buff)}. -/// @param[in] n_cols -/// A positive integer. -/// - The number of columns in \math{\mat(\buff)}. -/// @param[in] ro_s -/// A nonnegative integer. -/// - The row offset for \math{\mat(\buff)} as a submatrix of \math{S}. -/// - We require that \math{\ioff + \nrows} is at most D.n_rows. -/// @param[in] co_s -/// A nonnegative integer. -/// - The column offset for \math{\mat(\buff)} as a submatrix of \math{S}. -/// - We require that \math{\joff + \ncols} is at most D.n_cols. -/// @param[in] buff -/// Buffer of type T. -/// - Length must be at least \math{\nrows \cdot \ncols}. -/// @param[in] seed -/// A CBRNG state -/// - Used to define \math{S} as a sample from \math{\D}. -/// -template -RandBLAS::RNGState fill_dense( - blas::Layout required_layout, - const DenseDist &D, - int64_t n_rows, - int64_t n_cols, - int64_t ro_s, - int64_t co_s, - T* buff, - const RNGState &seed -) { int64_t size_mat = n_rows * n_cols; - auto [natural_layout, next_state] = fill_dense(D, n_rows, n_cols, ro_s, co_s, buff, seed); if (required_layout != natural_layout) { T* flip_work = new T[size_mat]; blas::copy(size_mat, buff, 1, flip_work, 1); @@ -657,12 +567,12 @@ RandBLAS::RNGState fill_dense( /// the caller's responsibility to perform a transpose as needed. /// template -std::pair> fill_dense( +RNGState fill_dense( const DenseDist &D, T *buff, const RNGState &seed ) { - return fill_dense(D, D.n_rows, D.n_cols, 0, 0, buff, seed); + return fill_dense(dist_to_layout(D), D, D.n_rows, D.n_cols, 0, 0, buff, seed); } // ============================================================================= @@ -677,11 +587,6 @@ std::pair> fill_dense( /// /// @param[in] S /// A DenseSkOp object. -/// -/// @return -/// An RNGState object. This is the state that should be used the next -/// time the program needs to generate random numbers for use in a randomized -/// algorithm. 
/// template void fill_dense( @@ -689,27 +594,20 @@ void fill_dense( ) { randblas_require(S.buff == nullptr); randblas_require(S.dist.family != DenseDistName::BlackBox); - int64_t m = S.dist.n_rows; - int64_t n = S.dist.n_cols; - int64_t size_S = m * n; - blas::Layout nat_layout = dist_to_layout(S.dist); - blas::Layout req_layout = S.layout; - // ^ Natural and requested layouts - S.buff = new T[size_S]; - - if (req_layout != nat_layout) { - T* work = new T[size_S]; - fill_dense(S.dist, work, S.seed_state); - auto [irs_nat, ics_nat] = layout_to_strides(nat_layout, m, n); - auto [irs_req, ics_req] = layout_to_strides(req_layout, m, n); - RandBLAS::util::omatcopy(m, n, work, irs_nat, ics_nat, S.buff, irs_req, ics_req); - delete [] work; - } else { - fill_dense(S.dist, S.buff, S.seed_state); - } + S.buff = new T[S.dist.n_rows * S.dist.n_cols]; + fill_dense(S.dist, S.buff, S.seed_state); S.del_buff_on_destruct = true; - return; } +template +DenseSkOp submatrix_as_blackbox(const DenseSkOp &S, int64_t n_rows, int64_t n_cols, int64_t ro_s, int64_t co_s) { + T *buff = new T[n_rows * n_cols]; + auto dl = dist_to_layout(S.dist); + fill_dense(dl, S.dist, n_rows, n_cols, ro_s, co_s, buff, S.seed_state); + DenseDist submatrix_dist{n_rows, n_cols, DenseDistName::BlackBox, MajorAxis::Undefined}; + DenseSkOp submatrix{n_rows, n_cols, submatrix_dist, S.seed_state, S.next_state, buff, dl, true}; + return submatrix; +} + } // end namespace RandBLAS diff --git a/RandBLAS/random_gen.hh b/RandBLAS/random_gen.hh index 714da89c..f502e981 100644 --- a/RandBLAS/random_gen.hh +++ b/RandBLAS/random_gen.hh @@ -151,9 +151,9 @@ struct uneg11 * * @tparam RNG a random123 CBRNG type * - * @param[in] a random123 CBRNG instance used to generate the sequence - * @param[in] the CBRNG counter - * @param[in] the CBRNG key + * @param[in] rng: a random123 CBRNG instance used to generate the sequence + * @param[in] c: CBRNG counter + * @param[in] k: CBRNG key * * @returns a std::array where N is the CBRNG's ctr_type::static_size * and T is deduced from the RNG's counter element type : float diff --git a/RandBLAS/skge.hh b/RandBLAS/skge.hh index 943dba14..316a8863 100644 --- a/RandBLAS/skge.hh +++ b/RandBLAS/skge.hh @@ -190,14 +190,8 @@ void lskge3( ){ auto [rows_submat_S, cols_submat_S] = dims_before_op(d, m, opS); if (!S.buff) { - // We'll make a shallow copy of the sketching operator, take responsibility for filling the memory - // of that sketching operator, and then call LSKGE3 with that new object. - T *work = new T[rows_submat_S * cols_submat_S]; - fill_dense(S.layout, S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, work, S.seed_state); - DenseDist D_{rows_submat_S, cols_submat_S, DenseDistName::BlackBox, MajorAxis::Undefined}; - DenseSkOp S_(D_, S.seed_state, work, S.layout); - lskge3(layout, opS, opA, d, n, m, alpha, S_, 0, 0, A, lda, beta, B, ldb); - delete [] work; + auto submat_S = submatrix_as_blackbox(S, rows_submat_S, cols_submat_S, ro_s, co_s); + lskge3(layout, opS, opA, d, n, m, alpha, submat_S, 0, 0, A, lda, beta, B, ldb); return; } randblas_require( S.dist.n_rows >= rows_submat_S + ro_s ); @@ -345,12 +339,8 @@ void rskge3( if (!S.buff) { // We'll make a shallow copy of the sketching operator, take responsibility for filling the memory // of that sketching operator, and then call RSKGE3 with that new object. 
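Since lskge3 and rskge3 now route buffer-free operators through submatrix_as_blackbox, a short sketch of the user-visible effect may help. The dimensions, key, and data below are illustrative, and the element type is assumed to be the operator's first template parameter:

    #include <RandBLAS.hh>
    #include <vector>
    #include <cstdint>

    int main() {
        // Apply rows [100, 150) of an implicit 1000 x 500 Gaussian operator to a 500 x 20
        // column-major data matrix. Because S has no buffer, lskge3 calls
        // submatrix_as_blackbox internally and only ever materializes the 50 x 500 block.
        int64_t d = 50, m = 500, n = 20;
        RandBLAS::DenseDist D(1000, 500);
        uint32_t key = 7;
        RandBLAS::DenseSkOp<double> S(D, RandBLAS::RNGState(key));
        std::vector<double> A(m * n, 1.0), B(d * n, 0.0);
        RandBLAS::lskge3(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::NoTrans,
                         d, n, m, 1.0, S, 100, 0, A.data(), m, 0.0, B.data(), d);
        return 0;
    }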
- T *work = new T[rows_submat_S * cols_submat_S]; - fill_dense(S.layout, S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, work, S.seed_state); - DenseDist D_{rows_submat_S, cols_submat_S, DenseDistName::BlackBox, MajorAxis::Undefined}; - DenseSkOp S_(D_, S.seed_state, work, S.layout); - rskge3(layout, opA, opS, m, d, n, alpha, A, lda, S_, 0, 0, beta, B, ldb); - delete [] work; + auto submat_S = submatrix_as_blackbox(S, rows_submat_S, cols_submat_S, ro_s, co_s); + rskge3(layout, opA, opS, m, d, n, alpha, A, lda, submat_S, 0, 0, beta, B, ldb); return; } randblas_require( S.dist.n_rows >= rows_submat_S + ro_s ); diff --git a/RandBLAS/sparse_data/sksp.hh b/RandBLAS/sparse_data/sksp.hh index 6be7912b..8045b4d8 100644 --- a/RandBLAS/sparse_data/sksp.hh +++ b/RandBLAS/sparse_data/sksp.hh @@ -166,12 +166,8 @@ void lsksp3( // B = op(submat(S)) @ op(submat(A)) auto [rows_submat_S, cols_submat_S] = dims_before_op(d, m, opS); if (!S.buff) { - T *buff = new T[rows_submat_S * cols_submat_S]; - fill_dense(S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, buff, S.seed_state); - DenseDist D{rows_submat_S, cols_submat_S, DenseDistName::BlackBox, S.dist.major_axis}; - DenseSkOp S_(D, S.seed_state, buff); - lsksp3(layout, opS, opA, d, n, m, alpha, S_, 0, 0, A, ro_a, co_a, beta, B, ldb); - delete [] buff; + auto submat_S = submatrix_as_blackbox(S, rows_submat_S, cols_submat_S, ro_s, co_s); + lsksp3(layout, opS, opA, d, n, m, alpha, submat_S, 0, 0, A, ro_a, co_a, beta, B, ldb); return; } @@ -324,12 +320,8 @@ void rsksp3( ) { auto [rows_submat_S, cols_submat_S] = dims_before_op(n, d, opS); if (!S.buff) { - T *buff = new T[rows_submat_S * cols_submat_S]; - fill_dense(S.dist, rows_submat_S, cols_submat_S, ro_s, co_s, buff, S.seed_state); - DenseDist D{rows_submat_S, cols_submat_S, DenseDistName::BlackBox, S.dist.major_axis}; - DenseSkOp S_(D, S.seed_state, buff); - rsksp3(layout, opA, opS, m, d, n, alpha, A, ro_a, co_a, S_, 0, 0, beta, B, ldb); - delete [] buff; + auto submat_S = submatrix_as_blackbox(S, rows_submat_S, cols_submat_S, ro_s, co_s); + rsksp3(layout, opA, opS, m, d, n, alpha, A, ro_a, co_a, submat_S, 0, 0, beta, B, ldb); return; } auto [rows_submat_A, cols_submat_A] = dims_before_op(m, n, opA); diff --git a/RandBLAS/sparse_skops.hh b/RandBLAS/sparse_skops.hh index 47e27919..90b332cf 100644 --- a/RandBLAS/sparse_skops.hh +++ b/RandBLAS/sparse_skops.hh @@ -363,11 +363,6 @@ struct SparseSkOp { /// /// @param[in] S /// SparseSkOp object. -/// -/// @return -/// An RNGState object. This is the state that should be used the next -/// time the program needs to generate random numbers for a randomized -/// algorithm. /// template void fill_sparse( diff --git a/rtd/source/index.rst b/rtd/source/index.rst index e11921f9..16a09d25 100644 --- a/rtd/source/index.rst +++ b/rtd/source/index.rst @@ -13,15 +13,17 @@ RandBLAS: sketching for randomized numerical linear algebra =========================================================== -RandBLAS is a C++ library for dimension reduction via random linear transformations. -These random linear transformations are called *sketching operators*. -The act of applying a sketching operator -- that is, the act of *sketching* -- is of fundamental importance to randomized numerical linear algebra. +RandBLAS is a C++ library for randomized linear dimension reduction -- an operation commonly known as *sketching*. +We built RandBLAS to make it easier to write, debug, and deploy high-performance implementations of sketching-based algorithms. 
RandBLAS is efficient, flexible, and reliable. It uses CPU-based OpenMP acceleration to apply its sketching operators to dense or sparse data matrices stored in main memory. + All sketches produced by RandBLAS are dense. -As such, dense data matrices can be sketched with dense or sparse operators, while sparse data matrices can only be sketched with dense operators. -RandBLAS can be used in distributed environments through its ability to (reproducibly) compute products with *submatrices* of sketching operators. +If you have RandBLAS and an LAPACK-like library at your disposal, you can implement +a huge range of shared-memory randomized algorithms for matrix computations. +RandBLAS can be used in distributed environments through its ability to compute products with *submatrices* of sketching operators, +without ever realizing the entire sketching operator in memory. Learn more by reading our `tutorial `_ or our `API reference `_. If we've piqued your interest, try RandBLAS yourself! diff --git a/rtd/source/tutorial/sampling_skops.rst b/rtd/source/tutorial/sampling_skops.rst index aa0f814e..d101510a 100644 --- a/rtd/source/tutorial/sampling_skops.rst +++ b/rtd/source/tutorial/sampling_skops.rst @@ -71,9 +71,9 @@ Formal API docs for the recommended constructors can be found :ref:`here 1` ============================================================================== -Suppose you have an application that requires two independent dense sketching operators, -:math:`\texttt{S1}` and :math:`\texttt{S2}`, each of size :math:`10000 \times 50`. -How should you get your hands on these objects? +Suppose you have an application that requires two statistically independent dense +sketching operators, :math:`\texttt{S1}` and :math:`\texttt{S2}`, each of size +:math:`10000 \times 50`. How should you get your hands on these objects? .. warning:: If you try to construct those sketching operators as follows ... @@ -88,33 +88,19 @@ How should you get your hands on these objects? *then your results would be invalid! Far from being independent,* :math:`\texttt{S1}` *and* :math:`\texttt{S2}` *would be equal from a mathematical perspective.* -One correct approach is to first -fill the entries of :math:`\texttt{S1}`, and then call -the constructor for :math:`\texttt{S2}` using :math:`\texttt{S1.next_state}` -as its RNGState argument: +One correct approach is to then call the constructor for :math:`\texttt{S2}` +using :math:`\texttt{S1.next_state}` as its RNGState argument: .. code:: c++ RandBLAS::RNGState my_state(); RandBLAS::DenseDist my_dist(10000, 50); RandBLAS::DenseSkOp S1(my_dist, my_state); - // ^ Defines S1 from a mathematical perspective, but performs no work. - // RandBLAS hasn't made any calls to a CBRNG. - RandBLAS::fill_dense(S1); - // ^ RandBLAS uses a CBRNG to sample each entry of S1, - // then sets S1.next_state. + // ^ Defines S1 from a mathematical perspective. Computes S1.next_state, + // but otherwise performs no work. RandBLAS::DenseSkOp S2(my_dist, S1.next_state); -The shortcoming of this approach is that it requires explicitly filling -:math:`\texttt{S1}` just to define :math:`\texttt{S2}` in a mathematical sense. -There are two alternative approaches you can use if this shortcoming is -significant in your application. - -*Alternative 1*. Define one larger sketching operator -(here, a :math:`10000\times 100` sketching operator) and operate with -appropriate submatrices of that larger operator when needed. - -*Alternative 2*. 
Declare two RNGState objects from the beginning using +Another valid approach is to declare two RNGState objects from the beginning using different keys, as in the following code: .. code:: c++ diff --git a/test/test_datastructures/test_denseskop.cc b/test/test_datastructures/test_denseskop.cc index c7ae4544..1d84619c 100644 --- a/test/test_datastructures/test_denseskop.cc +++ b/test/test_datastructures/test_denseskop.cc @@ -116,7 +116,7 @@ class TestDenseMoments : public ::testing::Test { // Construct the sketching operator RandBLAS::DenseDist D(n_rows, n_cols, dn); auto state = RandBLAS::RNGState(key); - auto [layout, next_state] = RandBLAS::fill_dense(D, A.data(), state); + auto next_state = RandBLAS::fill_dense(D, A.data(), state); // Compute the entrywise empirical mean and standard deviation. T mean = std::accumulate(A.data(), A.data() + size, 0.0) /size; @@ -433,7 +433,7 @@ class TestStateUpdate : public ::testing::Test RandBLAS::DenseDist D(n_rows, n_cols, dn); auto state = RandBLAS::RNGState(key); - auto [layout, next_state] = RandBLAS::fill_dense(D, A.data(), state); + auto next_state = RandBLAS::fill_dense(D, A.data(), state); RandBLAS::fill_dense(D, B.data(), next_state); ASSERT_TRUE(!(A == B)); @@ -476,7 +476,7 @@ class TestStateUpdate : public ::testing::Test auto state1 = RandBLAS::RNGState(key); // Concatenates two matrices generated from state and next_state - auto [layout, next_state] = RandBLAS::fill_dense(D1, A.data(), state); + auto next_state = RandBLAS::fill_dense(D1, A.data(), state); RandBLAS::fill_dense(D3, A.data() + (int64_t) (D1.n_rows * D1.n_cols), next_state); RandBLAS::fill_dense(D2, B.data(), state1); @@ -504,7 +504,7 @@ class TestStateUpdate : public ::testing::Test typename RNG::ctr_type c_ref = state_copy.counter; - auto [layout, final_state] = RandBLAS::fill_dense(D, buff, state); + auto final_state = RandBLAS::fill_dense(D, buff, state); auto c = final_state.counter; int c_len = c.size(); diff --git a/test/test_datastructures/test_spmats/common.hh b/test/test_datastructures/test_spmats/common.hh index 8a7ffa37..c262822e 100644 --- a/test/test_datastructures/test_spmats/common.hh +++ b/test/test_datastructures/test_spmats/common.hh @@ -57,7 +57,7 @@ void iid_sparsify_random_dense( ) { auto spar = new T[n_rows * n_cols]; auto dist = RandBLAS::DenseDist(n_rows, n_cols, RandBLAS::DenseDistName::Uniform); - auto [unused, next_state] = RandBLAS::fill_dense(dist, spar, state); + auto next_state = RandBLAS::fill_dense(dist, spar, state); auto temp = new T[n_rows * n_cols]; auto D_mat = RandBLAS::DenseDist(n_rows, n_cols, RandBLAS::DenseDistName::Uniform); diff --git a/test/test_matmul_cores/linop_common.hh b/test/test_matmul_cores/linop_common.hh index e07c1888..82def271 100644 --- a/test/test_matmul_cores/linop_common.hh +++ b/test/test_matmul_cores/linop_common.hh @@ -72,8 +72,8 @@ template auto random_matrix(int64_t m, int64_t n, RNGState s) { std::vector A(m * n); DenseDist DA(m, n); - auto [layout, next_state] = RandBLAS::fill_dense(DA, A.data(), s); - std::tuple, Layout, RNGState> t{A, layout, next_state}; + auto next_state = RandBLAS::fill_dense(DA, A.data(), s); + std::tuple, Layout, RNGState> t{A, RandBLAS::dist_to_layout(DA), next_state}; return t; } diff --git a/test/test_matmul_wrappers/test_sketch_symmetric.cc b/test/test_matmul_wrappers/test_sketch_symmetric.cc index 0b48fc01..04298ae8 100644 --- a/test/test_matmul_wrappers/test_sketch_symmetric.cc +++ b/test/test_matmul_wrappers/test_sketch_symmetric.cc @@ -53,7 +53,7 @@ void 
random_symmetric_mat(int64_t n, T* A, int64_t lda, STATE s) { // This function can be interpreted as first generating a random lda-by-lda symmetric matrix // whose entries in the upper triangle are iid, then symmetrizing that matrix, then // zeroing out all entries outside the leading principal submatrix of order n. - RandBLAS::fill_dense({lda, lda}, n, n, 0, 0, A, s); + RandBLAS::fill_dense(Layout::ColMajor, {lda, lda}, n, n, 0, 0, A, s); RandBLAS::util::symmetrize(Layout::ColMajor, Uplo::Upper, A, n, lda); return; } From 60613d39c6779b01d9fea271c808c6ac682904fb Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 2 Aug 2024 09:40:14 -0400 Subject: [PATCH 11/37] remove another unnecessary DenseSkOp constructor --- RandBLAS/dense_skops.hh | 35 ++++++++++++--------------- test/test_matmul_cores/test_lskge3.cc | 6 ++--- test/test_matmul_cores/test_rskge3.cc | 6 ++--- 3 files changed, 21 insertions(+), 26 deletions(-) diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index 82528036..7524c67f 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -368,25 +368,6 @@ struct DenseSkOp { // ///////////////////////////////////////////////////////////////////// - DenseSkOp( - DenseDist dist, - RNGState const &state, - T *buff - ) : // variable definitions - n_rows(dist.n_rows), - n_cols(dist.n_cols), - dist(dist), - seed_state(state), - next_state(dense::compute_next_state(dist, state)), - buff(buff), - layout(dist_to_layout(dist)) - { // sanity checks - randblas_require(this->dist.n_rows > 0); - randblas_require(this->dist.n_cols > 0); - if (dist.family == DenseDistName::BlackBox) - randblas_require(this->buff != nullptr); - } - DenseSkOp( int64_t n_rows, int64_t n_cols, @@ -417,7 +398,21 @@ struct DenseSkOp { DenseSkOp( DenseDist dist, RNGState const &state - ) : DenseSkOp(dist, state, nullptr) {}; + ) : // variable definitions + n_rows(dist.n_rows), + n_cols(dist.n_cols), + dist(dist), + seed_state(state), + next_state(dense::compute_next_state(dist, state)), + buff(nullptr), + layout(dist_to_layout(dist) + ) { + // sanity checks + randblas_require(this->dist.n_rows > 0); + randblas_require(this->dist.n_cols > 0); + if (dist.family == DenseDistName::BlackBox) + randblas_require(this->buff != nullptr); + }; // Destructor ~DenseSkOp() { diff --git a/test/test_matmul_cores/test_lskge3.cc b/test/test_matmul_cores/test_lskge3.cc index 921dad05..8384ad21 100644 --- a/test/test_matmul_cores/test_lskge3.cc +++ b/test/test_matmul_cores/test_lskge3.cc @@ -51,7 +51,7 @@ class TestLSKGE3 : public ::testing::Test blas::Layout layout ) { DenseDist D(d, m); - DenseSkOp S0(D, seed, nullptr); + DenseSkOp S0(D, seed); if (preallocate) RandBLAS::fill_dense(S0); test_left_apply_submatrix_to_eye(1.0, S0, d, m, 0, 0, layout, 0.0); @@ -65,7 +65,7 @@ class TestLSKGE3 : public ::testing::Test blas::Layout layout ) { DenseDist Dt(m, d); - DenseSkOp S0(Dt, seed, nullptr); + DenseSkOp S0(Dt, seed); RandBLAS::fill_dense(S0); test_left_apply_transpose_to_eye(S0, layout); } @@ -103,7 +103,7 @@ class TestLSKGE3 : public ::testing::Test randblas_require(m0 > m); randblas_require(n0 > n); DenseDist D(d, m); - DenseSkOp S0(D, seed_S0, nullptr); + DenseSkOp S0(D, seed_S0); test_left_apply_to_submatrix(S0, n, m0, n0, A_ro, A_co, layout); } diff --git a/test/test_matmul_cores/test_rskge3.cc b/test/test_matmul_cores/test_rskge3.cc index 2bd8f949..118a0276 100644 --- a/test/test_matmul_cores/test_rskge3.cc +++ b/test/test_matmul_cores/test_rskge3.cc @@ -52,7 +52,7 @@ class TestRSKGE3 : public 
::testing::Test Layout layout ) { DenseDist D(m, d); - DenseSkOp S0(D, seed, nullptr); + DenseSkOp S0(D, seed); if (preallocate) RandBLAS::fill_dense(S0); test_right_apply_submatrix_to_eye(1.0, S0, m, d, 0, 0, layout, 0.0, 0); @@ -66,7 +66,7 @@ class TestRSKGE3 : public ::testing::Test Layout layout ) { DenseDist Dt(d, m); - DenseSkOp S0(Dt, seed, nullptr); + DenseSkOp S0(Dt, seed); test_right_apply_tranpose_to_eye(S0, layout); } @@ -99,7 +99,7 @@ class TestRSKGE3 : public ::testing::Test Layout layout ) { DenseDist D(n, d); - DenseSkOp S0(D, seed_S0, nullptr); + DenseSkOp S0(D, seed_S0); test_right_apply_to_submatrix(S0, m, m0, n0, A_ro, A_co, layout); } From bf6df16ac263273ebd42273acf2cc0dd4f789977 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 2 Aug 2024 09:53:09 -0400 Subject: [PATCH 12/37] fix example to account for change in RandBLAS::fill_dense API --- examples/sparse-low-rank-approx/svd_rank1_plus_noise.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sparse-low-rank-approx/svd_rank1_plus_noise.cc b/examples/sparse-low-rank-approx/svd_rank1_plus_noise.cc index b1b557ac..bfd04e6f 100644 --- a/examples/sparse-low-rank-approx/svd_rank1_plus_noise.cc +++ b/examples/sparse-low-rank-approx/svd_rank1_plus_noise.cc @@ -74,7 +74,7 @@ void iid_sparsify_random_dense( ) { auto spar = new T[n_rows * n_cols]; auto dist = RandBLAS::DenseDist(n_rows, n_cols, RandBLAS::DenseDistName::Uniform); - auto [unused, next_state] = RandBLAS::fill_dense(dist, spar, state); + auto next_state = RandBLAS::fill_dense(dist, spar, state); auto temp = new T[n_rows * n_cols]; auto D_mat = RandBLAS::DenseDist(n_rows, n_cols, RandBLAS::DenseDistName::Uniform); From e2954db40351b8945bfead8e97170d306e3702bc Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 2 Aug 2024 09:54:32 -0400 Subject: [PATCH 13/37] change default compiler optimization level in examples --- examples/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a6d36735..686a63ea 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -6,8 +6,8 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/CMake") set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED True) -set(CMAKE_CXX_FLAGS "-O3") -# ^ THAT'S SO IMPORTANT!!!! +set(CMAKE_CXX_FLAGS "-O1") +# ^ Doing something other than -O0 is important for good performance. # TODO: just set CMAKE_BUILD_TYPE to Release if it's not already given. # TODO: do a try-catch pattern for finding OpenMP. From 98017619058ed4529dc4f0de67bece6d458676bd Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 2 Aug 2024 10:51:10 -0400 Subject: [PATCH 14/37] docstring changes --- RandBLAS/dense_skops.hh | 26 ++++++++++++++------ rtd/source/api_reference/skops_and_dists.rst | 2 +- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index 7524c67f..26028570 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -432,20 +432,32 @@ struct DenseSkOp { /// .. |ncols| mathmacro:: \mathtt{n\_cols} /// .. |ioff| mathmacro:: \mathtt{i\_off} /// .. |joff| mathmacro:: \mathtt{j\_off} +/// .. |layout| mathmacro:: \mathtt{layout} /// /// @endverbatim /// Fill \math{\buff} so that (1) \math{\mat(\buff)} is a submatrix of /// an _implicit_ random sample from \math{\D}, and (2) \math{\mat(\buff)} -/// is determined by reading from \math{\buff} in "layout" order. 
+/// is determined by reading from \math{\buff} in \math{\layout} order. /// /// If we denote the implicit sample from \math{\D} by \math{S}, then we have /// @verbatim embed:rst:leading-slashes /// .. math:: /// \mat(\buff) = S[\ioff:(\ioff + \nrows),\, \joff:(\joff + \ncols)] /// @endverbatim -/// on exit. If the specified layout is different from the natural layout -/// associated with \math{\D} (via its MajorAxis and dimensions) then we will -/// allocate extra workspace needed for an out-of-place transposition. +/// on exit. +/// +/// This function is for generating low-level representations of matrices +/// that are equivalent to a submatrix of a RandBLAS DenseSkOp, but +/// without using the DenseSkOp abstraction. This can be useful if you want +/// to sketch a structured matrix that RandBLAS doesn't support (like a symmetric +/// matrix whose values are only stored in the upper or lower triangle). +/// +/// Note that since the entries of \math{\buff} are sampled iid from a common +/// distribution, the value of \math{\layout} is unlikely to have mathematical significance. +/// However, the value of \math{\layout} can affect this function's efficiency. +/// For best efficiency we recommend \math{\layout = \mathtt{dist\_to\_layout}(\D).} +/// If a different value of \math{\layout} is used, then this function will internally +/// allocate extra memory for an out-of-place change of storage order. /// /// @param[in] layout /// blas::Layout::RowMajor or blas::Layout::ColMajor @@ -476,7 +488,7 @@ struct DenseSkOp { /// template RNGState fill_dense( - blas::Layout required_layout, + blas::Layout layout, const DenseDist &D, int64_t n_rows, int64_t n_cols, @@ -519,11 +531,11 @@ RNGState fill_dense( } } int64_t size_mat = n_rows * n_cols; - if (required_layout != natural_layout) { + if (layout != natural_layout) { T* flip_work = new T[size_mat]; blas::copy(size_mat, buff, 1, flip_work, 1); auto [irs_nat, ics_nat] = layout_to_strides(natural_layout, n_rows, n_cols); - auto [irs_req, ics_req] = layout_to_strides(required_layout, n_rows, n_cols); + auto [irs_req, ics_req] = layout_to_strides(layout, n_rows, n_cols); util::omatcopy(n_rows, n_cols, flip_work, irs_nat, ics_nat, buff, irs_req, ics_req); delete [] flip_work; } diff --git a/rtd/source/api_reference/skops_and_dists.rst b/rtd/source/api_reference/skops_and_dists.rst index dd1ea2ea..72508826 100644 --- a/rtd/source/api_reference/skops_and_dists.rst +++ b/rtd/source/api_reference/skops_and_dists.rst @@ -52,6 +52,6 @@ SparseDist and SparseSkOp Advanced material ================= -.. doxygenfunction:: RandBLAS::fill_dense(const DenseDist &D, int64_t n_rows, int64_t n_cols, int64_t S_ro, int64_t S_co, T *buff, const RNGState &seed) +.. doxygenfunction:: RandBLAS::fill_dense(blas::Layout layout, const DenseDist &D, int64_t n_rows, int64_t n_cols, int64_t S_ro, int64_t S_co, T *buff, const RNGState &seed) :project: RandBLAS From e8e57512955abebc060e7203c1de6d3fdbdaca85 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 2 Aug 2024 14:37:04 -0400 Subject: [PATCH 15/37] refactor tests for populating DenseSkOp buffers. Add a new page to web documentation. 
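An illustrative use of the ``fill_dense`` overload whose docstring is revised in the patch above; the header include, sizes, seed, and offsets are placeholders and are not part of any diff, while the calls themselves (``DenseDist``, ``RNGState``, ``dist_to_layout``, and the eight-argument ``fill_dense``) follow the signatures documented in this patch series:

    #include <RandBLAS.hh>   // assumed umbrella header
    #include <vector>

    // Materialize rows 200..299 of an implicit 1000-by-50 Gaussian operator into a user buffer.
    void fill_submatrix_example() {
        RandBLAS::DenseDist D(1000, 50, RandBLAS::DenseDistName::Gaussian);
        RandBLAS::RNGState seed(42);
        std::vector<double> buff(100 * 50);
        auto layout = RandBLAS::dist_to_layout(D);  // the recommended layout, per the docstring
        auto next_state = RandBLAS::fill_dense(layout, D, 100, 50, 200, 0, buff.data(), seed);
        // buff now holds S[200:300, 0:50], read in "layout" order; next_state can seed later sampling.
        (void) next_state;
    }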
--- RandBLAS/sparse_skops.hh | 2 +- RandBLAS/util.hh | 5 +- rtd/source/api_reference/index.rst | 1 + .../api_reference/index_sampling_utils.rst | 13 ++ test/test_basic_rng/test_discrete.cc | 4 +- test/test_datastructures/test_denseskop.cc | 171 +++++------------- 6 files changed, 70 insertions(+), 126 deletions(-) create mode 100644 rtd/source/api_reference/index_sampling_utils.rst diff --git a/RandBLAS/sparse_skops.hh b/RandBLAS/sparse_skops.hh index 90b332cf..8e1d3c3d 100644 --- a/RandBLAS/sparse_skops.hh +++ b/RandBLAS/sparse_skops.hh @@ -102,7 +102,7 @@ static RNGState repeated_fisher_yates( } template -static RNGState compute_next_state(SD dist, RNGState state) { +RNGState compute_next_state(SD dist, RNGState state) { int64_t minor_len; if (dist.major_axis == MajorAxis::Short) { minor_len = std::min(dist.n_rows, dist.n_cols); diff --git a/RandBLAS/util.hh b/RandBLAS/util.hh index eaaf8d74..65de44af 100644 --- a/RandBLAS/util.hh +++ b/RandBLAS/util.hh @@ -29,6 +29,7 @@ #pragma once +#include #include #include #include @@ -260,7 +261,7 @@ static inline TO uneg11_to_uneg01(TI in) { */ template RNGState sample_indices_iid( - int64_t n, TF* cdf, int64_t k, int64_t* samples, RandBLAS::RNGState state + int64_t n, TF* cdf, int64_t k, int64_t* samples, RNGState state ) { auto [ctr, key] = state; RNG gen; @@ -287,7 +288,7 @@ RNGState sample_indices_iid( */ template RNGState sample_indices_iid_uniform( - int64_t n, int64_t* samples , int64_t k, RandBLAS::RNGState state + int64_t n, int64_t k, int64_t* samples, RNGState state ) { auto [ctr, key] = state; RNG gen; diff --git a/rtd/source/api_reference/index.rst b/rtd/source/api_reference/index.rst index e25e20a8..f0a3da6d 100644 --- a/rtd/source/api_reference/index.rst +++ b/rtd/source/api_reference/index.rst @@ -16,4 +16,5 @@ API Reference Representing sparse data Computing a sketch: sparse data Sparse BLAS operations + Utilities for index sampling diff --git a/rtd/source/api_reference/index_sampling_utils.rst b/rtd/source/api_reference/index_sampling_utils.rst new file mode 100644 index 00000000..d690ac86 --- /dev/null +++ b/rtd/source/api_reference/index_sampling_utils.rst @@ -0,0 +1,13 @@ + +############################################################ +Utilities for coordinate and index-set sampling +############################################################ + + .. doxygenfunction:: RandBLAS::util::sample_indices_iid_uniform(int64_t n, int64_t k, int64_t* samples, RNGState state) + :project: RandBLAS + + .. doxygenfunction:: RandBLAS::util::sample_indices_iid(int64_t n, TF* cdf, int64_t k, int64_t* samples, RNGState state) + :project: RandBLAS + + .. 
doxygenfunction:: RandBLAS::util::weights_to_cdf(int64_t n, T* w, T error_if_below = -std::numeric_limits::epsilon()) + :project: RandBLAS diff --git a/test/test_basic_rng/test_discrete.cc b/test/test_basic_rng/test_discrete.cc index 5cf041f1..199ef25e 100644 --- a/test/test_basic_rng/test_discrete.cc +++ b/test/test_basic_rng/test_discrete.cc @@ -53,7 +53,7 @@ class TestSampleIndices : public ::testing::Test static void test_iid_uniform_smoke(int64_t N, int64_t k, uint32_t seed) { RNGState state(seed); std::vector samples(k, -1); - RandBLAS::util::sample_indices_iid_uniform(N, samples.data(), k, state); + RandBLAS::util::sample_indices_iid_uniform(N, k, samples.data(), state); int64_t* data = samples.data(); for (int64_t i = 0; i < k; ++i) { ASSERT_LT(data[i], N); @@ -88,7 +88,7 @@ class TestSampleIndices : public ::testing::Test RNGState state(seed); std::vector samples(num_samples, -1); - RandBLAS::util::sample_indices_iid_uniform(N, samples.data(), num_samples, state); + RandBLAS::util::sample_indices_iid_uniform(N, num_samples, samples.data(), state); index_set_kolmogorov_smirnov_tester(samples, true_cdf, critical_value); return; diff --git a/test/test_datastructures/test_denseskop.cc b/test/test_datastructures/test_denseskop.cc index 1d84619c..01ac3355 100644 --- a/test/test_datastructures/test_denseskop.cc +++ b/test/test_datastructures/test_denseskop.cc @@ -182,7 +182,6 @@ class TestSubmatGeneration : public ::testing::Test T* smat = new T[n_srows * n_scols]; fill_dense_rmat_trunc(mat, n_rows, n_cols, seed); int ind = 0; // used for indexing smat when comparing to rmat - T total_error = 0; for (int nptr = ptr; nptr < n_cols*(n_rows-n_srows-1); nptr += stride*n_cols) { // ^ Loop through various pointer locations.- goes down the random matrix by amount stride. RandBLAS::dense::fill_dense_submat_impl(n_cols, smat, n_srows, n_scols, nptr, seed); @@ -190,14 +189,13 @@ class TestSubmatGeneration : public ::testing::Test for (int i = 0; i @@ -214,7 +212,6 @@ class TestSubmatGeneration : public ::testing::Test T* smat = new T[n_srows * n_scols]; fill_dense_rmat_trunc(mat, n_rows, n_cols, seed); int ind = 0; // variable used for indexing smat when comparing to rmat - T total_error = 0; for (int nptr = ptr; nptr < (n_cols - n_scols - 1); nptr += stride) { // ^ Loop through various pointer locations.- goes across the random matrix by amount stride. 
RandBLAS::dense::fill_dense_submat_impl(n_cols, smat, n_srows, n_scols, nptr, seed); @@ -222,14 +219,13 @@ class TestSubmatGeneration : public ::testing::Test for (int i = 0; i @@ -242,7 +238,6 @@ class TestSubmatGeneration : public ::testing::Test T* smat = new T[n_rows * n_cols]{}; fill_dense_rmat_trunc(mat, n_rows, n_cols, seed); int ind = 0; - T total_error = 0; int64_t n_scols = 1; int64_t n_srows = 1; for (int ptr = 0; ptr + n_scols + n_cols*n_srows < n_cols*n_rows; ptr += n_rows+1) { // Loop through the diagonal of the matrix @@ -251,7 +246,7 @@ class TestSubmatGeneration : public ::testing::Test ind = 0; for (int i = 0; i(3, 5, RandBLAS::MajorAxis::Long); } -TEST_F(TestFillAxis, short_axis_3x5) { +TEST_F(TestFillAxis, autotranspose_short_axis_3x5) { auto_transpose(3, 5, RandBLAS::MajorAxis::Short); } -TEST_F(TestFillAxis, long_axis_4x8) { +TEST_F(TestFillAxis, autotranspose_long_axis_4x8) { auto_transpose(4, 8, RandBLAS::MajorAxis::Long); } -TEST_F(TestFillAxis, short_axis_4x8) { +TEST_F(TestFillAxis, autotranspose_short_axis_4x8) { auto_transpose(4, 8, RandBLAS::MajorAxis::Short); } -TEST_F(TestFillAxis, long_axis_2x4) { +TEST_F(TestFillAxis, autotranspose_long_axis_2x4) { auto_transpose(2, 4, RandBLAS::MajorAxis::Long); } -TEST_F(TestFillAxis, short_axis_2x4) { +TEST_F(TestFillAxis, autotranspose_short_axis_2x4) { auto_transpose(2, 4, RandBLAS::MajorAxis::Short); } - -class TestStateUpdate : public ::testing::Test +class TestDenseSkOpStates : public ::testing::Test { protected: - virtual void SetUp(){}; - - virtual void TearDown(){}; - template - static void test_var_mat_gen( + static void test_concatenate_along_columns( uint32_t key, int64_t n_rows, int64_t n_cols, RandBLAS::DenseDistName dn ) { - // Allocate workspace + randblas_require(n_rows > n_cols); + RandBLAS::DenseDist D1( n_rows, n_cols/2, dn, RandBLAS::MajorAxis::Long); + RandBLAS::DenseDist D2( n_rows, n_cols - n_cols/2, dn, RandBLAS::MajorAxis::Long); + RandBLAS::DenseDist Dfull( n_rows, n_cols, dn, RandBLAS::MajorAxis::Long); + RandBLAS::RNGState state(key); int64_t size = n_rows * n_cols; - std::vector A(size, 0.0); - std::vector B(size, 0.0); - - // Construct the sketching operator - RandBLAS::DenseDist D(n_rows, n_cols, dn); - - auto state = RandBLAS::RNGState(key); - auto next_state = RandBLAS::fill_dense(D, A.data(), state); - RandBLAS::fill_dense(D, B.data(), next_state); - - ASSERT_TRUE(!(A == B)); - } - - template - static void test_identity( - uint32_t key, - int64_t n_rows, - int64_t n_cols, - RandBLAS::DenseDistName dn - ) { - // Allocate workspace - int64_t size = n_rows * n_cols; - std::vector A(size, 0.0); - std::vector B(size, 0.0); - - double total = 0.0; - - // Construct the sketching operator - RandBLAS::DenseDist D1( - n_rows, - n_cols / 2, - dn - ); - - RandBLAS::DenseDist D3( - n_rows, - n_cols - n_cols / 2, - dn - ); - - RandBLAS::DenseDist D2( - n_rows, - n_cols, - dn - ); - - auto state = RandBLAS::RNGState(key); - auto state1 = RandBLAS::RNGState(key); // Concatenates two matrices generated from state and next_state - auto next_state = RandBLAS::fill_dense(D1, A.data(), state); - RandBLAS::fill_dense(D3, A.data() + (int64_t) (D1.n_rows * D1.n_cols), next_state); - - RandBLAS::fill_dense(D2, B.data(), state1); + std::vector A(size, 0.0); + RandBLAS::DenseSkOp S1(D1, state); + RandBLAS::DenseSkOp S2(D2, S1.next_state); + RandBLAS::fill_dense(S1); + RandBLAS::fill_dense(S2); + int64_t size_d1 = n_rows * D1.n_cols; + blas::copy(size_d1, S1.buff, 1, A.data(), 1); + int64_t size_d2 = n_rows * 
D2.n_cols; + blas::copy(size_d2, S2.buff, 1, A.data() + size_d1, 1); + + RandBLAS::DenseSkOp S_concat(Dfull, state); + RandBLAS::fill_dense(S_concat); for (int i = 0; i < size; i++) { - total += abs(A[i] - B[i]); + ASSERT_EQ(A[i], S_concat.buff[i]); } - - ASSERT_TRUE(total == 0.0); } template - static void test_finalstate( + static void test_compute_next_state( uint32_t key, int64_t n_rows, int64_t n_cols, RandBLAS::DenseDistName dn ) { - int total = 0; - int *buff = new int[n_rows*n_cols]; - auto state = RandBLAS::RNGState(key); - auto state_copy = RandBLAS::RNGState(key); + float *buff = new float[n_rows*n_cols]; + RandBLAS::RNGState state(key); RandBLAS::DenseDist D(n_rows, n_cols, dn); - typename RNG::ctr_type c_ref = state_copy.counter; - - auto final_state = RandBLAS::fill_dense(D, buff, state); - auto c = final_state.counter; - int c_len = c.size(); + auto actual_final_state = RandBLAS::fill_dense(D, buff, state); + auto actual_c = actual_final_state.counter; - int64_t pad = 0; //Pad computed such that n_cols+pad is divisible by RNG::static_size - if (n_rows % RNG::ctr_type::static_size != 0) { - pad = RNG::ctr_type::static_size - n_rows % RNG::ctr_type::static_size; - } - int64_t last_ptr = n_rows-1 + (n_cols-1)*n_rows; - int64_t last_ptr_padded = last_ptr + last_ptr/n_rows * pad; - c_ref.incr(last_ptr_padded / RNG::ctr_type::static_size + 1); + auto expect_final_state = RandBLAS::dense::compute_next_state(D, state); + auto expect_c = expect_final_state.counter; - for (int i = 0; i < c_len; i++) { - total += c[i] - c_ref[i]; + for (int i = 0; i < RNG::ctr_type::static_size; i++) { + ASSERT_EQ(actual_c[i], expect_c[i]); } - ASSERT_TRUE(total == 0); delete [] buff; } }; -// For small matrix sizes, mean and stddev are not very close to desired vals. 
-TEST_F(TestStateUpdate, Gaussian_var_gen) -{ - for (uint32_t key : {0, 1, 2}) { - auto dn = RandBLAS::DenseDistName::Gaussian; - test_var_mat_gen(key, 100, 50, dn); - } -} - -TEST_F(TestStateUpdate, Gaussian_identity) -{ +TEST_F(TestDenseSkOpStates, concat_tall_with_long_major_axis) { for (uint32_t key : {0, 1, 2}) { auto dn = RandBLAS::DenseDistName::Gaussian; - test_identity(key, 13, 7, dn); - test_identity(key, 80, 40, dn); - test_identity(key, 83, 41, dn); - test_identity(key, 91, 43, dn); - test_identity(key, 97, 47, dn); + test_concatenate_along_columns(key, 13, 7, dn); + test_concatenate_along_columns(key, 80, 40, dn); + test_concatenate_along_columns(key, 83, 41, dn); + test_concatenate_along_columns(key, 91, 43, dn); + test_concatenate_along_columns(key, 97, 47, dn); } } -TEST_F(TestStateUpdate, Final_State) -{ +TEST_F(TestDenseSkOpStates, compare_skopless_fill_dense_to_compute_next_state) { for (uint32_t key : {0, 1, 2}) { auto dn = RandBLAS::DenseDistName::Gaussian; - test_finalstate(key, 13, 7, dn); - test_finalstate(key, 11, 5, dn); - test_finalstate(key, 131, 71, dn); - test_finalstate(key, 80, 40, dn); - test_finalstate(key, 91, 43, dn); + test_compute_next_state(key, 13, 7, dn); + test_compute_next_state(key, 11, 5, dn); + test_compute_next_state(key, 131, 71, dn); + test_compute_next_state(key, 80, 40, dn); + test_compute_next_state(key, 91, 43, dn); } } From 69cdf6f553379b9ea207a37d44f60bcca1a1385a Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 2 Aug 2024 14:39:09 -0400 Subject: [PATCH 16/37] copyright statement in web docs --- rtd/source/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rtd/source/conf.py b/rtd/source/conf.py index 32b467c3..4b35eeb1 100644 --- a/rtd/source/conf.py +++ b/rtd/source/conf.py @@ -21,8 +21,8 @@ # -- Project information ----------------------------------------------------- project = 'RandBLAS' -copyright = "2023, Riley J. Murray, Burlen Loring" -author = "Riley J. Murray, Burlen Loring" +copyright = "2024, Riley Murray, Burlen Loring" +author = "Riley Murray, Burlen Loring" # -- General configuration --------------------------------------------------- From 8d992a498e22f2e6a17f8382f146377ca5cc6517 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Sat, 3 Aug 2024 17:05:17 -0400 Subject: [PATCH 17/37] add Kolmogorov-Smirnov tests for uniform distribution over [-1, 1]. Fix small but substantial bug in sample_size_rep. Fix bounds error in KS test for sampling indices. 
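For reference, the one-sample Kolmogorov-Smirnov statistic exercised by the tests in this patch can be computed directly as in the following standalone sketch. It is not part of the diff; the function name and includes are illustrative, and it only documents the formula that the test code below evaluates interval by interval.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // One-sample Kolmogorov-Smirnov statistic of "samples" against a continuous CDF F.
    template <typename T, typename CDF>
    T ks_statistic(std::vector<T> samples, CDF F) {
        std::sort(samples.begin(), samples.end());
        int64_t N = (int64_t) samples.size();
        T ts = 0;
        for (int64_t j = 0; j < N; ++j) {
            T Fj = F(samples[j]);
            // The empirical CDF jumps from j/N to (j+1)/N at samples[j]; the supremum of
            // |F_empirical - F| over the neighboring intervals is attained at one of these values.
            ts = std::max(ts, std::abs(Fj - ((T) j) / ((T) N)));
            ts = std::max(ts, std::abs(Fj - ((T) (j + 1)) / ((T) N)));
        }
        return ts;  // the tests assert that this quantity stays below the tabulated critical value
    }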
--- RandBLAS/dense_skops.hh | 2 +- test/test_basic_rng/rng_common.hh | 19 +++- test/test_basic_rng/test_continuous.cc | 115 +++++++++++++++++++------ test/test_basic_rng/test_discrete.cc | 9 +- 4 files changed, 112 insertions(+), 33 deletions(-) diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index 26028570..4bec8153 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -579,7 +579,7 @@ RNGState fill_dense( T *buff, const RNGState &seed ) { - return fill_dense(dist_to_layout(D), D, D.n_rows, D.n_cols, 0, 0, buff, seed); + return fill_dense(dist_to_layout(D), D, D.n_rows, D.n_cols, 0, 0, buff, seed); } // ============================================================================= diff --git a/test/test_basic_rng/rng_common.hh b/test/test_basic_rng/rng_common.hh index e344d93c..ea1974f1 100644 --- a/test/test_basic_rng/rng_common.hh +++ b/test/test_basic_rng/rng_common.hh @@ -140,7 +140,7 @@ inline int sample_size_rep(int n) { randblas_require(SMALLEST_SAMPLE <= n && n <= LARGEST_SAMPLE); int num_sample_sizes = (int) sample_sizes.size(); for (int i = 0; i < num_sample_sizes; ++i) { - if (sample_sizes[i] <= n) + if (sample_sizes[i] >= n) return i; } // This code shouldn't be reachable! @@ -223,4 +223,21 @@ inline double hypergeometric_variance(int64_t N, int64_t K, int64_t D) { return dD * t1 * t2 * t3; } +// MARK: continuous distributions + +template +inline T standard_normal_cdf(T x) { + double dx = (double) x; + return (T) std::erfc(-dx / std::sqrt(2)) / 2; +} + +template +inline T uniform_neg11_cdf(T x) { + if (x <= -1) + return 0; + if (x >= 1) + return 1; + return (x + 1) / 2; +} + } // end namespace RandBLAS_StatTests diff --git a/test/test_basic_rng/test_continuous.cc b/test/test_basic_rng/test_continuous.cc index d1a32839..cf2ee41e 100644 --- a/test/test_basic_rng/test_continuous.cc +++ b/test/test_basic_rng/test_continuous.cc @@ -30,7 +30,9 @@ #include "RandBLAS/config.h" #include "RandBLAS/base.hh" #include "RandBLAS/util.hh" +#include "RandBLAS/dense_skops.hh" using RandBLAS::RNGState; +using RandBLAS::DenseDistName; #include "rng_common.hh" #include @@ -40,43 +42,102 @@ using RandBLAS::RNGState; #include #include #include +#include -class TestContinuous : public ::testing::Test -{ - protected: - - virtual void SetUp(){}; - virtual void TearDown(){}; +class TestContinuous : public ::testing::Test { + protected: + // This is really for distributions whose CDFs "F" are continuous and strictly increasing + // on the interval [a, b] where F(a) = 0 and F(b) = 1. + + template static void kolmogorov_smirnov_tester( - // std::vector &samples, std::vector &true_cdf, double critical_value - ) { - // TODO: WRITE ME - - // auto num_samples = (int) samples.size(); - // auto N = (int64_t) true_cdf.size(); - // std::vector sample_cdf(N, 0.0); - // for (int64_t s : samples) - // sample_cdf[s] += 1; - // RandBLAS::util::weights_to_cdf(N, sample_cdf.data()); + std::vector &samples, double critical_value, DenseDistName dn + ) { + auto F_true = [dn](T x) { + if (dn == DenseDistName::Gaussian) { + return RandBLAS_StatTests::standard_normal_cdf(x); + } else if (dn == DenseDistName::Uniform) { + return RandBLAS_StatTests::uniform_neg11_cdf(x); + } else { + std::string msg = "Unrecognized distributions name"; + throw std::runtime_error(msg); + } + }; + auto N = (int) samples.size(); + /** + * Let L(x) = |F_empirical(x) - F_true(x)|. The KS test testatistic is + * + * ts = sup_{all x} L(x). 
+ * + * Now set s = sorted(samples), and partition the real line into + * + * I_0 = (-infty, s[0 ]), ... + * I_1 = [s[0 ], s[1 ]), ... + * I_2 = [s[1 ], s[2 ]), ... + * I_{N-1} = [s[N-2], s[N-1]), ... + * I_N = [s[N-1], +infty). + * + * Then, provided F_true is continuous, we have + * + * sup{ L(x) : x in I_j } = max{ + * |F_true(inf(I_j)) - j/N|, |F_true(sup(I_j)) - j/N| + * } + * + * for j = 0, ..., N. + */ + samples.push_back( std::numeric_limits::infinity()); + samples.push_back(-std::numeric_limits::infinity()); + std::sort(samples.begin(), samples.end(), [](T a, T b) {return (a < b);}); + for (int64_t j = 0; j <= N; ++j) { + T temp1 = F_true(samples[j ]); + T temp2 = F_true(samples[j + 1]); + T empirical = ((T)j)/((T)N); + T val1 = std::abs(temp1 - empirical); + T val2 = std::abs(temp2 - empirical); + T supLx_on_Ij = std::max(val1, val2); + ASSERT_LE(supLx_on_Ij, critical_value) + << "\nj = " << j << " of N = " << N + << "\nF_true(inf(I_j)) = " << temp1 + << "\nF_true(sup(I_j)) = " << temp2 + << "\nI_j = [" << samples[j] << ", " << samples[j+1] << ")"; + } + return; + } - // for (int i = 0; i < num_samples; ++i) { - // auto diff = (double) std::abs(sample_cdf[i] - true_cdf[i]); - // ASSERT_LT(diff, critical_value); - // } + template + static void run(double significance, int64_t num_samples, DenseDistName dn, uint32_t seed) { + using RandBLAS_StatTests::KolmogorovSmirnovConstants::critical_value_rep_mutator; + auto critical_value = critical_value_rep_mutator(num_samples, significance); + RNGState state(seed); + std::vector samples(num_samples, -1); + RandBLAS::fill_dense({num_samples, 1, dn, RandBLAS::MajorAxis::Long}, samples.data(), state); + kolmogorov_smirnov_tester(samples, critical_value, dn); return; } +}; - template - static void run_uneg11() { +TEST_F(TestContinuous, uneg11_ks_generous) { + double s = 1e-6; + run(s, 100000, DenseDistName::Uniform, 0); + run(s, 10000, DenseDistName::Uniform, 0); + run(s, 1000, DenseDistName::Uniform, 0); +} - } - template - static void run_gaussian() { +TEST_F(TestContinuous, uneg11_ks_moderate) { + double s = 1e-4; + run(s, 100000, DenseDistName::Uniform, 0); + run(s, 10000, DenseDistName::Uniform, 0); + run(s, 1000, DenseDistName::Uniform, 0); // might fail? 
+} - } -}; +TEST_F(TestContinuous, uneg11_ks_skeptical) { + double s = 1e-2; + run(s, 100000, DenseDistName::Uniform, 0); + run(s, 10000, DenseDistName::Uniform, 0); + run(s, 1000, DenseDistName::Uniform, 0); +} diff --git a/test/test_basic_rng/test_discrete.cc b/test/test_basic_rng/test_discrete.cc index 199ef25e..b9d7ae70 100644 --- a/test/test_basic_rng/test_discrete.cc +++ b/test/test_basic_rng/test_discrete.cc @@ -65,15 +65,16 @@ class TestSampleIndices : public ::testing::Test static void index_set_kolmogorov_smirnov_tester( std::vector &samples, std::vector &true_cdf, double critical_value ) { - auto num_samples = (int) samples.size(); auto N = (int64_t) true_cdf.size(); std::vector sample_cdf(N, 0.0); for (int64_t s : samples) sample_cdf[s] += 1; RandBLAS::util::weights_to_cdf(N, sample_cdf.data()); - for (int i = 0; i < num_samples; ++i) { - auto diff = (double) std::abs(sample_cdf[i] - true_cdf[i]); + for (int i = 0; i < N; ++i) { + float F_empirical = sample_cdf[i]; + float F_true = true_cdf[i]; + auto diff = (double) std::abs(F_empirical - F_true); ASSERT_LT(diff, critical_value); } return; @@ -94,7 +95,7 @@ class TestSampleIndices : public ::testing::Test return; } - static void test_iid_kolmogorov_smirnov(int64_t N, int exponent, double significance, int64_t num_samples, uint32_t seed) { + static void test_iid_kolmogorov_smirnov(int64_t N, float exponent, double significance, int64_t num_samples, uint32_t seed) { using RandBLAS_StatTests::KolmogorovSmirnovConstants::critical_value_rep_mutator; auto critical_value = critical_value_rep_mutator(num_samples, significance); From 3820e3e6bcf135aa226a6dc6e8f0cf4cf13fa906 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Sat, 3 Aug 2024 18:53:05 -0400 Subject: [PATCH 18/37] KS tests for the Gaussian distribution --- test/test_basic_rng/test_continuous.cc | 35 +++++++++++++++++++++----- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/test/test_basic_rng/test_continuous.cc b/test/test_basic_rng/test_continuous.cc index cf2ee41e..80e0b7b2 100644 --- a/test/test_basic_rng/test_continuous.cc +++ b/test/test_basic_rng/test_continuous.cc @@ -121,23 +121,46 @@ class TestContinuous : public ::testing::Test { TEST_F(TestContinuous, uneg11_ks_generous) { double s = 1e-6; - run(s, 100000, DenseDistName::Uniform, 0); - run(s, 10000, DenseDistName::Uniform, 0); - run(s, 1000, DenseDistName::Uniform, 0); + for (uint32_t i = 999; i < 1011; ++i) { + run(s, 100000, DenseDistName::Uniform, i); + run(s, 10000, DenseDistName::Uniform, i*i); + run(s, 1000, DenseDistName::Uniform, i*i*i); + } } - TEST_F(TestContinuous, uneg11_ks_moderate) { double s = 1e-4; run(s, 100000, DenseDistName::Uniform, 0); run(s, 10000, DenseDistName::Uniform, 0); - run(s, 1000, DenseDistName::Uniform, 0); // might fail? 
+ run(s, 1000, DenseDistName::Uniform, 0); } - TEST_F(TestContinuous, uneg11_ks_skeptical) { double s = 1e-2; run(s, 100000, DenseDistName::Uniform, 0); run(s, 10000, DenseDistName::Uniform, 0); run(s, 1000, DenseDistName::Uniform, 0); } + +TEST_F(TestContinuous, guassian_ks_generous) { + double s = 1e-6; + for (uint32_t i = 99; i < 103; ++i) { + run(s, 100000, DenseDistName::Gaussian, i); + run(s, 10000, DenseDistName::Gaussian, i*i); + run(s, 1000, DenseDistName::Gaussian, i*i*i); + } +} + +TEST_F(TestContinuous, guassian_ks_moderate) { + double s = 1e-4; + run(s, 100000, DenseDistName::Gaussian, 0); + run(s, 10000, DenseDistName::Gaussian, 0); + run(s, 1000, DenseDistName::Gaussian, 0); +} + +TEST_F(TestContinuous, guassian_ks_skeptical) { + double s = 1e-2; + run(s, 100000, DenseDistName::Gaussian, 0); + run(s, 10000, DenseDistName::Gaussian, 0); + run(s, 1000, DenseDistName::Gaussian, 0); +} From d7f6c3f9bba2449c7ed7f7fdbddc951c0343f2e5 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Sat, 3 Aug 2024 18:58:22 -0400 Subject: [PATCH 19/37] rename test class; --- test/test_basic_rng/test_continuous.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/test_basic_rng/test_continuous.cc b/test/test_basic_rng/test_continuous.cc index 80e0b7b2..398f08c7 100644 --- a/test/test_basic_rng/test_continuous.cc +++ b/test/test_basic_rng/test_continuous.cc @@ -46,7 +46,7 @@ using RandBLAS::DenseDistName; -class TestContinuous : public ::testing::Test { +class TestScalarDistributions : public ::testing::Test { protected: // This is really for distributions whose CDFs "F" are continuous and strictly increasing @@ -119,7 +119,7 @@ class TestContinuous : public ::testing::Test { } }; -TEST_F(TestContinuous, uneg11_ks_generous) { +TEST_F(TestScalarDistributions, uneg11_ks_generous) { double s = 1e-6; for (uint32_t i = 999; i < 1011; ++i) { run(s, 100000, DenseDistName::Uniform, i); @@ -128,21 +128,21 @@ TEST_F(TestContinuous, uneg11_ks_generous) { } } -TEST_F(TestContinuous, uneg11_ks_moderate) { +TEST_F(TestScalarDistributions, uneg11_ks_moderate) { double s = 1e-4; run(s, 100000, DenseDistName::Uniform, 0); run(s, 10000, DenseDistName::Uniform, 0); run(s, 1000, DenseDistName::Uniform, 0); } -TEST_F(TestContinuous, uneg11_ks_skeptical) { +TEST_F(TestScalarDistributions, uneg11_ks_skeptical) { double s = 1e-2; run(s, 100000, DenseDistName::Uniform, 0); run(s, 10000, DenseDistName::Uniform, 0); run(s, 1000, DenseDistName::Uniform, 0); } -TEST_F(TestContinuous, guassian_ks_generous) { +TEST_F(TestScalarDistributions, guassian_ks_generous) { double s = 1e-6; for (uint32_t i = 99; i < 103; ++i) { run(s, 100000, DenseDistName::Gaussian, i); @@ -151,14 +151,14 @@ TEST_F(TestContinuous, guassian_ks_generous) { } } -TEST_F(TestContinuous, guassian_ks_moderate) { +TEST_F(TestScalarDistributions, guassian_ks_moderate) { double s = 1e-4; run(s, 100000, DenseDistName::Gaussian, 0); run(s, 10000, DenseDistName::Gaussian, 0); run(s, 1000, DenseDistName::Gaussian, 0); } -TEST_F(TestContinuous, guassian_ks_skeptical) { +TEST_F(TestScalarDistributions, guassian_ks_skeptical) { double s = 1e-2; run(s, 100000, DenseDistName::Gaussian, 0); run(s, 10000, DenseDistName::Gaussian, 0); From b688e53327de9708efbc212c2263225a94f38bcf Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 5 Aug 2024 09:03:11 -0400 Subject: [PATCH 20/37] add isometry_scale_factor functions --- RandBLAS/dense_skops.hh | 15 +++++++++++++++ RandBLAS/sparse_skops.hh | 13 ++++++++++++- 2 files changed, 27 
insertions(+), 1 deletion(-) diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index 4bec8153..97a81601 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -327,6 +327,21 @@ inline int64_t major_axis_length( std::max(D.n_rows, D.n_cols) : std::min(D.n_rows, D.n_cols); } +template +inline T isometry_scale_factor(DenseDist D) { + T common = std::pow((T) std::min(D.n_rows, D.n_cols), -0.5); + if (D.family == DenseDistName::Gaussian) { + return common; + } else if (D.family == DenseDistName::Uniform) { + // the variance of an r.v. distributed Unif[-1, 1] is 4/12=1/3. + return 3*common; + } else { + throw std::runtime_error("Unrecognized distribution."); + } +} + + + // ============================================================================= /// A sample from a distribution over dense sketching operators. /// diff --git a/RandBLAS/sparse_skops.hh b/RandBLAS/sparse_skops.hh index 8e1d3c3d..997d11d1 100644 --- a/RandBLAS/sparse_skops.hh +++ b/RandBLAS/sparse_skops.hh @@ -153,7 +153,18 @@ struct SparseDist { const MajorAxis major_axis = MajorAxis::Short; }; -using RandBLAS::SignedInteger; +template +inline T isometry_scale_factor(SparseDist D) { + T vec_nnz = (T) D.vec_nnz; + if (D.major_axis == MajorAxis::Short) { + return std::pow(vec_nnz, -0.5); + } else { + T minor_ax_len = (T) std::min(D.n_rows, D.n_cols); + T major_ax_len = (T) std::max(D.n_rows, D.n_cols); + return std::sqrt( major_ax_len / (vec_nnz * minor_ax_len) ); + } +} + // ============================================================================= /// A sample from a prescribed distribution over sparse matrices. From bd042c047afa82767d7d3e052bc8f327793e7cd0 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 5 Aug 2024 09:03:43 -0400 Subject: [PATCH 21/37] start of infrastructure for distortion tests --- test/handrolled_lapack.hh | 54 +++++++++++++ test/test_basic_rng/test_distortion.cc | 108 +++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 test/handrolled_lapack.hh create mode 100644 test/test_basic_rng/test_distortion.cc diff --git a/test/handrolled_lapack.hh b/test/handrolled_lapack.hh new file mode 100644 index 00000000..8453e805 --- /dev/null +++ b/test/handrolled_lapack.hh @@ -0,0 +1,54 @@ + +#pragma once + +#include +#include +#include +#include + + + +// Function to perform Cholesky decomposition on a block +template +void potrf_upper_colmajor_sequential(T* A, int64_t n, int64_t lda) { + for (int64_t j = 0; j < n; ++j) { + A[j + j * lda] = std::sqrt(A[j + j * lda]); + for (int64_t i = j + 1; i < n; ++i) { + A[j + i * lda] /= A[j + j * lda]; + } + for (int64_t k = j + 1; k < n; ++k) { + for (int64_t i = k; i < n; ++i) { + A[k + i * lda] -= A[j + i * lda] * A[j + k * lda]; + } + } + } +} + +// Function to perform blocked Cholesky decomposition +template +void potrf_upper_colmajor(T* A, int64_t n, int64_t block_size = 64) { + auto layout = blas::Layout::ColMajor; + auto uplo = blas::Uplo::Upper; + for (int64_t k = 0; k < n; k += block_size) { + int64_t current_block_size = std::min(block_size, n - k); + + int64_t offset = k + k * n; + // Perform Cholesky decomposition on the current block + potrf_upper_colmajor_sequential(A + offset, current_block_size, n); + + if (k + current_block_size < n) { + // Update the trailing submatrix + blas::trsm( + layout, blas::Side::Right, uplo, blas::Op::Trans, blas::Diag::NonUnit, + n - k - current_block_size, current_block_size, 1.0, + A + offset, n, A + k + current_block_size * n, n + ); + blas::syrk( + layout, uplo, 
blas::Op::NoTrans, + n - k - current_block_size, current_block_size, -1.0, + A + k + current_block_size * n, n, 1.0, + A + (k + current_block_size) + (k + current_block_size) * n, n + ); + } + } +} diff --git a/test/test_basic_rng/test_distortion.cc b/test/test_basic_rng/test_distortion.cc new file mode 100644 index 00000000..a8fc1f02 --- /dev/null +++ b/test/test_basic_rng/test_distortion.cc @@ -0,0 +1,108 @@ +// Copyright, 2024. See LICENSE for copyright holder information. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// (1) Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// (2) Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// (3) Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// + +#include "RandBLAS/config.h" +#include "RandBLAS/base.hh" +#include "RandBLAS/util.hh" +#include "RandBLAS/dense_skops.hh" +using RandBLAS::DenseDist; +using RandBLAS::DenseDistName; +using RandBLAS::RNGState; + +#include "rng_common.hh" + +#include +#include +#include +#include + +// function [N_] = get_min_dimension(p_, tau_) +// N_ = ceil( ((sqrt(2*log(1 ./ p_)) + 1) ./ tau_).^2 ); +// end + +int64_t get_min_dimension(double p, double tau) { + double val = std::sqrt(-2 * std::log(p)) + 1; + val /= tau; + val *= val; + return (int64_t) std::ceil(val); +} + +// TODO: implement Cholesky and Krylov subspace method for computing condition number. +/* + + +*/ + +class TestSubspaceDistortion : public ::testing::Test { + protected: + + template + void run_gaussian( + T distortion, T tau, T p_fail_bound, uint32_t key + ) { + /** + * Generate a d-by-N Gaussian matrix, where d = gamma*N, + * gamma = (r/delta)^2, and N is the smallest integer where n > N implies + * n*(r - 1 - 1/sqrt(n))^2 >= 2*log(1/p_fail_bound). + * One can verify that this value for N is given as + * N = ceil( ([sqrt(2*log(1/p)) + 1]/(r-1))^2 ) if r > 1 + * + * With probability at least 1 - p_fail_bound, the spectrum + * of the generated matrix will lay in the interval + * [1 - distortion, 1 + distortion]. + * + * ---------------------- + * Temporary notes + * ---------------------- + * Find N = min{ n : exp(-t^2 gamma n ) <= p_fail_bound }, where + * t := delta - (gamma)^{-1/2}(1 + 1/sqrt(n)), and + * gamma := (r/delta)^2. 
+ * + * Choosing gamma of this form with r > 1 ensures that no + * matter the value of delta in (0, 1) there always an N + * so that probability bound holds whenever n >= N. We know of no bounds + * available when r is in (0, 1). + * + * --------------------- + * if delta = 1/sqrt(2) then we're looking at gamma = 2*r^2. + * Or, setting r = 1 + tau for tau > 0, we're looking at gamma=2*(1+tau)^2 + * to get convergence rate tau in the sense that we fall below a target + * failure probability once + * N = ceil( ( [sqrt(2*log(1/p)) + 1] / tau )^2 ) + * + */ + int64_t N = get_min_dimension(p_fail_bound, tau); + int64_t d = std::ceil( std::pow((1 + tau) / distortion, 2) ); + DenseDist D(d, N, DenseDistName::Gaussian); + std::vector S(d*N); + RandBLAS::fill_dense(D, S.data(), {key}); + // need LAPACK ... + return; + } +}; From 2a004471e9d36d70bbdaa7304eaec610c65292c6 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 5 Aug 2024 10:44:23 -0400 Subject: [PATCH 22/37] basic QR by blocked Gram-Schmidt --- test/handrolled_lapack.hh | 82 ++++++++++++++++++++++++++++++++------- 1 file changed, 69 insertions(+), 13 deletions(-) diff --git a/test/handrolled_lapack.hh b/test/handrolled_lapack.hh index 8453e805..8b6c3d52 100644 --- a/test/handrolled_lapack.hh +++ b/test/handrolled_lapack.hh @@ -8,10 +8,14 @@ -// Function to perform Cholesky decomposition on a block template -void potrf_upper_colmajor_sequential(T* A, int64_t n, int64_t lda) { +void potrf_upper_colmajor_sequential(int64_t n, T* A, int64_t lda) { for (int64_t j = 0; j < n; ++j) { + if (A[j + j * lda] <= 0) { + std::stringstream s; + s << "Cholesky failed at index " << j << " of " << n ".\n"; + throw std::runtime_error(s.str()); + } A[j + j * lda] = std::sqrt(A[j + j * lda]); for (int64_t i = j + 1; i < n; ++i) { A[j + i * lda] /= A[j + j * lda]; @@ -24,31 +28,83 @@ void potrf_upper_colmajor_sequential(T* A, int64_t n, int64_t lda) { } } -// Function to perform blocked Cholesky decomposition template -void potrf_upper_colmajor(T* A, int64_t n, int64_t block_size = 64) { +void potrf_upper_colmajor(int64_t n, T* A, int64_t b = 64) { auto layout = blas::Layout::ColMajor; auto uplo = blas::Uplo::Upper; - for (int64_t k = 0; k < n; k += block_size) { - int64_t current_block_size = std::min(block_size, n - k); + for (int64_t k = 0; k < n; k += b) { + int64_t curr_b = std::min(b, n - k); int64_t offset = k + k * n; // Perform Cholesky decomposition on the current block - potrf_upper_colmajor_sequential(A + offset, current_block_size, n); + potrf_upper_colmajor_sequential(curr_b, A + offset, n); - if (k + current_block_size < n) { + if (k + curr_b < n) { // Update the trailing submatrix blas::trsm( layout, blas::Side::Right, uplo, blas::Op::Trans, blas::Diag::NonUnit, - n - k - current_block_size, current_block_size, 1.0, - A + offset, n, A + k + current_block_size * n, n + n - k - curr_b, curr_b, 1.0, + A + offset, n, A + k + curr_b * n, n ); blas::syrk( layout, uplo, blas::Op::NoTrans, - n - k - current_block_size, current_block_size, -1.0, - A + k + current_block_size * n, n, 1.0, - A + (k + current_block_size) + (k + current_block_size) * n, n + n - k - curr_b, curr_b, -1.0, + A + k + curr_b * n, n, 1.0, + A + (k + curr_b) + (k + curr_b) * n, n ); } } } + +template +void chol_qr_colmajor(int64_t m, int64_t n, T* A, T* R, int64_t chol_block_size = 32) { + int64_t lda = m; + auto layout = blas::Layout::ColMajor; + auto uplo = blas::Uplo::Upper; + std::fill(R, R + n*n, (T) 0.0); + blas::syrk(layout, uplo, blas::Op::Trans, n, m, (T) 1.0, A, lda, 
(T) 0.0, R, n); + potrf_upper_colmajor(n, R, chol_block_size); + blas::trsm(layout, blas::Side::Right, uplo, blas::Op::NoTrans, blas::Diag::NonUnit, m, n, (T) 1.0, R, n, A, lda); +} + +template +void block_gram_schmidt(int64_t m, int64_t n, T* A, std::vector &work, int64_t b = 64) { + b = std::min(b, n); + if (work.size() < n*b) { + work.resize(n*b); + } + auto layout = blas::Layout::ColMajor; + using blas::Op; + chol_qr_colmajor(m, b, A, work.data(), b); + T one = 1.0; + T zero = 0.0; + if (b < n) { + int64_t n_trail = n - b; + T* A1 = A; // A[:, :b] + T* A2 = A + b * m; // A[:, b:] + // Compute A1tA2 := A1' * A2 and then update A2 -= A1 * A1tA2 + T* A1tA2 = work.data(); + blas::gemm(layout, Op::Trans, Op::NoTrans, b, n_trail, m, one, A1, m, A2, m, zero, A1tA2, b); + blas::gemm(layout, Op::NoTrans, Op::NoTrans, m, n_trail, b, -one, A1, m, A1tA2, b, one, A2, m); + block_gram_schmidt(m, n - b, A + b * m, work, b); + } +} + +template +void block_chol_qr(int64_t n, T* A, T* R, int64_t b = 64) { + b = std::min(b, n); + std::vector work_orth(n*b); + std::vector work_R(n*n); + T* A_copy = work_R.data(); + blas::copy(n*n, A, 1, A_copy, 1); + block_gram_schmidt(n, n, A, work_orth, b); + using blas::Layout; + using blas::Op; + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, n, n, (T)1.0, A, n, A_copy, n, (T) 0.0, R, n); +} + +// TODOs: +// 1) change block_gram_schmidt(...) to use modified Gram-Schmidt instead of classic Gram-Schmit. +// 2) merge block_gram_schmidt and block_chol_qr so that block_gram_schmidt builds R as it goes. +// --> This might require some attention to submatrix pointers for R. +/// From 2d89a1feda29f2f54d5a6fffe19ff97bd0e4fcd7 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 5 Aug 2024 11:01:06 -0400 Subject: [PATCH 23/37] implement QR via block classical Gram-Schmidt --- test/handrolled_lapack.hh | 45 ++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/test/handrolled_lapack.hh b/test/handrolled_lapack.hh index 8b6c3d52..a8e6cefd 100644 --- a/test/handrolled_lapack.hh +++ b/test/handrolled_lapack.hh @@ -7,7 +7,6 @@ #include - template void potrf_upper_colmajor_sequential(int64_t n, T* A, int64_t lda) { for (int64_t j = 0; j < n; ++j) { @@ -68,43 +67,45 @@ void chol_qr_colmajor(int64_t m, int64_t n, T* A, T* R, int64_t chol_block_size } template -void block_gram_schmidt(int64_t m, int64_t n, T* A, std::vector &work, int64_t b = 64) { +void qr_block_cgs(int64_t m, int64_t n, T* A, T* R, std::vector& work, int64_t b) { + if (n > m) + throw std::runtime_error("Invalid dimensions."); + b = std::min(b, n); - if (work.size() < n*b) { - work.resize(n*b); + if (work.size() < n * b) { + work.resize(n * b); } auto layout = blas::Layout::ColMajor; using blas::Op; chol_qr_colmajor(m, b, A, work.data(), b); - T one = 1.0; - T zero = 0.0; + T one = (T) 1.0; + T zero = (T) 0.0; + T* R1 = work.data(); + for (int64_t j = 0; j < b; ++j) + blas::copy(b, R1 + b*j, 1, R + n*j, 1); + if (b < n) { int64_t n_trail = n - b; T* A1 = A; // A[:, :b] - T* A2 = A + b * m; // A[:, b:] + T* A2 = A + m * b; // A[:, b:] + T* R2 = R + n * b; // R[:b, b:] // Compute A1tA2 := A1' * A2 and then update A2 -= A1 * A1tA2 T* A1tA2 = work.data(); blas::gemm(layout, Op::Trans, Op::NoTrans, b, n_trail, m, one, A1, m, A2, m, zero, A1tA2, b); blas::gemm(layout, Op::NoTrans, Op::NoTrans, m, n_trail, b, -one, A1, m, A1tA2, b, one, A2, m); - block_gram_schmidt(m, n - b, A + b * m, work, b); + // Copy A1tA2 to the appropriate place in R + for (int64_t j = 0; j < 
n_trail; ++j) { + blas::copy(b, A1tA2 + j*b, 1, R2 + j*n, 1); + } + qr_block_cgs(m, n_trail, A + b * m, R + b * n + b, work, b); } } template -void block_chol_qr(int64_t n, T* A, T* R, int64_t b = 64) { +void qr_block_cgs(int64_t n, T* A, T* R, int64_t b = 64) { b = std::min(b, n); - std::vector work_orth(n*b); - std::vector work_R(n*n); - T* A_copy = work_R.data(); - blas::copy(n*n, A, 1, A_copy, 1); - block_gram_schmidt(n, n, A, work_orth, b); - using blas::Layout; - using blas::Op; - blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, n, n, (T)1.0, A, n, A_copy, n, (T) 0.0, R, n); + std::vector work(n * b); + std::fill(R, R + n * n, (T) 0.0); + qr_block_cgs(n, n, A, R, work, b); } -// TODOs: -// 1) change block_gram_schmidt(...) to use modified Gram-Schmidt instead of classic Gram-Schmit. -// 2) merge block_gram_schmidt and block_chol_qr so that block_gram_schmidt builds R as it goes. -// --> This might require some attention to submatrix pointers for R. -/// From ea1b0b665effd3274c79db16ca8dd07f4474807f Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 5 Aug 2024 11:02:34 -0400 Subject: [PATCH 24/37] rename functions --- test/handrolled_lapack.hh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/handrolled_lapack.hh b/test/handrolled_lapack.hh index a8e6cefd..96a8e4a2 100644 --- a/test/handrolled_lapack.hh +++ b/test/handrolled_lapack.hh @@ -8,7 +8,7 @@ template -void potrf_upper_colmajor_sequential(int64_t n, T* A, int64_t lda) { +void potrf_upper_sequential(int64_t n, T* A, int64_t lda) { for (int64_t j = 0; j < n; ++j) { if (A[j + j * lda] <= 0) { std::stringstream s; @@ -28,7 +28,7 @@ void potrf_upper_colmajor_sequential(int64_t n, T* A, int64_t lda) { } template -void potrf_upper_colmajor(int64_t n, T* A, int64_t b = 64) { +void potrf_upper(int64_t n, T* A, int64_t b = 64) { auto layout = blas::Layout::ColMajor; auto uplo = blas::Uplo::Upper; for (int64_t k = 0; k < n; k += b) { @@ -36,7 +36,7 @@ void potrf_upper_colmajor(int64_t n, T* A, int64_t b = 64) { int64_t offset = k + k * n; // Perform Cholesky decomposition on the current block - potrf_upper_colmajor_sequential(curr_b, A + offset, n); + potrf_upper_sequential(curr_b, A + offset, n); if (k + curr_b < n) { // Update the trailing submatrix @@ -56,13 +56,13 @@ void potrf_upper_colmajor(int64_t n, T* A, int64_t b = 64) { } template -void chol_qr_colmajor(int64_t m, int64_t n, T* A, T* R, int64_t chol_block_size = 32) { +void chol_qr(int64_t m, int64_t n, T* A, T* R, int64_t chol_block_size = 32) { int64_t lda = m; auto layout = blas::Layout::ColMajor; auto uplo = blas::Uplo::Upper; std::fill(R, R + n*n, (T) 0.0); blas::syrk(layout, uplo, blas::Op::Trans, n, m, (T) 1.0, A, lda, (T) 0.0, R, n); - potrf_upper_colmajor(n, R, chol_block_size); + potrf_upper(n, R, chol_block_size); blas::trsm(layout, blas::Side::Right, uplo, blas::Op::NoTrans, blas::Diag::NonUnit, m, n, (T) 1.0, R, n, A, lda); } @@ -77,7 +77,7 @@ void qr_block_cgs(int64_t m, int64_t n, T* A, T* R, std::vector& work, int64_ } auto layout = blas::Layout::ColMajor; using blas::Op; - chol_qr_colmajor(m, b, A, work.data(), b); + chol_qr(m, b, A, work.data(), b); T one = (T) 1.0; T zero = (T) 0.0; T* R1 = work.data(); From b726e7e7dda54797209321fa7a5a796aca02ec84 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 5 Aug 2024 11:42:22 -0400 Subject: [PATCH 25/37] function to compute eigenvalues by QR iteration --- test/handrolled_lapack.hh | 53 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git 
a/test/handrolled_lapack.hh b/test/handrolled_lapack.hh index 96a8e4a2..5d422918 100644 --- a/test/handrolled_lapack.hh +++ b/test/handrolled_lapack.hh @@ -1,6 +1,8 @@ #pragma once +#include "RandBLAS/util.hh" + #include #include #include @@ -101,6 +103,7 @@ void qr_block_cgs(int64_t m, int64_t n, T* A, T* R, std::vector& work, int64_ } } + template void qr_block_cgs(int64_t n, T* A, T* R, int64_t b = 64) { b = std::min(b, n); @@ -109,3 +112,53 @@ void qr_block_cgs(int64_t n, T* A, T* R, int64_t b = 64) { qr_block_cgs(n, n, A, R, work, b); } +/** + * Use QR iteration to compute all eigenvalues of a positive definite matrix A. + * Run for at most "max_iters" iteration. + * + * In each iteration we'll perform some non-standard work to estimate the accuracy of + * the eigenvalues. To explain, suppose "R" is our current triangular factor. We'll + * compute the Gram matrix G = R'R. The eigenvalues of G are the squares of the + * singular values of R, and the singular values of R are equal to the eigenvaleus of A. + * Therefore our estimate for the eigenvalues of A will be the square root of diag(G). + * We can terminate the algorithm once the relative radius of each Gershgorin disc + * falls below reltol. + * + * + */ +template +void eig_qr_iteration(int64_t n, T* A, T* eigvals, T reltol, int64_t max_iters, int64_t b = 8) { + std::vector workspace(2 * n * n); + T* R = workspace.data(); + T* G = R + n * n; + + using blas::Op; + using blas::Layout; + using blas::Uplo; + + for (int64_t iter = 0; iter < max_iters; ++iter) { + qr_block_cgs(n, n, A, R, workspace, b); + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, n, (T) 1.0, R, n, (T) 0.0, G, n); + RandBLAS::util::symmetrize(Layout::ColMajor, Uplo::Upper, G, n, n); + for (int64_t i = 0; i < n; ++i) + eigvals[i] = std::sqrt(G[i * n + i]); + + bool converged = true; + int64_t i = 0; + while(i < n && converged) { + T radius = 0.0; + for (int64_t j = 0; j < n; ++j) { + if (i != j) + radius += std::abs(G[i * n + j]); + } + converged = radius <= reltol*G[i * n + i]; + ++i; + } + if (converged) + break; + // Update A = R * Q + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, n, n, n, (T) 1.0, R, n, A, n, (T) 0.0, G, n); + blas::copy(n * n, G, 1, A, 1); + } + return; +} From a0557d18f4c8fc0dac83ee113d1500e538b0ecbe Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 9 Aug 2024 10:31:36 -0400 Subject: [PATCH 26/37] cholqr2, block_cgs2, cholesky iteration for eigenvalues of positive definite matrices --- RandBLAS/dense_skops.hh | 2 +- RandBLAS/util.hh | 28 +- test/CMakeLists.txt | 12 + test/comparison.hh | 11 + test/handrolled_lapack.hh | 215 ++++++++---- test/test_handrolled_lapack.cc | 327 ++++++++++++++++++ .../test_sketch_symmetric.cc | 2 +- 7 files changed, 529 insertions(+), 68 deletions(-) create mode 100644 test/test_handrolled_lapack.cc diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index 97a81601..887ed656 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -334,7 +334,7 @@ inline T isometry_scale_factor(DenseDist D) { return common; } else if (D.family == DenseDistName::Uniform) { // the variance of an r.v. distributed Unif[-1, 1] is 4/12=1/3. 
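// NOTE on the one-line fix just below: if X ~ Unif[-1, 1], then Var(X) = (1 - (-1))^2 / 12 = 1/3,
// so std(X) = 1/sqrt(3). Rescaling the samples by sqrt(3) therefore gives unit variance, which is
// what the isometry scale factor needs; the old factor of 3 would have scaled the standard
// deviation to sqrt(3) rather than 1.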
- return 3*common; + return std::sqrt(3)*common; } else { throw std::runtime_error("Unrecognized distribution."); } diff --git a/RandBLAS/util.hh b/RandBLAS/util.hh index 65de44af..5ad91711 100644 --- a/RandBLAS/util.hh +++ b/RandBLAS/util.hh @@ -59,7 +59,7 @@ void safe_scal(int64_t n, T a, T* x, int64_t inc_x) { } template -void print_colmaj(int64_t n_rows, int64_t n_cols, T *a, char label[]) +void print_colmaj(int64_t n_rows, int64_t n_cols, T *a, const char label[]) { int64_t i, j; T val; @@ -118,8 +118,7 @@ std::string type_name() { // call as type_name() } template -void symmetrize(blas::Layout layout, blas::Uplo uplo, T* A, int64_t n, int64_t lda) { - +void symmetrize(blas::Layout layout, blas::Uplo uplo, int64_t n, T* A, int64_t lda) { auto [inter_row_stride, inter_col_stride] = layout_to_strides(layout, lda); #define matA(_i, _j) A[(_i)*inter_row_stride + (_j)*inter_col_stride] if (uplo == blas::Uplo::Upper) { @@ -141,6 +140,29 @@ void symmetrize(blas::Layout layout, blas::Uplo uplo, T* A, int64_t n, int64_t l return; } +template +void overwrite_triangle(blas::Layout layout, blas::Uplo to_overwrite, int64_t n, int64_t strict_offset, T val, T* A, int64_t lda) { + auto [inter_row_stride, inter_col_stride] = layout_to_strides(layout, lda); + #define matA(_i, _j) A[(_i)*inter_row_stride + (_j)*inter_col_stride] + if (to_overwrite == blas::Uplo::Upper) { + for (int64_t i = 0; i < n; ++i) { + for (int64_t j = i + strict_offset; j < n; ++j) { + matA(i,j) = val; + } + } + } else if (to_overwrite == blas::Uplo::Lower) { + for (int64_t i = 0; i < n; ++i) { + for (int64_t j = i + strict_offset; j < n; ++j) { + matA(j,i) = val; + } + } + } else { + throw std::runtime_error("Invalid argument for UPLO."); + } + #undef matA + return; +} + template void require_symmetric(blas::Layout layout, const T* A, int64_t n, int64_t lda, T tol) { if (tol < 0) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 954689cb..18a719b1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -61,6 +61,18 @@ if (GTest_FOUND) file(COPY test_basic_rng/r123_kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/../test/) file(COPY test_basic_rng/r123_kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/) + ##################################################################### + # + # Meta tests (test our testing infrastructure) + # + ##################################################################### + + add_executable(MetaRandBLAS_tests + test_handrolled_lapack.cc + ) + target_link_libraries(MetaRandBLAS_tests RandBLAS GTest::GTest GTest::Main) + gtest_discover_tests(MetaRandBLAS_tests) + endif() message(STATUS "Checking for regression tests ... ${tmp}") diff --git a/test/comparison.hh b/test/comparison.hh index 15efd9f1..a02cf4a9 100644 --- a/test/comparison.hh +++ b/test/comparison.hh @@ -81,6 +81,17 @@ bool approx_equal(T A, T B, std::ostream &str, return false; } +template +void approx_equal(T A, T B, const char* testName, const char* fileName, int lineNo, T atol, T rtol) { + std::ostringstream oss; + if (!approx_equal(A, B, oss, atol, rtol)) { + FAIL() << std::endl << fileName << ":" << lineNo << std::endl + << testName << std::endl << "Test failed. " << oss.str() << std::endl; + oss.str(""); + } + return; +} + /** Test two arrays are approximately equal elementwise. 
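// NOTE on the overwrite_triangle helper added to RandBLAS/util.hh above: with strict_offset = 1
// (the usual call in this patch series), only entries strictly outside the main diagonal on the
// chosen side are overwritten, and the diagonal itself is left alone. For example, zeroing the
// Lower triangle of a 3x3 column-major A gives
//
//     [ a11 a12 a13 ]         [ a11 a12 a13 ]
//     [ a21 a22 a23 ]  --->   [  0  a22 a23 ]
//     [ a31 a32 a33 ]         [  0   0  a33 ]
//
// which is how the upper-triangular Cholesky and QR factors get cleaned up before they are
// multiplied or compared in the tests below.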
diff --git a/test/handrolled_lapack.hh b/test/handrolled_lapack.hh index 5d422918..859fcff3 100644 --- a/test/handrolled_lapack.hh +++ b/test/handrolled_lapack.hh @@ -8,13 +8,16 @@ #include #include +namespace hr_lapack { template void potrf_upper_sequential(int64_t n, T* A, int64_t lda) { + // Cache access is much better if the matrix is lower triangular. + // Could implement as lower triangular and then call transpose_square. for (int64_t j = 0; j < n; ++j) { if (A[j + j * lda] <= 0) { std::stringstream s; - s << "Cholesky failed at index " << j << " of " << n ".\n"; + s << "Cholesky failed at index " << j << " of " << n << "."; throw std::runtime_error(s.str()); } A[j + j * lda] = std::sqrt(A[j + j * lda]); @@ -30,86 +33,141 @@ void potrf_upper_sequential(int64_t n, T* A, int64_t lda) { } template -void potrf_upper(int64_t n, T* A, int64_t b = 64) { +void potrf_upper(int64_t n, T* A, int64_t lda, int64_t b = 64) { + randblas_require(b > 0); auto layout = blas::Layout::ColMajor; auto uplo = blas::Uplo::Upper; - for (int64_t k = 0; k < n; k += b) { - int64_t curr_b = std::min(b, n - k); - - int64_t offset = k + k * n; - // Perform Cholesky decomposition on the current block - potrf_upper_sequential(curr_b, A + offset, n); - - if (k + curr_b < n) { - // Update the trailing submatrix - blas::trsm( - layout, blas::Side::Right, uplo, blas::Op::Trans, blas::Diag::NonUnit, - n - k - curr_b, curr_b, 1.0, - A + offset, n, A + k + curr_b * n, n - ); - blas::syrk( - layout, uplo, blas::Op::NoTrans, - n - k - curr_b, curr_b, -1.0, - A + k + curr_b * n, n, 1.0, - A + (k + curr_b) + (k + curr_b) * n, n - ); - } + int64_t curr_b = std::min(b, n); + // A = [A11, A12] + // [* , A22] + potrf_upper_sequential(curr_b, A, lda); + // A = [R11, A12] + // [* , A22] + if (curr_b < n) { + T* R11 = A; // shape (curr_b, curr_b ) + T* A12 = R11 + curr_b*lda; // shape (curr_b, n - curr_b) + T* A22 = A12 + curr_b; // shape (n - curr_b, n - curr_b) + blas::trsm( + layout, blas::Side::Left, uplo, blas::Op::Trans, blas::Diag::NonUnit, + curr_b, n - curr_b, (T) 1.0, R11, lda, A12, lda + ); + blas::syrk(layout, uplo, blas::Op::Trans, + n - curr_b, curr_b, (T) -1.0, A12, lda, (T) 1.0, A22, lda + ); + potrf_upper(n-curr_b, A22, lda, b); } + return; } +// If twice=true, then R will also be used as workspace, and must have length at least 2*n*n. 
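// NOTE: the recursion in potrf_upper above is the standard partitioned Cholesky identity. Writing
//     A = [ A11  A12 ]        R = [ R11  R12 ]
//         [ A12'  A22 ]           [  0   R22 ]
// with A = R'R, one step computes R11 = chol(A11), R12 = R11^{-T} A12 (the TRSM), and then
// recurses on the Schur complement A22 - R12'R12 (the SYRK update with alpha = -1).
//
// A minimal usage sketch follows. It is illustrative only: check_potrf_upper_on_diag is a
// hypothetical helper, not part of this patch; it assumes <vector>, <cmath>, and <limits> are in
// scope, and it mirrors the diagonal-matrix test added later in this series.
template <typename T>
void check_potrf_upper_on_diag(int64_t n, int64_t b) {
    std::vector<T> A(n * n, (T) 0.0);
    for (int64_t i = 0; i < n; ++i)
        A[i + i * n] = (T) (i + 1);                // A = diag(1, 2, ..., n), trivially SPD
    hr_lapack::potrf_upper(n, A.data(), n, b);     // overwrites the upper triangle with R
    for (int64_t i = 0; i < n; ++i) {
        T expect = std::sqrt((T) (i + 1));
        T err = std::abs(A[i + i * n] - expect);
        randblas_require(err <= 10 * std::numeric_limits<T>::epsilon() * expect);
    }
}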
template -void chol_qr(int64_t m, int64_t n, T* A, T* R, int64_t chol_block_size = 32) { +void chol_qr(int64_t m, int64_t n, T* A, T* R, int64_t chol_block_size = 32, bool twice = false) { int64_t lda = m; auto layout = blas::Layout::ColMajor; auto uplo = blas::Uplo::Upper; std::fill(R, R + n*n, (T) 0.0); blas::syrk(layout, uplo, blas::Op::Trans, n, m, (T) 1.0, A, lda, (T) 0.0, R, n); - potrf_upper(n, R, chol_block_size); + potrf_upper(n, R, n, chol_block_size); blas::trsm(layout, blas::Side::Right, uplo, blas::Op::NoTrans, blas::Diag::NonUnit, m, n, (T) 1.0, R, n, A, lda); + if (twice) { + T* R2 = R + n*n; + chol_qr(m, n, A, R2, chol_block_size, false); + RandBLAS::util::overwrite_triangle(layout, blas::Uplo::Lower, n, 1, (T) 0.0, R, n); + // now overwrite R = R2 R with TRMM (saying R2 is the triangular matrix) + blas::trmm(layout, blas::Side::Left, uplo, blas::Op::NoTrans, blas::Diag::NonUnit, n, n, (T) 1.0, R2, n, R, n); + } + return; } +// work must be length at least max(n, 2*b) * b template -void qr_block_cgs(int64_t m, int64_t n, T* A, T* R, std::vector& work, int64_t b) { +void qr_block_cgs(int64_t m, int64_t n, T* A, T* R, int64_t ldr, T* work, int64_t b) { if (n > m) throw std::runtime_error("Invalid dimensions."); + randblas_require(ldr >= n); b = std::min(b, n); - if (work.size() < n * b) { - work.resize(n * b); - } auto layout = blas::Layout::ColMajor; using blas::Op; - chol_qr(m, b, A, work.data(), b); + chol_qr(m, b, A, work, b, true); T one = (T) 1.0; T zero = (T) 0.0; - T* R1 = work.data(); + T* R1 = work; for (int64_t j = 0; j < b; ++j) - blas::copy(b, R1 + b*j, 1, R + n*j, 1); + blas::copy(b, R1 + b*j, 1, R + ldr*j, 1); if (b < n) { int64_t n_trail = n - b; - T* A1 = A; // A[:, :b] - T* A2 = A + m * b; // A[:, b:] - T* R2 = R + n * b; // R[:b, b:] + T* A1 = A; // A[:, :b] + T* A2 = A + m * b; // A[:, b:] + T* R2 = R + ldr * b; // R[:b, b:] // Compute A1tA2 := A1' * A2 and then update A2 -= A1 * A1tA2 - T* A1tA2 = work.data(); + T* A1tA2 = work; blas::gemm(layout, Op::Trans, Op::NoTrans, b, n_trail, m, one, A1, m, A2, m, zero, A1tA2, b); blas::gemm(layout, Op::NoTrans, Op::NoTrans, m, n_trail, b, -one, A1, m, A1tA2, b, one, A2, m); // Copy A1tA2 to the appropriate place in R for (int64_t j = 0; j < n_trail; ++j) { - blas::copy(b, A1tA2 + j*b, 1, R2 + j*n, 1); + blas::copy(b, A1tA2 + j*b, 1, R2 + j*ldr, 1); } - qr_block_cgs(m, n_trail, A + b * m, R + b * n + b, work, b); + qr_block_cgs(m, n_trail, A2, R2 + b, ldr, work, b); } + return; } +// We'll resize bigwork to be length at least (n*n + max(n, 2*b) * b). 
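// NOTE: chol_qr above is "CholeskyQR": it forms the Gram matrix G = A'A, factors G = R'R, and
// overwrites A with Q = A R^{-1} via TRSM. In floating point the orthogonality error of a single
// pass grows roughly like cond(A)^2 * eps, so the twice=true branch runs a second pass
// ("CholeskyQR2") and recombines the two triangular factors as R := R2 * R1 with TRMM, which
// restores orthogonality to O(eps) provided the first Cholesky does not break down.
// qr_block_cgs above applies chol_qr panel by panel and projects the trailing columns against
// each finished panel (classical block Gram-Schmidt); qr_block_cgs2 below repeats the whole
// sweep once and merges the two R factors, again to recover orthogonality lost to rounding.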
template -void qr_block_cgs(int64_t n, T* A, T* R, int64_t b = 64) { +void qr_block_cgs2(int64_t m, int64_t n, T* A, T* R, std::vector &bigwork, int64_t b = 64) { b = std::min(b, n); - std::vector work(n * b); + int64_t littlework_size = std::max(n, 2*b) * b; + int64_t bigwork_size = n*n + littlework_size; + if ((int64_t)bigwork.size() < bigwork_size) { + bigwork.resize(bigwork_size); + } + T* R2 = bigwork.data(); + T* littlework = R2 + n*n; std::fill(R, R + n * n, (T) 0.0); - qr_block_cgs(n, n, A, R, work, b); + qr_block_cgs(m, n, A, R, n, littlework, b); + RandBLAS::util::overwrite_triangle(blas::Layout::ColMajor, blas::Uplo::Lower, n, 1, (T) 0.0, R, n); + qr_block_cgs(m, n, A, R2, n, littlework, b); + blas::trmm( + blas::Layout::ColMajor, blas::Side::Left, blas::Uplo::Upper, blas::Op::NoTrans, blas::Diag::NonUnit, + n, n, 1.0, R2, n, R, n + ); + return; +} + +template +bool extremal_eigvals_converged_gershgorin(int64_t n, T* G, T tol) { + int64_t i_lb = 0; + int64_t i_ub = 0; + T upper = -std::numeric_limits::infinity(); + T lower = std::numeric_limits::infinity(); + for (int64_t i = 0; i < n; ++i) { + T radius = 0.0; + T center = G[i + i*n]; + for (int64_t j = 0; j < n; ++j) { + if (i != j) { + radius += std::abs(G[i * n + j]); + } + } + if (center + radius >= upper) { + i_ub = i; + upper = center + radius; + } + if (center - radius <= lower) { + i_lb = i; + lower = center - radius; + } + } + T lower_center = G[i_lb + i_lb*n]; + T upper_center = G[i_ub + i_ub*n]; + T lower_radius = std::abs(lower - lower_center); + T upper_radius = std::abs(upper - upper_center); + bool lower_converged = lower_radius <= tol * lower_center; + bool upper_converged = upper_radius <= tol * upper_center; + bool converged = lower_converged && upper_converged; + return converged; } /** @@ -121,44 +179,75 @@ void qr_block_cgs(int64_t n, T* A, T* R, int64_t b = 64) { * compute the Gram matrix G = R'R. The eigenvalues of G are the squares of the * singular values of R, and the singular values of R are equal to the eigenvaleus of A. * Therefore our estimate for the eigenvalues of A will be the square root of diag(G). - * We can terminate the algorithm once the relative radius of each Gershgorin disc - * falls below reltol. + * We can terminate the algorithm once the relative radii of G's Gershgorin discs + * fall below reltol^2. 
* * */ template -void eig_qr_iteration(int64_t n, T* A, T* eigvals, T reltol, int64_t max_iters, int64_t b = 8) { - std::vector workspace(2 * n * n); - T* R = workspace.data(); - T* G = R + n * n; +int64_t posdef_eig_qr_iteration(int64_t n, T* A, T* eigvals, T reltol, int64_t max_iters, int64_t b = 8) { + b = std::min(b, n); + + int64_t subroutine_work_size = n*n + std::max(n, 2*b) * b; + std::vector work(n*n + subroutine_work_size); + T* subroutine_work = work.data(); + T* G = subroutine_work; + T* R = subroutine_work + subroutine_work_size; using blas::Op; using blas::Layout; using blas::Uplo; - for (int64_t iter = 0; iter < max_iters; ++iter) { - qr_block_cgs(n, n, A, R, workspace, b); + T sq_reltol = reltol * reltol; + int64_t iter = 0; + bool converged = false; + for (; iter < max_iters; ++iter) { + qr_block_cgs2(n, n, A, R, work, b); blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, n, (T) 1.0, R, n, (T) 0.0, G, n); - RandBLAS::util::symmetrize(Layout::ColMajor, Uplo::Upper, G, n, n); + RandBLAS::util::symmetrize(Layout::ColMajor, Uplo::Upper, n, G, n); for (int64_t i = 0; i < n; ++i) eigvals[i] = std::sqrt(G[i * n + i]); - - bool converged = true; - int64_t i = 0; - while(i < n && converged) { - T radius = 0.0; - for (int64_t j = 0; j < n; ++j) { - if (i != j) - radius += std::abs(G[i * n + j]); - } - converged = radius <= reltol*G[i * n + i]; - ++i; - } + converged = extremal_eigvals_converged_gershgorin(n, G, sq_reltol); if (converged) break; // Update A = R * Q blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, n, n, n, (T) 1.0, R, n, A, n, (T) 0.0, G, n); blas::copy(n * n, G, 1, A, 1); } - return; + return (converged) ? iter : -iter; +} + +/** + * Use Cholesky iteration to compute all eigenvalues of a positive definite matrix A. + * Run for at most "max_iters" iteration. + * + * Use the Gershgorin circle theorem as a stopping criteria. + * + */ +template +int64_t posdef_eig_chol_iteration(int64_t n, T* A, T* eigvals, T reltol, int64_t max_iters, int64_t b = 8) { + b = std::min(b, n); + std::vector work(n*n); + T* G = work.data(); + using blas::Op; + using blas::Layout; + using blas::Uplo; + + int64_t iter = 0; + bool converged = false; + for (; iter < max_iters; ++iter) { + potrf_upper(n, A, n, b); + RandBLAS::util::overwrite_triangle(Layout::ColMajor, Uplo::Lower, n, 1, (T) 0.0, A, n); + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::NoTrans, n, n, (T) 1.0, A, n, (T) 0.0, G, n); + RandBLAS::util::symmetrize(Layout::ColMajor, Uplo::Upper, n, G, n); + for (int64_t i = 0; i < n; ++i) + eigvals[i] = G[i * n + i]; + converged = extremal_eigvals_converged_gershgorin(n, G, reltol); + if (converged) + break; + blas::copy(n * n, G, 1, A, 1); + } + return (converged) ? 
iter : -iter; +} + } diff --git a/test/test_handrolled_lapack.cc b/test/test_handrolled_lapack.cc new file mode 100644 index 00000000..081701ee --- /dev/null +++ b/test/test_handrolled_lapack.cc @@ -0,0 +1,327 @@ + + +#include +#include + +#include "RandBLAS/util.hh" +#include "handrolled_lapack.hh" +#include "test_basic_rng/rng_common.hh" +#include "comparison.hh" +#include "RandBLAS/config.h" +#include "RandBLAS/base.hh" +#include "RandBLAS/util.hh" +#include "RandBLAS/dense_skops.hh" +using RandBLAS::DenseDist; +using RandBLAS::DenseDistName; +using RandBLAS::RNGState; + +#include +#include +#include +#include + +// MARK: Cholesky + +class TestHandrolledCholesky : public ::testing::Test { + protected: + + template + void run_factor_gram_matrix(int n, FUNC &cholfunc, uint32_t key) { + auto layout = blas::Layout::ColMajor; + int64_t m = 2*n; + DenseDist D(m, n); + std::vector A(n*n); + std::vector B(m*n); + T iso_scale = std::pow(RandBLAS::isometry_scale_factor(D), 2); + RNGState state(key); + RandBLAS::fill_dense(D, B.data(), state); + std::vector C(B); + + // define positive definite A + blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, n, m, iso_scale, B.data(), m, 0.0, A.data(), n); + RandBLAS::util::symmetrize(layout, blas::Uplo::Upper, n, A.data(), n); + // overwrite A by its upper-triangular cholesky factor + cholfunc(n, A.data()); + RandBLAS::util::overwrite_triangle(layout, blas::Uplo::Lower, n, 1, (T) 0.0, A.data(), n); + + // compute the gram matrix of A's cholesky factor + blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, n, n, 1.0, A.data(), n, 0.0, B.data(), n); + RandBLAS::util::symmetrize(layout, blas::Uplo::Upper, n, B.data(), n); + // recompute A + blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, n, m, iso_scale, C.data(), m, 0.0, A.data(), n); + RandBLAS::util::symmetrize(layout, blas::Uplo::Upper, n, A.data(), n); + + test::comparison::matrices_approx_equal(layout, blas::Op::NoTrans, n, n, B.data(), n, A.data(), n, + __PRETTY_FUNCTION__, __FILE__, __LINE__ + ); + } + + template + void run_sequential_factor_gram_matrix(int n, uint32_t key) { + auto cholfunc = [](int64_t n, T* A) { hr_lapack::potrf_upper_sequential(n, A, n); }; + run_factor_gram_matrix(n, cholfunc, key); + } + + template + void run_blocked_factor_gram_matrix(int n, int b, uint32_t key) { + auto cholfunc = [b](int64_t n, T* A) { hr_lapack::potrf_upper(n, A, n, b); }; + run_factor_gram_matrix(n, cholfunc, key); + } + + template + void run_blocked_factor_diagonal(int n, int b) { + auto layout = blas::Layout::ColMajor; + std::vector A(n*n, 0.0); + std::vector B(n*n); + for (int i = 0; i < n; ++i) { + A[i+i*n] = 1.0/((T) i + 1.0); + B[i+i*n] = std::sqrt(A[i+i*n]); + } + hr_lapack::potrf_upper(n, A.data(), n, b); + test::comparison::matrices_approx_equal(layout, blas::Op::NoTrans, n, n, B.data(), n, A.data(), n, + __PRETTY_FUNCTION__, __FILE__, __LINE__ + ); + } + +}; + +TEST_F(TestHandrolledCholesky, sequential_random_gram_matrix) { + for (int i = 1; i <= 64; ++i) { + run_sequential_factor_gram_matrix(i, 0); + } + for (int i = 1; i <= 64; ++i) { + run_sequential_factor_gram_matrix(i, 1); + } +} + +TEST_F(TestHandrolledCholesky, blocked_diagonal) { + for (int i = 2; i <= 64; i+=2) { + run_blocked_factor_diagonal(i, 2); + } + for (int i = 2; i <= 64; i+=2) { + run_blocked_factor_diagonal(i, 3); + } + for (int i = 2; i <= 256; i*=2) { + run_blocked_factor_diagonal(i, i/2); + } + for (int i = 16; i <= 256; i*=2) { + run_blocked_factor_diagonal(i, i-7); + } + for (int i = 16; i <= 256; i*=2) { + 
run_blocked_factor_diagonal(i-3, i/4); + } +} + +TEST_F(TestHandrolledCholesky, blocked_random_gram_matrix) { + for (int i = 2; i <= 16; i+=2) { + run_blocked_factor_gram_matrix(i, 1, 0); + } + for (int i = 2; i <= 64; i+=2) { + run_blocked_factor_gram_matrix(i, 2, 0); + } + for (int i = 2; i <= 96; i*=2) { + run_blocked_factor_gram_matrix(i, i/2, 0); + } + for (int i = 16; i <= 96; i*=2) { + run_blocked_factor_gram_matrix(i, i-5, 0); + } + for (int i = 16; i <= 96; i*=2) { + run_blocked_factor_gram_matrix(i-5, 7, 0); + } +} + + + +template +void verify_product(int64_t m, int64_t n, int64_t k, const T* A, const T* B, const T* C) { + using std::vector; + int64_t lda = m; + int64_t ldb = k; + int64_t ldc = m; + vector C_work(m*n, 0.0); + blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::NoTrans, m, n, k, (T) 1.0, A, lda, B, ldb, (T) 0.0, C_work.data(), ldc); + + vector A_copy(m*k, 0.0); + vector B_copy(k*n, 0.0); + blas::copy(m*k, A, 1, A_copy.data(), 1); + blas::copy(k*n, B, 1, B_copy.data(), 1); + std::for_each(A_copy.begin(), A_copy.end(), [](T &val) {val = std::abs(val); }); + std::for_each(B_copy.begin(), B_copy.end(), [](T &val) {val = std::abs(val); }); + vector E(m*n, 0.0); + T err_alpha = 2 * k * std::numeric_limits::epsilon(); + blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::NoTrans, m, n, k, err_alpha, A_copy.data(), lda, B_copy.data(), ldb, (T)0.0, E.data(), m); + test::comparison::buffs_approx_equal(C_work.data(), C, E.data(), m*n, __PRETTY_FUNCTION__, __FILE__, __LINE__); + return; +} + +template +void verify_orthonormal_componentwise(int m, int n, T* Q, T max_cond = 10) { + std::vector I(n*n, 0.0); + for (int i = 0; i < n; ++i) + I[i + i*n] = 1.0; + std::vector QtQ(n*n, 0.0); + T tol = max_cond * std::sqrt(m) * std::numeric_limits::epsilon(); + blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans, n, n, m, (T) 1.0, Q, m, Q, m, (T) 0.0, QtQ.data(), n); + test::comparison::matrices_approx_equal(blas::Layout::ColMajor, blas::Op::NoTrans, n, n, QtQ.data(), n, I.data(), n, + __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol + ); + return; +} + +// MARK: QR + +class TestHandrolledQR : public ::testing::Test { + protected: + + template + void run_cholqr_gaussian(int m, int n, int b, uint32_t key) { + DenseDist D(m, n, DenseDistName::Gaussian); + std::vector A(m*n); + T iso_scale = RandBLAS::isometry_scale_factor(D); + RNGState state(key); + RandBLAS::fill_dense(D, A.data(), state); + blas::scal(m*n, iso_scale, A.data(), 1); + + std::vector Q(A); + std::vector R(2*n*n); + hr_lapack::chol_qr(m, n, Q.data(), R.data(), b, true); + verify_orthonormal_componentwise(m, n, Q.data()); + verify_product(m, n, n, Q.data(), R.data(), A.data()); + } + + template + void run_qr_blocked_cgs(int m, int n, int b, uint32_t key) { + DenseDist D(m, n, DenseDistName::Gaussian); + std::vector A(m*n); + T iso_scale = RandBLAS::isometry_scale_factor(D); + RNGState state(key); + RandBLAS::fill_dense(D, A.data(), state); + blas::scal(m*n, iso_scale, A.data(), 1); + + std::vector Q(A); + std::vector R(n*n); + std::vector work{}; + hr_lapack::qr_block_cgs2(m, n, Q.data(), R.data(), work, b); + verify_orthonormal_componentwise(m, n, Q.data()); + verify_product(m, n, n, Q.data(), R.data(), A.data()); + } + +}; + +TEST_F(TestHandrolledQR, cholqr_small) { + for (uint32_t key = 111; key < 112; ++key) { + run_cholqr_gaussian(10, 1, 5, key); + run_cholqr_gaussian(2, 2, 5, key); + run_cholqr_gaussian(10, 6, 5, key); + + run_cholqr_gaussian(10, 1, 2, key); + 
run_cholqr_gaussian(10, 2, 2, key); + run_cholqr_gaussian(10, 6, 2, key); + } +} + +TEST_F(TestHandrolledQR, cholqr_medium) { + for (uint32_t key = 111; key < 112; ++key) { + run_cholqr_gaussian(1000, 100, 5, key); + run_cholqr_gaussian(1024, 128, 64, key); + } +} + +TEST_F(TestHandrolledQR, blocked_cgs_small) { + for (uint32_t key = 111; key < 112; ++key) { + run_qr_blocked_cgs(10, 5, 2, key); + run_qr_blocked_cgs(10, 7, 2, key); + run_qr_blocked_cgs(10, 10, 2, key); + } +} + +TEST_F(TestHandrolledQR, blocked_cgs_medium) { + for (uint32_t key = 111; key < 112; ++key) { + run_qr_blocked_cgs(1000, 1000, 64, key); + run_qr_blocked_cgs(1024, 1024, 64, key); + } +} + +// MARK: Eigenvalues + +template +std::vector posdef_with_random_eigvecs(std::vector &eigvals, uint32_t key) { + int64_t n = eigvals.size(); + for (auto ev : eigvals) + randblas_require(ev > 0); + std::vector work0(n*n, 0.0); + T* work0_buff = work0.data(); + DenseDist distn(n, n, DenseDistName::Gaussian); + RNGState state(key); + RandBLAS::fill_dense(distn, work0_buff, state); + std::vector work1(n*n, 0.0); + std::vector work2{}; + hr_lapack::qr_block_cgs2(n, n, work0_buff, work1.data(), work2); + for (int i = 0; i < n; ++i) + blas::scal(n, std::sqrt(eigvals[i]), work0_buff + i*n, 1); + std::vector out(n*n, 0.0); + blas::syrk(blas::Layout::ColMajor, blas::Uplo::Upper, blas::Op::NoTrans, n, n, (T)1.0, work0_buff, n, (T)0.0, out.data(), n); + RandBLAS::util::symmetrize(blas::Layout::ColMajor, blas::Uplo::Upper, n, out.data(), n); + return out; +} + +class TestHandrolledEigvals : public ::testing::Test { + protected: + + template + void run_diag(int n, int b) { + std::vector A(n*n, 0.0); + std::vector eigvals_expect(n); + for (int i = 0; i < n; ++i) { + eigvals_expect[i] = 1.0 / (1 + (T)i); + A[i + i*n] = eigvals_expect[i]; + } + std::vector eigvals_actual(n); + int64_t iters = 1; + T tol = 1e-3; + auto iter = hr_lapack::posdef_eig_chol_iteration(n, A.data(), eigvals_actual.data(), tol, iters, b); + ASSERT_EQ(iter, 0); + test::comparison::buffs_approx_equal(eigvals_actual.data(), eigvals_expect.data(), n, __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); + return; + } + + template + void run_general_posdef(int n, int b, uint32_t key) { + std::vector eigvals_expect(n); + for (int i = 0; i < n; ++i) { + eigvals_expect[i] = 1.0 / (1 + (T)i); + } + auto A = posdef_with_random_eigvecs(eigvals_expect, key); + std::vector eigvals_actual(n); + int64_t iters = 1000; + T tol = 1e-3; + auto iter = hr_lapack::posdef_eig_chol_iteration(n, A.data(), eigvals_actual.data(), tol, iters, b); + std::cout << "Number of QR iterations : " << iter << std::endl; + T min_eig_actual = *std::min_element(eigvals_actual.begin(), eigvals_actual.end()); + T max_eig_actual = *std::max_element(eigvals_actual.begin(), eigvals_actual.end()); + std::cout << "min_comp / min_actual " << min_eig_actual / eigvals_expect[n-1] << std::endl; + std::cout << "max_comp / max_actual " << max_eig_actual / eigvals_expect[0] << std::endl; + test::comparison::approx_equal(min_eig_actual, eigvals_expect[n-1], __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); + test::comparison::approx_equal(max_eig_actual, eigvals_expect[0 ], __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); + return; + } +}; + +TEST_F(TestHandrolledEigvals, diag) { + run_diag(5, 2); + run_diag(10, 5); + run_diag(100, 10); +} + +TEST_F(TestHandrolledEigvals, general_posdef_smallish) { + run_general_posdef( 5, 2, 0); + run_general_posdef( 10, 2, 0); + run_general_posdef(100, 10, 0); +} + 
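// NOTE: posdef_with_random_eigvecs above builds an SPD test matrix with a prescribed spectrum:
// it orthonormalizes a Gaussian matrix with qr_block_cgs2 to get Q, scales column i of Q by
// sqrt(eigvals[i]), and the SYRK then yields sum_i eigvals[i] * q_i q_i' = Q diag(eigvals) Q'.
// The Cholesky iteration exercised by these tests preserves that spectrum at every step: if
// A_k = R'R, the next iterate R R' = R (R'R) R^{-1} is similar to A_k. Convergence is declared
// through Gershgorin discs: every eigenvalue of the symmetric Gram matrix lies in some disc
// centered at a diagonal entry with radius equal to the sum of the absolute off-diagonal entries
// in that row, so once the two extremal discs have small relative radius, the extremal diagonal
// entries pin down lambda_max and lambda_min to roughly that relative accuracy.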
+TEST_F(TestHandrolledEigvals, general_posdef_medium) { + // 1000 or bigger is way too slow. + run_general_posdef( 512, 64, 0); + run_general_posdef( 512, 64, 1); +} + diff --git a/test/test_matmul_wrappers/test_sketch_symmetric.cc b/test/test_matmul_wrappers/test_sketch_symmetric.cc index 04298ae8..47594f2a 100644 --- a/test/test_matmul_wrappers/test_sketch_symmetric.cc +++ b/test/test_matmul_wrappers/test_sketch_symmetric.cc @@ -54,7 +54,7 @@ void random_symmetric_mat(int64_t n, T* A, int64_t lda, STATE s) { // whose entries in the upper triangle are iid, then symmetrizing that matrix, then // zeroing out all entries outside the leading principal submatrix of order n. RandBLAS::fill_dense(Layout::ColMajor, {lda, lda}, n, n, 0, 0, A, s); - RandBLAS::util::symmetrize(Layout::ColMajor, Uplo::Upper, A, n, lda); + RandBLAS::util::symmetrize(Layout::ColMajor, Uplo::Upper, n, A, lda); return; } From 3378c9d3ba41d9d59769001bcc961a51fddbbc9b Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 9 Aug 2024 16:13:09 -0400 Subject: [PATCH 27/37] power method is FINE --- test/handrolled_lapack.hh | 195 +++++++++++++++++++++++---------- test/test_handrolled_lapack.cc | 121 ++++++++++++++++++-- 2 files changed, 251 insertions(+), 65 deletions(-) diff --git a/test/handrolled_lapack.hh b/test/handrolled_lapack.hh index 859fcff3..af5812ac 100644 --- a/test/handrolled_lapack.hh +++ b/test/handrolled_lapack.hh @@ -2,6 +2,7 @@ #pragma once #include "RandBLAS/util.hh" +#include "RandBLAS/dense_skops.hh" #include #include @@ -11,14 +12,15 @@ namespace hr_lapack { template -void potrf_upper_sequential(int64_t n, T* A, int64_t lda) { +int potrf_upper_sequential(int64_t n, T* A, int64_t lda) { // Cache access is much better if the matrix is lower triangular. // Could implement as lower triangular and then call transpose_square. for (int64_t j = 0; j < n; ++j) { if (A[j + j * lda] <= 0) { - std::stringstream s; - s << "Cholesky failed at index " << j << " of " << n << "."; - throw std::runtime_error(s.str()); + // std::stringstream s; + std::cout << "Cholesky failed at index " << j << " of " << n << "."; + // throw std::runtime_error(s.str()); + return j+1; } A[j + j * lda] = std::sqrt(A[j + j * lda]); for (int64_t i = j + 1; i < n; ++i) { @@ -30,17 +32,22 @@ void potrf_upper_sequential(int64_t n, T* A, int64_t lda) { } } } + return 0; } template -void potrf_upper(int64_t n, T* A, int64_t lda, int64_t b = 64) { +int potrf_upper(int64_t n, T* A, int64_t lda, int64_t b = 64) { randblas_require(b > 0); auto layout = blas::Layout::ColMajor; auto uplo = blas::Uplo::Upper; int64_t curr_b = std::min(b, n); // A = [A11, A12] // [* , A22] - potrf_upper_sequential(curr_b, A, lda); + int code = potrf_upper_sequential(curr_b, A, lda); + if (code != 0) { + std::cout << "Matrix indefinite. Returning early from potrf_upper."; + return code; + } // A = [R11, A12] // [* , A22] if (curr_b < n) { @@ -56,7 +63,7 @@ void potrf_upper(int64_t n, T* A, int64_t lda, int64_t b = 64) { ); potrf_upper(n-curr_b, A22, lda, b); } - return; + return 0; } // If twice=true, then R will also be used as workspace, and must have length at least 2*n*n. @@ -170,53 +177,6 @@ bool extremal_eigvals_converged_gershgorin(int64_t n, T* G, T tol) { return converged; } -/** - * Use QR iteration to compute all eigenvalues of a positive definite matrix A. - * Run for at most "max_iters" iteration. - * - * In each iteration we'll perform some non-standard work to estimate the accuracy of - * the eigenvalues. 
To explain, suppose "R" is our current triangular factor. We'll - * compute the Gram matrix G = R'R. The eigenvalues of G are the squares of the - * singular values of R, and the singular values of R are equal to the eigenvaleus of A. - * Therefore our estimate for the eigenvalues of A will be the square root of diag(G). - * We can terminate the algorithm once the relative radii of G's Gershgorin discs - * fall below reltol^2. - * - * - */ -template -int64_t posdef_eig_qr_iteration(int64_t n, T* A, T* eigvals, T reltol, int64_t max_iters, int64_t b = 8) { - b = std::min(b, n); - - int64_t subroutine_work_size = n*n + std::max(n, 2*b) * b; - std::vector work(n*n + subroutine_work_size); - T* subroutine_work = work.data(); - T* G = subroutine_work; - T* R = subroutine_work + subroutine_work_size; - - using blas::Op; - using blas::Layout; - using blas::Uplo; - - T sq_reltol = reltol * reltol; - int64_t iter = 0; - bool converged = false; - for (; iter < max_iters; ++iter) { - qr_block_cgs2(n, n, A, R, work, b); - blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, n, (T) 1.0, R, n, (T) 0.0, G, n); - RandBLAS::util::symmetrize(Layout::ColMajor, Uplo::Upper, n, G, n); - for (int64_t i = 0; i < n; ++i) - eigvals[i] = std::sqrt(G[i * n + i]); - converged = extremal_eigvals_converged_gershgorin(n, G, sq_reltol); - if (converged) - break; - // Update A = R * Q - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, n, n, n, (T) 1.0, R, n, A, n, (T) 0.0, G, n); - blas::copy(n * n, G, 1, A, 1); - } - return (converged) ? iter : -iter; -} - /** * Use Cholesky iteration to compute all eigenvalues of a positive definite matrix A. * Run for at most "max_iters" iteration. @@ -232,9 +192,11 @@ int64_t posdef_eig_chol_iteration(int64_t n, T* A, T* eigvals, T reltol, int64_t using blas::Op; using blas::Layout; using blas::Uplo; - + // int64_t n, const T* A, int64_t &k, int64_t* S, T* F, int64_t b, STATE state int64_t iter = 0; bool converged = false; + RandBLAS::RNGState state(1234567); + std::vector pivots(n, 0); for (; iter < max_iters; ++iter) { potrf_upper(n, A, n, b); RandBLAS::util::overwrite_triangle(Layout::ColMajor, Uplo::Lower, n, 1, (T) 0.0, A, n); @@ -250,4 +212,127 @@ int64_t posdef_eig_chol_iteration(int64_t n, T* A, T* eigvals, T reltol, int64_t return (converged) ? iter : -iter; } +template +T power_method(int64_t n, FUNC &A, T* v, T tol, RandBLAS::RNGState state) { + RandBLAS::fill_dense(blas::Layout::ColMajor, {n, 1}, n, 1, 0, 0, v, state); + std::vector work(n, 0.0); + T* u = work.data(); + T norm = blas::nrm2(n, v, 1); + blas::scal(n, (T)1.0/norm, v, 1); + T lambda = 0.0; + T pi = 4*std::atan(1.0); + int64_t max_iter = (int64_t) std::ceil(( 1.0 + std::log(std::sqrt(pi * (T)n)) )/ tol ); + for (int64_t iter = 0; iter < max_iter; ++iter) { + A(v, u); + lambda = blas::dot(n, v, 1, u, 1); + blas::copy(n, u, 1, v, 1); + norm = blas::nrm2(n, v, 1); + blas::scal(n, (T)1.0/norm, v, 1); + } + return lambda; +} + +/** + * @brief Lanczos Algorithm to find the largest eigenvalue of a symmetric positive definite matrix. + * + * @tparam scalar_t a real scalar type. + * @param A a callable, where an evaluation of the form A(x, y) overwrites y = A*x. + * @param n The order of the matrix (number of rows and columns). + * @param m The number of Lanczos iterations to perform. + * @param eigenvector Pointer to the output vector that will store the approximated eigenvector corresponding to the largest eigenvalue. 
+ * @param max_iter The maximum number of iterations for the power method used to compute the largest eigenvalue of the tridiagonal matrix (default is 1000). + * @param tol The tolerance for convergence of the power method (default is 1e-6). + * @return The approximated largest eigenvalue of the matrix. + * + * @endcode + */ +template +std::pair lanczos(int64_t n, FUNC &A, int64_t lan_iter, int64_t pow_iter, real_t tol = 1e-6) { + // Allocate workspace + std::vector workspace(4 * n + 4 * lan_iter + 1 + n*(lan_iter+1), 0.0); + real_t* u = workspace.data(); + real_t* v = u + n; + real_t* w = v + n; + real_t* alpha = w + n; + real_t* beta = alpha + lan_iter; + real_t* b0 = beta + lan_iter + 1; + real_t* b1 = b0 + lan_iter; + real_t* Q = b1 + lan_iter; + + auto normalize = [](int64_t k, real_t* vec) { + real_t scale = ((real_t) 1.0) / blas::nrm2(k, vec, 1); + blas::scal(k, scale, vec, 1); + }; + + RandBLAS::RNGState state(8739); + auto next_state = RandBLAS::fill_dense(blas::Layout::ColMajor, {n, 1}, n, 1, 0, 0, w, state); + normalize(n, w); + blas::copy(n, w, 1, Q, 1); + A(w, v); // v = Aw + alpha[0] = blas::dot(n, w, 1, v, 1); // alpha[0] = w'v + for (int i = 0; i < n; ++i) { v[i] -= alpha[0]*w[i]; } // v = v - alpha[0] w + beta[0] = blas::nrm2(n, v, 1); // beta[0] = ||v|| + int64_t k = 0; + while (k < lan_iter && beta[k] >= tol) { + for (int i = 0; i < n; ++i) { real_t t = w[i]; w[i] = v[i] / beta[k]; v[i] = -beta[k] * t; } + A(w, u); + for (int i = 0; i < n; ++i) { v[i] += u[i]; } + k = k + 1; + alpha[k] = blas::dot(n, w, 1, v, 1); + for (int i = 0; i < n; ++i) { v[i] -= alpha[k]*w[i]; } + // + // Update the Lanczos vectors; using complete reorthogonalization with modified Gram-Schmidt + // + real_t* q_k = Q + (k-1)*n; + blas::copy(n, w, 1, q_k, 1); + real_t norm_w = blas::nrm2(n, q_k, 1); + blas::scal(n, (real_t)1.0 / norm_w, q_k, 1); + // update q_k = (I - Q_k Q_k') q_k, where Q_k = Q[:, 1:(k-1)] + // u = Q_k' q_k + // q_k -= Q_k u + blas::gemv(blas::Layout::ColMajor, blas::Op::Trans, n, k-1, (real_t) 1.0, Q, n, q_k, 1, (real_t) 0.0, u, 1); + blas::gemv(blas::Layout::ColMajor, blas::Op::NoTrans, n, k-1, (real_t) -1.0, Q, n, u, 1, (real_t) 1.0, q_k, 1); + // update v = (I - Q_{k+1} Q_{k+1}') v, which should do nothing in exact arithmetic. + blas::gemv(blas::Layout::ColMajor, blas::Op::Trans, n, k, 1.0, Q, n, v, 1, (real_t) 0.0, u, 1); + blas::gemv(blas::Layout::ColMajor, blas::Op::NoTrans, n, k, -1.0, Q, n, u, 1, (real_t) 1.0, v, 1); + beta[k] = blas::nrm2(n, v, 1); + } + lan_iter = k; + + auto T_func = [lan_iter, alpha, beta](const real_t* x, real_t* y) { + // Apply the tridiagonal matrix defined by (alpha, beta) to + // the vector x and store the result in y. 
+ std::fill(y, y + lan_iter, 0.0); + for (int64_t i = 0; i < lan_iter; ++i) { + y[i] += alpha[i] * x[i]; + if (i > 0) { + y[i] += beta[i - 1] * x[i - 1]; + } + if (i < lan_iter - 1) { + y[i] += beta[i] * x[i + 1]; + } + } + return; + }; + + // Compute the largest lambda of the tridiagonal matrix T + // For simplicity, we use the power method on T + RandBLAS::fill_dense(blas::Layout::ColMajor, {lan_iter, 1}, lan_iter, 1, 0, 0, b0, next_state); + normalize(lan_iter, b0); + real_t lambda = 0.0; + int64_t iter = 0; + for (; iter < pow_iter; ++iter) { + T_func(b0, b1); + real_t lambda_next = blas::dot(lan_iter, b0, 1, b1, 1); + if (std::abs(lambda_next - lambda) < tol) { + lambda = lambda_next; + break; + } + lambda = lambda_next; + blas::copy(lan_iter, b1, 1, b0, 1); + normalize(lan_iter, b0); + } + return {lambda, iter}; +} + } diff --git a/test/test_handrolled_lapack.cc b/test/test_handrolled_lapack.cc index 081701ee..fd784af8 100644 --- a/test/test_handrolled_lapack.cc +++ b/test/test_handrolled_lapack.cc @@ -289,7 +289,7 @@ class TestHandrolledEigvals : public ::testing::Test { void run_general_posdef(int n, int b, uint32_t key) { std::vector eigvals_expect(n); for (int i = 0; i < n; ++i) { - eigvals_expect[i] = 1.0 / (1 + (T)i); + eigvals_expect[i] = 2.0 / (1 + (T)i); } auto A = posdef_with_random_eigvecs(eigvals_expect, key); std::vector eigvals_actual(n); @@ -305,23 +305,124 @@ class TestHandrolledEigvals : public ::testing::Test { test::comparison::approx_equal(max_eig_actual, eigvals_expect[0 ], __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); return; } + + template + void run_lanczos_general_posdef(int n, uint32_t key) { + std::vector eigvals_expect(n); + for (int i = 0; i < n; ++i) { + eigvals_expect[i] = 2.0 / std::sqrt((1 + (T)i)); + } + auto _A = posdef_with_random_eigvecs(eigvals_expect, key); + + T* A_buff = _A.data(); + hr_lapack::potrf_upper(n, A_buff, n); + auto layout = blas::Layout::ColMajor; + auto uplo = blas::Uplo::Upper; + auto diag = blas::Diag::NonUnit; + auto A = [layout, uplo, diag, A_buff, n](const T* x, T* y) { + blas::copy(n, x, 1, y, 1); + blas::trmv(layout, uplo, blas::Op::NoTrans, diag, n, A_buff, n, y, 1); + blas::trmv(layout, uplo, blas::Op::Trans, diag, n, A_buff, n, y, 1); + return; + }; + auto invA = [layout, uplo, diag, A_buff, n](const T* x, T* y) { + blas::copy(n, x, 1, y, 1); + blas::trsv(layout, uplo, blas::Op::Trans, diag, n, A_buff, n, y, 1); + blas::trsv(layout, uplo, blas::Op::NoTrans, diag, n, A_buff, n, y, 1); + return; + }; + + int64_t lan_iter = n; + int64_t pow_iter = 5*n; + auto [lambda_max, iters0] = hr_lapack::lanczos(n, A, lan_iter, pow_iter); + auto [lambda_min, iters1] = hr_lapack::lanczos(n, invA, lan_iter, pow_iter); + lambda_min = 1.0/lambda_min; + + T tol = 1e-3; + std::cout << "Iters for max: " << iters0 << ". 
Iters for min: " << iters1 << ".\n"; + std::cout << "min_comp / min_actual = " << lambda_min / eigvals_expect[n-1] << std::endl; + std::cout << "max_comp / max_actual = " << lambda_max / eigvals_expect[0] << std::endl; + test::comparison::approx_equal(lambda_min, eigvals_expect[n-1], __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); + test::comparison::approx_equal(lambda_max, eigvals_expect[0 ], __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); + return; + } + + template + void run_power_general_posdef(int n, uint32_t key) { + std::vector eigvals_expect(n); + for (int i = 0; i < n; ++i) { + eigvals_expect[i] = 2.0 / std::sqrt((1 + (T)i)); + } + auto _A = posdef_with_random_eigvecs(eigvals_expect, key); + + std::vector A_copy(_A); + T* A_buff = A_copy.data(); + auto layout = blas::Layout::ColMajor; + auto A = [layout, A_buff, n](const T* x, T* y) { + blas::gemv(layout, blas::Op::NoTrans, n, n, 1.0, A_buff, n, x, 1, 0.0, y, 1); + return; + }; + + auto uplo = blas::Uplo::Upper; + auto diag = blas::Diag::NonUnit; + hr_lapack::potrf_upper(n, _A.data(), n); // A = R'R + std::vector _invA(n*n, 0.0); + for (int i = 0; i < n; ++i) + _invA[i + i*n] = 1.0; + blas::trsm(layout, blas::Side::Left, uplo, blas::Op::Trans, diag, n, n, 1.0, _A.data(), n, _invA.data(), n); + // ^ since we applied the inverse of a transposed upper-triangular, we have a lower-triangular result. + // need to zero out the upper triangle for the next step. + RandBLAS::util::overwrite_triangle(layout, blas::Uplo::Upper, n, 1, 0.0, _invA.data(), n); + blas::trsm(layout, blas::Side::Left, uplo, blas::Op::NoTrans, diag, n, n, 1.0, _A.data(), n, _invA.data(), n); + T* invA_buff = _invA.data(); + auto invA = [layout, invA_buff, n](const T* x, T* y) { + blas::gemv(layout, blas::Op::NoTrans, n, n, 1.0, invA_buff, n, x, 1, 0.0, y, 1); + return; + }; + + std::vector work(n, 0.0); + T tol = 1e-3; + RandBLAS::RNGState state(key + 1); + auto lambda_max = hr_lapack::power_method(n, A, work.data(), tol, state); + auto lambda_min = hr_lapack::power_method(n, invA, work.data(), tol, state); + lambda_min = 1.0/lambda_min; + + std::cout << "min_comp / min_actual = " << lambda_min / eigvals_expect[n-1] << std::endl; + std::cout << "max_comp / max_actual = " << lambda_max / eigvals_expect[0] << std::endl; + test::comparison::approx_equal(lambda_min, eigvals_expect[n-1], __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); + test::comparison::approx_equal(lambda_max, eigvals_expect[0 ], __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); + return; + } }; +TEST_F(TestHandrolledEigvals, power_smallish) { + run_power_general_posdef(10, 0); + run_power_general_posdef(50, 0); + run_power_general_posdef(100, 0); +} + TEST_F(TestHandrolledEigvals, diag) { - run_diag(5, 2); - run_diag(10, 5); - run_diag(100, 10); + run_diag(5, 1); + run_diag(10, 1); + run_diag(100, 1); } TEST_F(TestHandrolledEigvals, general_posdef_smallish) { - run_general_posdef( 5, 2, 0); - run_general_posdef( 10, 2, 0); - run_general_posdef(100, 10, 0); + run_general_posdef( 5, 1, 0); + run_general_posdef( 10, 1, 0); + run_general_posdef(100, 1, 0); } -TEST_F(TestHandrolledEigvals, general_posdef_medium) { +TEST_F(TestHandrolledEigvals, power_medium) { // 1000 or bigger is way too slow. - run_general_posdef( 512, 64, 0); - run_general_posdef( 512, 64, 1); + run_power_general_posdef( 512, 0); + run_power_general_posdef( 512, 1); } +TEST_F(TestHandrolledEigvals, power_largish) { + // 1000 or bigger is way too slow. 
+ run_power_general_posdef( 1024, 0); + run_power_general_posdef( 1024, 1); +} + + From 7648683a7cd5c712c45fb3cc0ab3e7644a8019fb Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 9 Aug 2024 16:26:12 -0400 Subject: [PATCH 28/37] remove awful lanczos implementaiton. Power method suffices in our context. --- RandBLAS/util.hh | 3 +- test/handrolled_lapack.hh | 109 +-------------------------------- test/test_handrolled_lapack.cc | 67 +++----------------- 3 files changed, 11 insertions(+), 168 deletions(-) diff --git a/RandBLAS/util.hh b/RandBLAS/util.hh index 5ad91711..111a9e5a 100644 --- a/RandBLAS/util.hh +++ b/RandBLAS/util.hh @@ -59,8 +59,7 @@ void safe_scal(int64_t n, T a, T* x, int64_t inc_x) { } template -void print_colmaj(int64_t n_rows, int64_t n_cols, T *a, const char label[]) -{ +void print_colmaj(int64_t n_rows, int64_t n_cols, T *a, const char label[]) { int64_t i, j; T val; std::cout << "\n" << label << std::endl; diff --git a/test/handrolled_lapack.hh b/test/handrolled_lapack.hh index af5812ac..1c47aee6 100644 --- a/test/handrolled_lapack.hh +++ b/test/handrolled_lapack.hh @@ -17,9 +17,7 @@ int potrf_upper_sequential(int64_t n, T* A, int64_t lda) { // Could implement as lower triangular and then call transpose_square. for (int64_t j = 0; j < n; ++j) { if (A[j + j * lda] <= 0) { - // std::stringstream s; - std::cout << "Cholesky failed at index " << j << " of " << n << "."; - // throw std::runtime_error(s.str()); + std::cout << "Cholesky failed at index " << (j+1) << " of " << n << "."; return j+1; } A[j + j * lda] = std::sqrt(A[j + j * lda]); @@ -121,7 +119,6 @@ void qr_block_cgs(int64_t m, int64_t n, T* A, T* R, int64_t ldr, T* work, int64_ return; } - // We'll resize bigwork to be length at least (n*n + max(n, 2*b) * b). template void qr_block_cgs2(int64_t m, int64_t n, T* A, T* R, std::vector &bigwork, int64_t b = 64) { @@ -192,7 +189,6 @@ int64_t posdef_eig_chol_iteration(int64_t n, T* A, T* eigvals, T reltol, int64_t using blas::Op; using blas::Layout; using blas::Uplo; - // int64_t n, const T* A, int64_t &k, int64_t* S, T* F, int64_t b, STATE state int64_t iter = 0; bool converged = false; RandBLAS::RNGState state(1234567); @@ -232,107 +228,4 @@ T power_method(int64_t n, FUNC &A, T* v, T tol, RandBLAS::RNGState state) { return lambda; } -/** - * @brief Lanczos Algorithm to find the largest eigenvalue of a symmetric positive definite matrix. - * - * @tparam scalar_t a real scalar type. - * @param A a callable, where an evaluation of the form A(x, y) overwrites y = A*x. - * @param n The order of the matrix (number of rows and columns). - * @param m The number of Lanczos iterations to perform. - * @param eigenvector Pointer to the output vector that will store the approximated eigenvector corresponding to the largest eigenvalue. - * @param max_iter The maximum number of iterations for the power method used to compute the largest eigenvalue of the tridiagonal matrix (default is 1000). - * @param tol The tolerance for convergence of the power method (default is 1e-6). - * @return The approximated largest eigenvalue of the matrix. 
- * - * @endcode - */ -template -std::pair lanczos(int64_t n, FUNC &A, int64_t lan_iter, int64_t pow_iter, real_t tol = 1e-6) { - // Allocate workspace - std::vector workspace(4 * n + 4 * lan_iter + 1 + n*(lan_iter+1), 0.0); - real_t* u = workspace.data(); - real_t* v = u + n; - real_t* w = v + n; - real_t* alpha = w + n; - real_t* beta = alpha + lan_iter; - real_t* b0 = beta + lan_iter + 1; - real_t* b1 = b0 + lan_iter; - real_t* Q = b1 + lan_iter; - - auto normalize = [](int64_t k, real_t* vec) { - real_t scale = ((real_t) 1.0) / blas::nrm2(k, vec, 1); - blas::scal(k, scale, vec, 1); - }; - - RandBLAS::RNGState state(8739); - auto next_state = RandBLAS::fill_dense(blas::Layout::ColMajor, {n, 1}, n, 1, 0, 0, w, state); - normalize(n, w); - blas::copy(n, w, 1, Q, 1); - A(w, v); // v = Aw - alpha[0] = blas::dot(n, w, 1, v, 1); // alpha[0] = w'v - for (int i = 0; i < n; ++i) { v[i] -= alpha[0]*w[i]; } // v = v - alpha[0] w - beta[0] = blas::nrm2(n, v, 1); // beta[0] = ||v|| - int64_t k = 0; - while (k < lan_iter && beta[k] >= tol) { - for (int i = 0; i < n; ++i) { real_t t = w[i]; w[i] = v[i] / beta[k]; v[i] = -beta[k] * t; } - A(w, u); - for (int i = 0; i < n; ++i) { v[i] += u[i]; } - k = k + 1; - alpha[k] = blas::dot(n, w, 1, v, 1); - for (int i = 0; i < n; ++i) { v[i] -= alpha[k]*w[i]; } - // - // Update the Lanczos vectors; using complete reorthogonalization with modified Gram-Schmidt - // - real_t* q_k = Q + (k-1)*n; - blas::copy(n, w, 1, q_k, 1); - real_t norm_w = blas::nrm2(n, q_k, 1); - blas::scal(n, (real_t)1.0 / norm_w, q_k, 1); - // update q_k = (I - Q_k Q_k') q_k, where Q_k = Q[:, 1:(k-1)] - // u = Q_k' q_k - // q_k -= Q_k u - blas::gemv(blas::Layout::ColMajor, blas::Op::Trans, n, k-1, (real_t) 1.0, Q, n, q_k, 1, (real_t) 0.0, u, 1); - blas::gemv(blas::Layout::ColMajor, blas::Op::NoTrans, n, k-1, (real_t) -1.0, Q, n, u, 1, (real_t) 1.0, q_k, 1); - // update v = (I - Q_{k+1} Q_{k+1}') v, which should do nothing in exact arithmetic. - blas::gemv(blas::Layout::ColMajor, blas::Op::Trans, n, k, 1.0, Q, n, v, 1, (real_t) 0.0, u, 1); - blas::gemv(blas::Layout::ColMajor, blas::Op::NoTrans, n, k, -1.0, Q, n, u, 1, (real_t) 1.0, v, 1); - beta[k] = blas::nrm2(n, v, 1); - } - lan_iter = k; - - auto T_func = [lan_iter, alpha, beta](const real_t* x, real_t* y) { - // Apply the tridiagonal matrix defined by (alpha, beta) to - // the vector x and store the result in y. 
- std::fill(y, y + lan_iter, 0.0); - for (int64_t i = 0; i < lan_iter; ++i) { - y[i] += alpha[i] * x[i]; - if (i > 0) { - y[i] += beta[i - 1] * x[i - 1]; - } - if (i < lan_iter - 1) { - y[i] += beta[i] * x[i + 1]; - } - } - return; - }; - - // Compute the largest lambda of the tridiagonal matrix T - // For simplicity, we use the power method on T - RandBLAS::fill_dense(blas::Layout::ColMajor, {lan_iter, 1}, lan_iter, 1, 0, 0, b0, next_state); - normalize(lan_iter, b0); - real_t lambda = 0.0; - int64_t iter = 0; - for (; iter < pow_iter; ++iter) { - T_func(b0, b1); - real_t lambda_next = blas::dot(lan_iter, b0, 1, b1, 1); - if (std::abs(lambda_next - lambda) < tol) { - lambda = lambda_next; - break; - } - lambda = lambda_next; - blas::copy(lan_iter, b1, 1, b0, 1); - normalize(lan_iter, b0); - } - return {lambda, iter}; -} - } diff --git a/test/test_handrolled_lapack.cc b/test/test_handrolled_lapack.cc index fd784af8..6f8a672c 100644 --- a/test/test_handrolled_lapack.cc +++ b/test/test_handrolled_lapack.cc @@ -306,47 +306,6 @@ class TestHandrolledEigvals : public ::testing::Test { return; } - template - void run_lanczos_general_posdef(int n, uint32_t key) { - std::vector eigvals_expect(n); - for (int i = 0; i < n; ++i) { - eigvals_expect[i] = 2.0 / std::sqrt((1 + (T)i)); - } - auto _A = posdef_with_random_eigvecs(eigvals_expect, key); - - T* A_buff = _A.data(); - hr_lapack::potrf_upper(n, A_buff, n); - auto layout = blas::Layout::ColMajor; - auto uplo = blas::Uplo::Upper; - auto diag = blas::Diag::NonUnit; - auto A = [layout, uplo, diag, A_buff, n](const T* x, T* y) { - blas::copy(n, x, 1, y, 1); - blas::trmv(layout, uplo, blas::Op::NoTrans, diag, n, A_buff, n, y, 1); - blas::trmv(layout, uplo, blas::Op::Trans, diag, n, A_buff, n, y, 1); - return; - }; - auto invA = [layout, uplo, diag, A_buff, n](const T* x, T* y) { - blas::copy(n, x, 1, y, 1); - blas::trsv(layout, uplo, blas::Op::Trans, diag, n, A_buff, n, y, 1); - blas::trsv(layout, uplo, blas::Op::NoTrans, diag, n, A_buff, n, y, 1); - return; - }; - - int64_t lan_iter = n; - int64_t pow_iter = 5*n; - auto [lambda_max, iters0] = hr_lapack::lanczos(n, A, lan_iter, pow_iter); - auto [lambda_min, iters1] = hr_lapack::lanczos(n, invA, lan_iter, pow_iter); - lambda_min = 1.0/lambda_min; - - T tol = 1e-3; - std::cout << "Iters for max: " << iters0 << ". 
Iters for min: " << iters1 << ".\n"; - std::cout << "min_comp / min_actual = " << lambda_min / eigvals_expect[n-1] << std::endl; - std::cout << "max_comp / max_actual = " << lambda_max / eigvals_expect[0] << std::endl; - test::comparison::approx_equal(lambda_min, eigvals_expect[n-1], __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); - test::comparison::approx_equal(lambda_max, eigvals_expect[0 ], __PRETTY_FUNCTION__, __FILE__, __LINE__, tol, tol); - return; - } - template void run_power_general_posdef(int n, uint32_t key) { std::vector eigvals_expect(n); @@ -359,7 +318,7 @@ class TestHandrolledEigvals : public ::testing::Test { T* A_buff = A_copy.data(); auto layout = blas::Layout::ColMajor; auto A = [layout, A_buff, n](const T* x, T* y) { - blas::gemv(layout, blas::Op::NoTrans, n, n, 1.0, A_buff, n, x, 1, 0.0, y, 1); + blas::gemv(layout, blas::Op::NoTrans, n, n, (T) 1.0, A_buff, n, x, 1, (T) 0.0, y, 1); return; }; @@ -369,14 +328,14 @@ class TestHandrolledEigvals : public ::testing::Test { std::vector _invA(n*n, 0.0); for (int i = 0; i < n; ++i) _invA[i + i*n] = 1.0; - blas::trsm(layout, blas::Side::Left, uplo, blas::Op::Trans, diag, n, n, 1.0, _A.data(), n, _invA.data(), n); + blas::trsm(layout, blas::Side::Left, uplo, blas::Op::Trans, diag, n, n, (T) 1.0, _A.data(), n, _invA.data(), n); // ^ since we applied the inverse of a transposed upper-triangular, we have a lower-triangular result. // need to zero out the upper triangle for the next step. - RandBLAS::util::overwrite_triangle(layout, blas::Uplo::Upper, n, 1, 0.0, _invA.data(), n); - blas::trsm(layout, blas::Side::Left, uplo, blas::Op::NoTrans, diag, n, n, 1.0, _A.data(), n, _invA.data(), n); + RandBLAS::util::overwrite_triangle(layout, blas::Uplo::Upper, n, 1, (T) 0.0, _invA.data(), n); + blas::trsm(layout, blas::Side::Left, uplo, blas::Op::NoTrans, diag, n, n, (T) 1.0, _A.data(), n, _invA.data(), n); T* invA_buff = _invA.data(); auto invA = [layout, invA_buff, n](const T* x, T* y) { - blas::gemv(layout, blas::Op::NoTrans, n, n, 1.0, invA_buff, n, x, 1, 0.0, y, 1); + blas::gemv(layout, blas::Op::NoTrans, n, n, (T) 1.0, invA_buff, n, x, 1, (T) 0.0, y, 1); return; }; @@ -407,22 +366,14 @@ TEST_F(TestHandrolledEigvals, diag) { run_diag(100, 1); } -TEST_F(TestHandrolledEigvals, general_posdef_smallish) { - run_general_posdef( 5, 1, 0); - run_general_posdef( 10, 1, 0); - run_general_posdef(100, 1, 0); -} - TEST_F(TestHandrolledEigvals, power_medium) { - // 1000 or bigger is way too slow. - run_power_general_posdef( 512, 0); - run_power_general_posdef( 512, 1); + run_power_general_posdef( 512, 0 ); + run_power_general_posdef( 512, 1 ); } TEST_F(TestHandrolledEigvals, power_largish) { - // 1000 or bigger is way too slow. 
- run_power_general_posdef( 1024, 0); - run_power_general_posdef( 1024, 1); + run_power_general_posdef( 1024, 0 ); + run_power_general_posdef( 1024, 1 ); } From c39d27dbcfc3fc5250a0a1f349ea7729c4fcdb03 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 9 Aug 2024 17:22:41 -0400 Subject: [PATCH 29/37] Make a convenient function for computing extremal eigenvalues of a positive definite matrix by power method with explicit inversion --- test/handrolled_lapack.hh | 52 ++++++++++++++++++++++++++++++++-- test/test_handrolled_lapack.cc | 38 ++++--------------------- 2 files changed, 55 insertions(+), 35 deletions(-) diff --git a/test/handrolled_lapack.hh b/test/handrolled_lapack.hh index 1c47aee6..96405877 100644 --- a/test/handrolled_lapack.hh +++ b/test/handrolled_lapack.hh @@ -11,6 +11,8 @@ namespace hr_lapack { +using RandBLAS::RNGState; + template int potrf_upper_sequential(int64_t n, T* A, int64_t lda) { // Cache access is much better if the matrix is lower triangular. @@ -209,8 +211,8 @@ int64_t posdef_eig_chol_iteration(int64_t n, T* A, T* eigvals, T reltol, int64_t } template -T power_method(int64_t n, FUNC &A, T* v, T tol, RandBLAS::RNGState state) { - RandBLAS::fill_dense(blas::Layout::ColMajor, {n, 1}, n, 1, 0, 0, v, state); +std::pair> power_method(int64_t n, FUNC &A, T* v, T tol, const RNGState &state) { + auto next_state = RandBLAS::fill_dense(blas::Layout::ColMajor, {n, 1}, n, 1, 0, 0, v, state); std::vector work(n, 0.0); T* u = work.data(); T norm = blas::nrm2(n, v, 1); @@ -225,7 +227,51 @@ T power_method(int64_t n, FUNC &A, T* v, T tol, RandBLAS::RNGState state) { norm = blas::nrm2(n, v, 1); blas::scal(n, (T)1.0/norm, v, 1); } - return lambda; + return {lambda, next_state}; +} + +template +std::tuple> extremal_eigvals_powermethod(int64_t n, const T* A, T* eigvecs, T tol, const RNGState &state, std::vector work) { + auto layout = blas::Layout::ColMajor; + RandBLAS::util::require_symmetric(layout, A, n, n, (T) 0.0); + + // Compute the dominant eigenpair. Nothing fancy here. + auto A_func = [layout, A, n](const T* x, T* y) { + blas::gemv(layout, blas::Op::NoTrans, n, n, (T) 1.0, A, n, x, 1, (T) 0.0, y, 1); + }; + auto [lambda_max, next_state] = power_method(n, A_func, eigvecs, tol, state); + + // To compute the smallest eigenpair we'll explicitly invert A. This requires + // 2n^2 workspace: n^2 workspace for Cholesky of A (since we don't want to destroy A) + // and another n^2 workspace for TRSMs with the Cholesky factor to get invA. + // + // Note: we *could* use less workspace if we were willing to access invA as a linear + // operator using two calls to TRSV when needed. But this ends up being much slower + // than explicit inversion for the values of (n, tol) that we care about. + // + if ((int64_t) work.size() < 2*n*n) + work.resize(2*n*n); + T* chol = work.data(); + blas::copy(n*n, A, 1, chol, 1); + potrf_upper(n, chol, n); + T* invA = chol + n*n; + std::fill(invA, invA + n*n, 0.0); + for (int i = 0; i < n; ++i) + invA[i + i*n] = 1.0; + auto uplo = blas::Uplo::Upper; + auto diag = blas::Diag::NonUnit; + blas::trsm(layout, blas::Side::Left, uplo, blas::Op::Trans, diag, n, n, (T) 1.0, chol, n, invA, n); + blas::trsm(layout, blas::Side::Left, uplo, blas::Op::NoTrans, diag, n, n, (T) 1.0, chol, n, invA, n); + + // Now that we have invA explicitly, getting its dominant eigenpair is effortless. 
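// (Why invert at all: for SPD A the spectrum of inv(A) is { 1/lambda_i }, so the dominant
//  eigenpair of inv(A) recovers lambda_min(A) after one reciprocal -- which is exactly what the
//  power_method call below does, followed by lambda_min = 1/lambda_min.)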
+ auto invA_func = [layout, invA, n](const T* x, T* y) { + blas::gemv(layout, blas::Op::NoTrans, n, n, (T) 1.0, invA, n, x, 1, (T) 0.0, y, 1); + return; + }; + auto [lambda_min, final_state] = power_method(n, invA_func, eigvecs + n, tol, next_state); + lambda_min = 1.0/lambda_min; + + return {lambda_max, lambda_min, final_state}; } } diff --git a/test/test_handrolled_lapack.cc b/test/test_handrolled_lapack.cc index 6f8a672c..3b4d449b 100644 --- a/test/test_handrolled_lapack.cc +++ b/test/test_handrolled_lapack.cc @@ -296,7 +296,7 @@ class TestHandrolledEigvals : public ::testing::Test { int64_t iters = 1000; T tol = 1e-3; auto iter = hr_lapack::posdef_eig_chol_iteration(n, A.data(), eigvals_actual.data(), tol, iters, b); - std::cout << "Number of QR iterations : " << iter << std::endl; + std::cout << "Number of iterations : " << iter << std::endl; T min_eig_actual = *std::min_element(eigvals_actual.begin(), eigvals_actual.end()); T max_eig_actual = *std::max_element(eigvals_actual.begin(), eigvals_actual.end()); std::cout << "min_comp / min_actual " << min_eig_actual / eigvals_expect[n-1] << std::endl; @@ -312,39 +312,13 @@ class TestHandrolledEigvals : public ::testing::Test { for (int i = 0; i < n; ++i) { eigvals_expect[i] = 2.0 / std::sqrt((1 + (T)i)); } - auto _A = posdef_with_random_eigvecs(eigvals_expect, key); + auto A = posdef_with_random_eigvecs(eigvals_expect, key); - std::vector A_copy(_A); - T* A_buff = A_copy.data(); - auto layout = blas::Layout::ColMajor; - auto A = [layout, A_buff, n](const T* x, T* y) { - blas::gemv(layout, blas::Op::NoTrans, n, n, (T) 1.0, A_buff, n, x, 1, (T) 0.0, y, 1); - return; - }; - - auto uplo = blas::Uplo::Upper; - auto diag = blas::Diag::NonUnit; - hr_lapack::potrf_upper(n, _A.data(), n); // A = R'R - std::vector _invA(n*n, 0.0); - for (int i = 0; i < n; ++i) - _invA[i + i*n] = 1.0; - blas::trsm(layout, blas::Side::Left, uplo, blas::Op::Trans, diag, n, n, (T) 1.0, _A.data(), n, _invA.data(), n); - // ^ since we applied the inverse of a transposed upper-triangular, we have a lower-triangular result. - // need to zero out the upper triangle for the next step. 
- RandBLAS::util::overwrite_triangle(layout, blas::Uplo::Upper, n, 1, (T) 0.0, _invA.data(), n); - blas::trsm(layout, blas::Side::Left, uplo, blas::Op::NoTrans, diag, n, n, (T) 1.0, _A.data(), n, _invA.data(), n); - T* invA_buff = _invA.data(); - auto invA = [layout, invA_buff, n](const T* x, T* y) { - blas::gemv(layout, blas::Op::NoTrans, n, n, (T) 1.0, invA_buff, n, x, 1, (T) 0.0, y, 1); - return; - }; - - std::vector work(n, 0.0); + std::vector ourwork(2*n, 0.0); + std::vector subwork{}; T tol = 1e-3; - RandBLAS::RNGState state(key + 1); - auto lambda_max = hr_lapack::power_method(n, A, work.data(), tol, state); - auto lambda_min = hr_lapack::power_method(n, invA, work.data(), tol, state); - lambda_min = 1.0/lambda_min; + RNGState state(key + 1); + auto [lambda_max, lambda_min, ignore] = hr_lapack::extremal_eigvals_powermethod(n, A.data(), ourwork.data(), tol, state, subwork); std::cout << "min_comp / min_actual = " << lambda_min / eigvals_expect[n-1] << std::endl; std::cout << "max_comp / max_actual = " << lambda_max / eigvals_expect[0] << std::endl; From cec167841dcf66cc0e7016f4b3282089c3a5a197 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Sat, 10 Aug 2024 12:59:39 -0400 Subject: [PATCH 30/37] passing tests for distortion of Gaussian operators --- RandBLAS/dense_skops.hh | 4 +- RandBLAS/util.hh | 2 + test/CMakeLists.txt | 4 +- test/handrolled_lapack.hh | 2 +- test/test_basic_rng/test_distortion.cc | 56 ++++++++++++++++++-------- test/test_handrolled_lapack.cc | 4 +- 6 files changed, 50 insertions(+), 22 deletions(-) diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index 887ed656..0ae9334f 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -501,7 +501,7 @@ struct DenseSkOp { /// A CBRNG state /// - Used to define \math{S} as a sample from \math{\D}. /// -template +template RNGState fill_dense( blas::Layout layout, const DenseDist &D, @@ -588,7 +588,7 @@ RNGState fill_dense( /// - If this function returns a layout that is undesirable then it is /// the caller's responsibility to perform a transpose as needed. /// -template +template RNGState fill_dense( const DenseDist &D, T *buff, diff --git a/RandBLAS/util.hh b/RandBLAS/util.hh index 111a9e5a..22242449 100644 --- a/RandBLAS/util.hh +++ b/RandBLAS/util.hh @@ -178,6 +178,8 @@ void require_symmetric(blas::Layout layout, const T* A, int64_t n, int64_t lda, std::string message = "Symmetry check failed. |A(%i,%i) - A(%i,%i)| was %d, which exceeds tolerance of %d."; auto _message = message.c_str(); randblas_error_if_msg(viol > rel_tol, _message, i, j, j, i, viol, rel_tol); + // ^ TODO: fix this macro. Apparently it doesn't print out all that I'd like. Example I just got: + // "Symmetry check failed. |A(0,1) - A(1,0)| was 1610612736, which exceeds toleranc, in function require_symmetric" thrown in the test body. } } } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 18a719b1..e48cd660 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -55,11 +55,13 @@ if (GTest_FOUND) test_basic_rng/test_r123.cc test_basic_rng/test_discrete.cc test_basic_rng/test_continuous.cc + test_basic_rng/test_distortion.cc ) target_link_libraries(RandBLAS_stats RandBLAS GTest::GTest GTest::Main) gtest_discover_tests(RandBLAS_stats) + file(COPY test_basic_rng/r123_kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + file(COPY test_basic_rng/r123_kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/..) 
file(COPY test_basic_rng/r123_kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/../test/) - file(COPY test_basic_rng/r123_kat_vectors.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/) ##################################################################### # diff --git a/test/handrolled_lapack.hh b/test/handrolled_lapack.hh index 96405877..b4a1f3fd 100644 --- a/test/handrolled_lapack.hh +++ b/test/handrolled_lapack.hh @@ -231,7 +231,7 @@ std::pair> power_method(int64_t n, FUNC &A, T* v, T tol, const } template -std::tuple> extremal_eigvals_powermethod(int64_t n, const T* A, T* eigvecs, T tol, const RNGState &state, std::vector work) { +std::tuple> exeigs_powermethod(int64_t n, const T* A, T* eigvecs, T tol, const RNGState &state, std::vector work) { auto layout = blas::Layout::ColMajor; RandBLAS::util::require_symmetric(layout, A, n, n, (T) 0.0); diff --git a/test/test_basic_rng/test_distortion.cc b/test/test_basic_rng/test_distortion.cc index a8fc1f02..84bae8b7 100644 --- a/test/test_basic_rng/test_distortion.cc +++ b/test/test_basic_rng/test_distortion.cc @@ -36,15 +36,12 @@ using RandBLAS::DenseDistName; using RandBLAS::RNGState; #include "rng_common.hh" +#include "../handrolled_lapack.hh" -#include #include #include #include -// function [N_] = get_min_dimension(p_, tau_) -// N_ = ceil( ((sqrt(2*log(1 ./ p_)) + 1) ./ tau_).^2 ); -// end int64_t get_min_dimension(double p, double tau) { double val = std::sqrt(-2 * std::log(p)) + 1; @@ -53,19 +50,11 @@ int64_t get_min_dimension(double p, double tau) { return (int64_t) std::ceil(val); } -// TODO: implement Cholesky and Krylov subspace method for computing condition number. -/* - - -*/ - class TestSubspaceDistortion : public ::testing::Test { protected: template - void run_gaussian( - T distortion, T tau, T p_fail_bound, uint32_t key - ) { + void run_gaussian(T distortion, T tau, T p_fail_bound, uint32_t key) { /** * Generate a d-by-N Gaussian matrix, where d = gamma*N, * gamma = (r/delta)^2, and N is the smallest integer where n > N implies @@ -97,12 +86,47 @@ class TestSubspaceDistortion : public ::testing::Test { * N = ceil( ( [sqrt(2*log(1/p)) + 1] / tau )^2 ) * */ + auto layout = blas::Layout::ColMajor; int64_t N = get_min_dimension(p_fail_bound, tau); - int64_t d = std::ceil( std::pow((1 + tau) / distortion, 2) ); + int64_t d = std::ceil( std::pow((1 + tau) / distortion, 2) * N ); DenseDist D(d, N, DenseDistName::Gaussian); std::vector S(d*N); - RandBLAS::fill_dense(D, S.data(), {key}); - // need LAPACK ... 
+ std::cout << "(d, N) = ( " << d << ", " << N << " )\n"; + RandBLAS::RNGState state(key); + auto next_state = RandBLAS::fill_dense(D, S.data(), state); + blas::scal(d*N, (T)1.0/std::sqrt(d), S.data(), 1); + std::vector G(N*N, 0.0); + blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, N, d, (T)1.0, S.data(), d, (T)0.0, G.data(), N); + RandBLAS::util::symmetrize(layout, blas::Uplo::Upper, N, G.data(), N); + + std::vector eigvecs(2*N, 0.0); + std::vector subwork{}; + auto [lambda_max, lambda_min, ignore] = hr_lapack::exeigs_powermethod(N, G.data(), eigvecs.data(), (T) 1e-2, state, subwork); + T sigma_max = std::sqrt(lambda_max); + T sigma_min = std::sqrt(lambda_min); + ASSERT_LE(sigma_max, 1+distortion); + ASSERT_GE(sigma_min, 1-distortion); return; } }; + +TEST_F(TestSubspaceDistortion, gaussian_float_tau_100_fail_00001) { + uint32_t key = 8673309; + float p_fail = 1e-3; + for (uint32_t i = 0; i < 3; ++i ) { + run_gaussian(0.50f, 1.0f, p_fail, key + i); + run_gaussian(0.25f, 1.0f, p_fail, key + i); + run_gaussian(0.10f, 1.0f, p_fail, key + i); + } +} + +TEST_F(TestSubspaceDistortion, gaussian_float_tau_020_fail_00001) { + uint32_t key = 8673309; + float p_fail = 1e-3; + for (uint32_t i = 0; i < 3; ++i ) { + run_gaussian(0.75f, 0.2f, p_fail, key + i); + run_gaussian(0.50f, 0.2f, p_fail, key + i); + run_gaussian(0.25f, 0.2f, p_fail, key + i); + } +} + diff --git a/test/test_handrolled_lapack.cc b/test/test_handrolled_lapack.cc index 3b4d449b..ac040170 100644 --- a/test/test_handrolled_lapack.cc +++ b/test/test_handrolled_lapack.cc @@ -318,7 +318,7 @@ class TestHandrolledEigvals : public ::testing::Test { std::vector subwork{}; T tol = 1e-3; RNGState state(key + 1); - auto [lambda_max, lambda_min, ignore] = hr_lapack::extremal_eigvals_powermethod(n, A.data(), ourwork.data(), tol, state, subwork); + auto [lambda_max, lambda_min, ignore] = hr_lapack::exeigs_powermethod(n, A.data(), ourwork.data(), tol, state, subwork); std::cout << "min_comp / min_actual = " << lambda_min / eigvals_expect[n-1] << std::endl; std::cout << "max_comp / max_actual = " << lambda_max / eigvals_expect[0] << std::endl; @@ -345,7 +345,7 @@ TEST_F(TestHandrolledEigvals, power_medium) { run_power_general_posdef( 512, 1 ); } -TEST_F(TestHandrolledEigvals, power_largish) { +TEST_F(TestHandrolledEigvals, power_largeish) { run_power_general_posdef( 1024, 0 ); run_power_general_posdef( 1024, 1 ); } From de57a9d65324f9e41be5295903b22c1df430353c Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 19 Aug 2024 23:23:31 -0400 Subject: [PATCH 31/37] scale uniform to have variance 1. Need to change distributional tests, which currently assume uniform [-1,1]. 
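
For reference, the sqrt(3) scale factor follows from a one-line variance calculation:
if X is uniform over [-r, r], then

    \operatorname{Var}(X) = \frac{(2r)^2}{12} = \frac{r^2}{3}, \qquad r = \sqrt{3} \;\Longrightarrow\; \operatorname{Var}(X) = 1,

so samples drawn from Unif[-1, 1] can be multiplied by sqrt(3) to obtain unit variance.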
--- RandBLAS/dense_skops.hh | 45 ++------- test/handrolled_lapack.hh | 44 +++++++-- test/test_basic_rng/test_distortion.cc | 125 +++++++++++++++---------- test/test_handrolled_lapack.cc | 2 +- 4 files changed, 122 insertions(+), 94 deletions(-) diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index 0ae9334f..1442113b 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -47,7 +47,6 @@ namespace RandBLAS::dense { - template inline void copy_promote(int n, const T_IN &a, T_OUT* b) { for (int i = 0; i < n; ++i) @@ -95,15 +94,7 @@ inline void copy_promote(int n, const T_IN &a, T_OUT* b) { * */ template -static RNGState fill_dense_submat_impl( - int64_t n_cols, - T* smat, - int64_t n_srows, - int64_t n_scols, - int64_t ptr, - const RNGState & seed, - int64_t lda = 0 -) { +static RNGState fill_dense_submat_impl(int64_t n_cols, T* smat, int64_t n_srows, int64_t n_scols, int64_t ptr, const RNGState &seed, int64_t lda = 0) { if (lda <= 0) { lda = n_scols; } else { @@ -225,7 +216,6 @@ enum class DenseDistName : char { BlackBox = 'B' }; - // ============================================================================= /// A distribution over dense sketching operators. struct DenseDist { @@ -302,9 +292,7 @@ struct DenseDist { }; -inline blas::Layout dist_to_layout( - const DenseDist &D -) { +inline blas::Layout dist_to_layout(const DenseDist &D) { randblas_require(D.major_axis != MajorAxis::Undefined); bool is_wide = D.n_rows < D.n_cols; bool fa_long = D.major_axis == MajorAxis::Long; @@ -319,9 +307,7 @@ inline blas::Layout dist_to_layout( } } -inline int64_t major_axis_length( - const DenseDist &D -) { +inline int64_t major_axis_length(const DenseDist &D) { randblas_require(D.major_axis != MajorAxis::Undefined); return (D.major_axis == MajorAxis::Long) ? std::max(D.n_rows, D.n_cols) : std::min(D.n_rows, D.n_cols); @@ -334,14 +320,13 @@ inline T isometry_scale_factor(DenseDist D) { return common; } else if (D.family == DenseDistName::Uniform) { // the variance of an r.v. distributed Unif[-1, 1] is 4/12=1/3. - return std::sqrt(3)*common; + return common; } else { throw std::runtime_error("Unrecognized distribution."); } } - // ============================================================================= /// A sample from a distribution over dense sketching operators. /// @@ -502,16 +487,7 @@ struct DenseSkOp { /// - Used to define \math{S} as a sample from \math{\D}. /// template -RNGState fill_dense( - blas::Layout layout, - const DenseDist &D, - int64_t n_rows, - int64_t n_cols, - int64_t ro_s, - int64_t co_s, - T* buff, - const RNGState &seed -) { +RNGState fill_dense(blas::Layout layout, const DenseDist &D, int64_t n_rows, int64_t n_cols, int64_t ro_s, int64_t co_s, T* buff, const RNGState &seed) { using RandBLAS::dense::fill_dense_submat_impl; randblas_require(D.n_rows >= n_rows + ro_s); randblas_require(D.n_cols >= n_cols + co_s); @@ -536,6 +512,7 @@ RNGState fill_dense( } case DenseDistName::Uniform: { next_state = fill_dense_submat_impl(ma_len, buff, n_rows_, n_cols_, ptr, seed); + blas::scal(n_rows_ * n_cols_, (T)std::sqrt(3), buff, 1); break; } case DenseDistName::BlackBox: { @@ -589,11 +566,7 @@ RNGState fill_dense( /// the caller's responsibility to perform a transpose as needed. 
/// template -RNGState fill_dense( - const DenseDist &D, - T *buff, - const RNGState &seed -) { +RNGState fill_dense(const DenseDist &D, T *buff, const RNGState &seed) { return fill_dense(dist_to_layout(D), D, D.n_rows, D.n_cols, 0, 0, buff, seed); } @@ -611,9 +584,7 @@ RNGState fill_dense( /// A DenseSkOp object. /// template -void fill_dense( - DenseSkOp &S -) { +void fill_dense(DenseSkOp &S) { randblas_require(S.buff == nullptr); randblas_require(S.dist.family != DenseDistName::BlackBox); S.buff = new T[S.dist.n_rows * S.dist.n_cols]; diff --git a/test/handrolled_lapack.hh b/test/handrolled_lapack.hh index b4a1f3fd..9b2fbd7e 100644 --- a/test/handrolled_lapack.hh +++ b/test/handrolled_lapack.hh @@ -210,17 +210,36 @@ int64_t posdef_eig_chol_iteration(int64_t n, T* A, T* eigvals, T reltol, int64_t return (converged) ? iter : -iter; } +template +inline int64_t required_powermethod_iters(int64_t n, T p_fail, T tol) { + T pi = 4*std::atan(1.0); + int64_t expectation_bound = (int64_t) std::ceil(( 1.0 + std::log(std::sqrt(pi * (T)n)) )/ tol ); + + T temp0 = 1 - tol; + T temp1 = std::log(1 / temp0); + T temp2 = tol * p_fail * p_fail; + int64_t probability_bound_1 = (int64_t) std::log(std::exp(1.) + (T)0.27 * temp0 * temp1 / temp2) / temp1; + int64_t probability_bound_2 = (int64_t) std::log(std::sqrt(n) / p_fail) / temp1; + int64_t probability_bound = std::min(probability_bound_1, probability_bound_2); + + // std::cout << "(n, p, eps) = " << n << ", " << p_fail << ", " << tol << std::endl; + // std::cout << "Power iters bound for expectation : " << expectation_bound << std::endl; + // std::cout << "Power iters bound for probability : " << probability_bound << std::endl; + int64_t num_iters = std::max(expectation_bound, probability_bound); + return num_iters; +} + template -std::pair> power_method(int64_t n, FUNC &A, T* v, T tol, const RNGState &state) { +std::pair> power_method(int64_t n, FUNC &A, T* v, T tol, T failure_prob, const RNGState &state) { auto next_state = RandBLAS::fill_dense(blas::Layout::ColMajor, {n, 1}, n, 1, 0, 0, v, state); std::vector work(n, 0.0); T* u = work.data(); T norm = blas::nrm2(n, v, 1); blas::scal(n, (T)1.0/norm, v, 1); T lambda = 0.0; - T pi = 4*std::atan(1.0); - int64_t max_iter = (int64_t) std::ceil(( 1.0 + std::log(std::sqrt(pi * (T)n)) )/ tol ); - for (int64_t iter = 0; iter < max_iter; ++iter) { + // + int64_t num_iters = required_powermethod_iters(n, failure_prob, tol); + for (int64_t iter = 0; iter < num_iters; ++iter) { A(v, u); lambda = blas::dot(n, v, 1, u, 1); blas::copy(n, u, 1, v, 1); @@ -230,8 +249,19 @@ std::pair> power_method(int64_t n, FUNC &A, T* v, T tol, const return {lambda, next_state}; } +// Note that if we're only interested in subspace embedding distortion then it would suffice to just bound +// the eigenvalue of A-I with largest absolute value (which might be negative). If we went with that approach +// then we could make do with one run of a power method instead of running the power method on A and inv(A). +// +// The convergence results I know for the power method that don't require a spectral gap are specifically +// for PSD matrices. Now, we could just run the power method implicitly on the PSD matrix (A - I)^2. +// This require the same number of matrix-vector multiplications, but it remove the need for ever +// accessing inv(A) as a linear operator (which we do right now by decomposing A and forming invA explicitly, +// so we can get away with GEMV). 
That's useful if A is a fast operator (whether or not that's the case +// might be delicate since it's a Gram matrix of a sketch S*U). +// template -std::tuple> exeigs_powermethod(int64_t n, const T* A, T* eigvecs, T tol, const RNGState &state, std::vector work) { +std::tuple> exeigs_powermethod(int64_t n, const T* A, T* eigvecs, T tol, T failure_prob, const RNGState &state, std::vector work) { auto layout = blas::Layout::ColMajor; RandBLAS::util::require_symmetric(layout, A, n, n, (T) 0.0); @@ -239,7 +269,7 @@ std::tuple> exeigs_powermethod(int64_t n, const T* A, T* eig auto A_func = [layout, A, n](const T* x, T* y) { blas::gemv(layout, blas::Op::NoTrans, n, n, (T) 1.0, A, n, x, 1, (T) 0.0, y, 1); }; - auto [lambda_max, next_state] = power_method(n, A_func, eigvecs, tol, state); + auto [lambda_max, next_state] = power_method(n, A_func, eigvecs, tol, failure_prob, state); // To compute the smallest eigenpair we'll explicitly invert A. This requires // 2n^2 workspace: n^2 workspace for Cholesky of A (since we don't want to destroy A) @@ -268,7 +298,7 @@ std::tuple> exeigs_powermethod(int64_t n, const T* A, T* eig blas::gemv(layout, blas::Op::NoTrans, n, n, (T) 1.0, invA, n, x, 1, (T) 0.0, y, 1); return; }; - auto [lambda_min, final_state] = power_method(n, invA_func, eigvecs + n, tol, next_state); + auto [lambda_min, final_state] = power_method(n, invA_func, eigvecs + n, tol, failure_prob, next_state); lambda_min = 1.0/lambda_min; return {lambda_max, lambda_min, final_state}; diff --git a/test/test_basic_rng/test_distortion.cc b/test/test_basic_rng/test_distortion.cc index 84bae8b7..43429bab 100644 --- a/test/test_basic_rng/test_distortion.cc +++ b/test/test_basic_rng/test_distortion.cc @@ -43,74 +43,80 @@ using RandBLAS::RNGState; #include -int64_t get_min_dimension(double p, double tau) { - double val = std::sqrt(-2 * std::log(p)) + 1; - val /= tau; - val *= val; - return (int64_t) std::ceil(val); -} - class TestSubspaceDistortion : public ::testing::Test { protected: template - void run_gaussian(T distortion, T tau, T p_fail_bound, uint32_t key) { - /** - * Generate a d-by-N Gaussian matrix, where d = gamma*N, - * gamma = (r/delta)^2, and N is the smallest integer where n > N implies - * n*(r - 1 - 1/sqrt(n))^2 >= 2*log(1/p_fail_bound). - * One can verify that this value for N is given as - * N = ceil( ([sqrt(2*log(1/p)) + 1]/(r-1))^2 ) if r > 1 - * - * With probability at least 1 - p_fail_bound, the spectrum - * of the generated matrix will lay in the interval - * [1 - distortion, 1 + distortion]. - * - * ---------------------- - * Temporary notes - * ---------------------- - * Find N = min{ n : exp(-t^2 gamma n ) <= p_fail_bound }, where - * t := delta - (gamma)^{-1/2}(1 + 1/sqrt(n)), and - * gamma := (r/delta)^2. - * - * Choosing gamma of this form with r > 1 ensures that no - * matter the value of delta in (0, 1) there always an N - * so that probability bound holds whenever n >= N. We know of no bounds - * available when r is in (0, 1). - * - * --------------------- - * if delta = 1/sqrt(2) then we're looking at gamma = 2*r^2. 
- * Or, setting r = 1 + tau for tau > 0, we're looking at gamma=2*(1+tau)^2 - * to get convergence rate tau in the sense that we fall below a target - * failure probability once - * N = ceil( ( [sqrt(2*log(1/p)) + 1] / tau )^2 ) - * - */ + void run_general(DenseDistName name, T distortion, int64_t d, int64_t N, uint32_t key) { auto layout = blas::Layout::ColMajor; - int64_t N = get_min_dimension(p_fail_bound, tau); - int64_t d = std::ceil( std::pow((1 + tau) / distortion, 2) * N ); - DenseDist D(d, N, DenseDistName::Gaussian); + DenseDist D(d, N, name); std::vector S(d*N); std::cout << "(d, N) = ( " << d << ", " << N << " )\n"; RandBLAS::RNGState state(key); auto next_state = RandBLAS::fill_dense(D, S.data(), state); - blas::scal(d*N, (T)1.0/std::sqrt(d), S.data(), 1); + T inv_stddev = (name == DenseDistName::Gaussian) ? (T) 1.0 : (T) 1.0; + blas::scal(d*N, inv_stddev / std::sqrt(d), S.data(), 1); std::vector G(N*N, 0.0); blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, N, d, (T)1.0, S.data(), d, (T)0.0, G.data(), N); RandBLAS::util::symmetrize(layout, blas::Uplo::Upper, N, G.data(), N); std::vector eigvecs(2*N, 0.0); std::vector subwork{}; - auto [lambda_max, lambda_min, ignore] = hr_lapack::exeigs_powermethod(N, G.data(), eigvecs.data(), (T) 1e-2, state, subwork); + T powermethod_reltol = 1e-2; + T powermethod_failprob = 1e-6; + auto [lambda_max, lambda_min, ignore] = hr_lapack::exeigs_powermethod( + N, G.data(), eigvecs.data(), powermethod_reltol, powermethod_failprob, state, subwork + ); T sigma_max = std::sqrt(lambda_max); T sigma_min = std::sqrt(lambda_min); ASSERT_LE(sigma_max, 1+distortion); ASSERT_GE(sigma_min, 1-distortion); return; } + + template + void run_gaussian(T distortion, T tau, T p_fail_bound, uint32_t key) { + /** + * Generate a d-by-N random matrix, where d = gamma*N, + * gamma = ((1 + tau)/delta)^2, and N is the smallest integer where n > N implies + * n*(tau - 1/sqrt(n))^2 >= 2*log(1/p_fail_bound). + * One can verify that this value for N is given as + * N = ceil( ([sqrt(2*log(1/p)) + 1]/ tau )^2 ) + * + * ---------------------- + * Temporary notes + * ---------------------- + * Find N = min{ n : exp(-t^2 gamma n ) <= p_fail_bound }, where + * t := delta - (gamma)^{-1/2}(1 + 1/sqrt(n)), and + * gamma := ((1+tau)/delta)^2. + * + * Choosing gamma of this form with tau > 0 ensures that no + * matter the value of delta in (0, 1) there always an N + * so that probability bound holds whenever n >= N. + * + */ + double val = std::sqrt(-2 * std::log(p_fail_bound)) + 1; + val /= tau; + val *= val; + int64_t N = (int64_t) std::ceil(val); + int64_t d = std::ceil( std::pow((1 + tau) / distortion, 2) * N ); + run_general(DenseDistName::Gaussian, distortion, d, N, key); + return; + } + + template + void run_uniform(T distortion, T rate, T p_fail_bound, uint32_t key) { + int64_t N = std::ceil(std::log((T)2 / p_fail_bound) / rate); + T c6 = 1.0; // definitely not high enough. 
+ T epsnet_spectralnorm_factor = 1.0; // should be 4.0 + T theta = epsnet_spectralnorm_factor * c6 * (rate + std::log(9)); + int64_t d = std::ceil(N * theta * std::pow(distortion, -2)); + run_general(DenseDistName::Uniform, distortion, d, N, key); + return; + } }; -TEST_F(TestSubspaceDistortion, gaussian_float_tau_100_fail_00001) { +TEST_F(TestSubspaceDistortion, gaussian_float_rate_100_fail_00001) { uint32_t key = 8673309; float p_fail = 1e-3; for (uint32_t i = 0; i < 3; ++i ) { @@ -120,13 +126,34 @@ TEST_F(TestSubspaceDistortion, gaussian_float_tau_100_fail_00001) { } } -TEST_F(TestSubspaceDistortion, gaussian_float_tau_020_fail_00001) { +TEST_F(TestSubspaceDistortion, gaussian_float_rate_002_fail_00001) { + uint32_t key = 8673309; + float p_fail = 1e-3; + float tau = 0.2f; // the convergence rate depends on tau^2. + for (uint32_t i = 0; i < 3; ++i ) { + run_gaussian(0.75f, tau, p_fail, key + i); + run_gaussian(0.50f, tau, p_fail, key + i); + run_gaussian(0.25f, tau, p_fail, key + i); + } +} + +TEST_F(TestSubspaceDistortion, uniform_float_rate_100_fail_00001) { uint32_t key = 8673309; float p_fail = 1e-3; for (uint32_t i = 0; i < 3; ++i ) { - run_gaussian(0.75f, 0.2f, p_fail, key + i); - run_gaussian(0.50f, 0.2f, p_fail, key + i); - run_gaussian(0.25f, 0.2f, p_fail, key + i); + run_uniform(0.50f, 1.0f, p_fail, key + i); + run_uniform(0.25f, 1.0f, p_fail, key + i); + run_uniform(0.10f, 1.0f, p_fail, key + i); } } +TEST_F(TestSubspaceDistortion, uniform_float_rate_004_fail_00001) { + uint32_t key = 8673309; + float p_fail = 1e-3; + float rate = 0.04; + for (uint32_t i = 0; i < 3; ++i ) { + run_uniform(0.50f, rate, p_fail, key + i); + run_uniform(0.25f, rate, p_fail, key + i); + run_uniform(0.10f, rate, p_fail, key + i); + } +} diff --git a/test/test_handrolled_lapack.cc b/test/test_handrolled_lapack.cc index ac040170..3e324684 100644 --- a/test/test_handrolled_lapack.cc +++ b/test/test_handrolled_lapack.cc @@ -318,7 +318,7 @@ class TestHandrolledEigvals : public ::testing::Test { std::vector subwork{}; T tol = 1e-3; RNGState state(key + 1); - auto [lambda_max, lambda_min, ignore] = hr_lapack::exeigs_powermethod(n, A.data(), ourwork.data(), tol, state, subwork); + auto [lambda_max, lambda_min, ignore] = hr_lapack::exeigs_powermethod(n, A.data(), ourwork.data(), tol, (T) 1e-6, state, subwork); std::cout << "min_comp / min_actual = " << lambda_min / eigvals_expect[n-1] << std::endl; std::cout << "max_comp / max_actual = " << lambda_max / eigvals_expect[0] << std::endl; From c5c80b2961b7b87a0f9f6a6eaba0b531e610b431 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 21 Aug 2024 16:44:09 -0400 Subject: [PATCH 32/37] implementation cleaned up and all tests pass --- test/test_basic_rng/rng_common.hh | 34 ++++++++++++++++--- test/test_basic_rng/test_continuous.cc | 2 +- test/test_basic_rng/test_discrete.cc | 38 +++++++++------------- test/test_datastructures/test_denseskop.cc | 2 +- 4 files changed, 47 insertions(+), 29 deletions(-) diff --git a/test/test_basic_rng/rng_common.hh b/test/test_basic_rng/rng_common.hh index 21cbf04d..736d8734 100644 --- a/test/test_basic_rng/rng_common.hh +++ b/test/test_basic_rng/rng_common.hh @@ -169,6 +169,20 @@ double critical_value_rep_mutator(TI &n, double &sig) { } +// Function to check the KS-Stat against crit values +template +std::pair ks_check_critval(const std::vector &cdf1, const std::vector &cdf2, double critical_value) { + assert(cdf1.size() == cdf2.size()); // Vectors must be of same size to perform test + + for (size_t i = 0; i < cdf1.size(); ++i) 
{ + double diff = std::abs(cdf1[i] - cdf2[i]); + if (diff > critical_value) { + return {i, diff}; // the test failed. + } + } + return {-1, 0.0}; // interpret a negative return value as the test passing. +} + // // MARK: combinatorics // @@ -215,6 +229,18 @@ inline double hypergeometric_pmf(int64_t N, int64_t K, int64_t D, int64_t observ return out; } +template +std::vector hypergeometric_pmf_arr(int64_t N, int64_t K, int64_t D) { + randblas_require(0 <= K && K <= N); + randblas_require(0 <= D && D <= N); + + std::vector pmf(K + 1); + for (int64_t i = 0; i <= K; ++i) { + pmf[i] = hypergeometric_pmf(N, K, D, i); + } + return pmf; +} + inline double hypergeometric_mean(int64_t N, int64_t K, int64_t D) { double dN = (double) N; double dK = (double) K; @@ -242,12 +268,12 @@ inline T standard_normal_cdf(T x) { } template -inline T uniform_neg11_cdf(T x) { - if (x <= -1) +inline T uniform_syminterval_cdf(T x, T radius) { + if (x <= -radius) return 0; - if (x >= 1) + if (x >= radius) return 1; - return (x + 1) / 2; + return (x + radius) / (2*radius); } } // end namespace RandBLAS_StatTests diff --git a/test/test_basic_rng/test_continuous.cc b/test/test_basic_rng/test_continuous.cc index 398f08c7..fec6ba4a 100644 --- a/test/test_basic_rng/test_continuous.cc +++ b/test/test_basic_rng/test_continuous.cc @@ -60,7 +60,7 @@ class TestScalarDistributions : public ::testing::Test { if (dn == DenseDistName::Gaussian) { return RandBLAS_StatTests::standard_normal_cdf(x); } else if (dn == DenseDistName::Uniform) { - return RandBLAS_StatTests::uniform_neg11_cdf(x); + return RandBLAS_StatTests::uniform_syminterval_cdf(x, (T) std::sqrt(3)); } else { std::string msg = "Unrecognized distributions name"; throw std::runtime_error(msg); diff --git a/test/test_basic_rng/test_discrete.cc b/test/test_basic_rng/test_discrete.cc index 852d7692..69382e07 100644 --- a/test/test_basic_rng/test_discrete.cc +++ b/test/test_basic_rng/test_discrete.cc @@ -165,8 +165,7 @@ class TestSampleIndices : public ::testing::Test // Mark: Without Replacement Tests // - static std::vector fisher_yates_cdf(const std::vector &idxs_major, int64_t K, int64_t num_samples) - { + static std::vector fisher_yates_cdf(const std::vector &idxs_major, int64_t K, int64_t num_samples) { using RandBLAS::util::weights_to_cdf; std::vector empirical_cdf; @@ -212,8 +211,7 @@ class TestSampleIndices : public ::testing::Test } } - static void single_test_fisher_yates_kolmogorov_smirnov(int64_t N, int64_t K, double significance, int64_t num_samples, uint32_t seed) - { + static void single_test_fisher_yates_kolmogorov_smirnov(int64_t N, int64_t K, double significance, int64_t num_samples, uint32_t seed) { using RandBLAS::repeated_fisher_yates; using RandBLAS_StatTests::hypergeometric_pmf_arr; using RandBLAS::util::weights_to_cdf; @@ -231,7 +229,7 @@ class TestSampleIndices : public ::testing::Test idxs_major.data(), idxs_minor.data(), vals.data()); // Generate the true hypergeometric cdf - std::vector true_pmf = hypergeometric_pmf_arr(N, K, K); + std::vector true_pmf = hypergeometric_pmf_arr(N, K, K); weights_to_cdf(true_pmf.size(), true_pmf.data()); std::vector true_cdf = true_pmf; // Rename for clarity @@ -242,31 +240,25 @@ class TestSampleIndices : public ::testing::Test return; } - static int64_t special_increment_k(int64_t K, int64_t N, const std::vector &K_bounds = {9, 99}) - { - // - // Example evolution of K with N=1000: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 23, 39, 64, 100, - // - double exp_base = std::pow(10.0, 1.0 / 10.0); // Log base to give 10 
steps for each order of magnitude - - if (K <= K_bounds[0]) { - // Step one-by-one up to K_bounds[0] (e.g., 9) - return K + 1; - } else if (K <= K_bounds[1]) { - // Step in square root scale up to K_bounds[1] (e.g., 99) - return static_cast(std::pow(std::sqrt(K) + 1, 2)); + static void incr_with_phase_transitions(int64_t &K, int64_t unit_bound = 10, int64_t sqrt_bound = 100) { + if (K < unit_bound) { + K += 1; + } else if (K < sqrt_bound) { + // Step in square root scale up to sqrt_bound + K += (int64_t) std::pow(std::sqrt(K) + 1, 2); } else { - // Step in log-scale after K_bounds[1] - return static_cast(K * exp_base); + // Step in log-scale after sqrt_bound + // Log base to give 5 steps for each order of magnitude + K *= std::pow(10.0, 0.2); } + return; } - static void test_fisher_yates_kolmogorov_smirnov(int64_t N, double significance, int64_t num_samples, uint32_t seed) - { + static void test_fisher_yates_kolmogorov_smirnov(int64_t N, double significance, int64_t num_samples, uint32_t seed) { int64_t K = 0; while (K <= N) { single_test_fisher_yates_kolmogorov_smirnov(N, K, significance, num_samples, seed); - K = special_increment_k(K, N); + incr_with_phase_transitions(K); } } diff --git a/test/test_datastructures/test_denseskop.cc b/test/test_datastructures/test_denseskop.cc index 01ac3355..884243be 100644 --- a/test/test_datastructures/test_denseskop.cc +++ b/test/test_datastructures/test_denseskop.cc @@ -149,7 +149,7 @@ TEST_F(TestDenseMoments, Gaussian) TEST_F(TestDenseMoments, Uniform) { auto dn = RandBLAS::DenseDistName::Uniform; - double expect_stddev = 1.0 / sqrt(3.0); + double expect_stddev = 1.0; for (uint32_t key : {0, 1, 2}) { test_mean_stddev(key, 500, 500, dn, (float) expect_stddev); From 08486e3b9dd06588c940fa4ed782265cbf607e7d Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 21 Aug 2024 17:05:01 -0400 Subject: [PATCH 33/37] change default scalar distribution to Gaussian, since describing the uniform distribution over [-sqrt(3),sqrt(3)] is awkward. --- RandBLAS/dense_skops.hh | 23 +++++++++---------- .../svd_rank1_plus_noise.cc | 4 ++-- rtd/source/tutorial/sampling_skops.rst | 2 +- test/test_basic_rng/test_continuous.cc | 6 ++--- test/test_basic_rng/test_distortion.cc | 8 +++---- 5 files changed, 21 insertions(+), 22 deletions(-) diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index 1442113b..a751af9b 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -203,11 +203,12 @@ namespace RandBLAS { /// that can be used in GEMM. enum class DenseDistName : char { // --------------------------------------------------------------------------- - /// Indicates the Gaussian distribution with mean 0 and standard deviation 1. + /// Indicates the Gaussian distribution with mean 0 and variance 1. Gaussian = 'G', // --------------------------------------------------------------------------- - /// Indicates the uniform distribution over [-1, 1]. + /// Indicates the uniform distribution over [-r, r] where r := sqrt(3) + /// is the radius that provides for a variance of 1. Uniform = 'U', // --------------------------------------------------------------------------- @@ -268,12 +269,13 @@ struct DenseDist { // --------------------------------------------------------------------------- /// A distribution over matrices of shape (n_rows, n_cols) with entries drawn - /// iid from either the standard normal distribution or the uniform distribution - /// over [-1, 1]. 
+    /// iid from either the default choice of standard normal distribution, or from
+    /// the uniform distribution over [-r, r], where r := sqrt(3) provides for
+    /// unit variance.
     DenseDist(
         int64_t n_rows,
         int64_t n_cols,
-        DenseDistName dn = DenseDistName::Uniform
+        DenseDistName dn = DenseDistName::Gaussian
     ) : n_rows(n_rows), n_cols(n_cols), family(dn), major_axis( (dn == DenseDistName::BlackBox) ? MajorAxis::Undefined : MajorAxis::Long) { }

     DenseDist(
@@ -315,15 +317,12 @@ inline int64_t major_axis_length(const DenseDist &D) {

 template
 inline T isometry_scale_factor(DenseDist D) {
-    T common = std::pow((T) std::min(D.n_rows, D.n_cols), -0.5);
-    if (D.family == DenseDistName::Gaussian) {
-        return common;
-    } else if (D.family == DenseDistName::Uniform) {
-        // the variance of an r.v. distributed Unif[-1, 1] is 4/12=1/3.
-        return common;
-    } else {
+    if (D.family == DenseDistName::BlackBox) {
         throw std::runtime_error("Unrecognized distribution.");
     }
+    // When we sample from the scalar distributions we always
+    // scale things so they're variance-1.
+    return std::pow((T) std::min(D.n_rows, D.n_cols), -0.5);
 }

diff --git a/examples/sparse-low-rank-approx/svd_rank1_plus_noise.cc b/examples/sparse-low-rank-approx/svd_rank1_plus_noise.cc
index bfd04e6f..4bbf7580 100644
--- a/examples/sparse-low-rank-approx/svd_rank1_plus_noise.cc
+++ b/examples/sparse-low-rank-approx/svd_rank1_plus_noise.cc
@@ -85,7 +85,7 @@ void iid_sparsify_random_dense(
     #define MAT(_i, _j) mat[(_i) * stride_row + (_j) * stride_col]
     for (int64_t i = 0; i < n_rows; ++i) {
         for (int64_t j = 0; j < n_cols; ++j) {
-            T v = (SPAR(i, j) + 1.0) / 2.0;
+            T v = (SPAR(i, j) + std::sqrt(3)) / (2.0*std::sqrt(3));
             if (v < prob_of_zero) {
                 MAT(i, j) = 0.0;
             } else {
@@ -190,7 +190,7 @@ void make_noise_matrix(double noise_scale, int64_t m, int64_t n, double prob_of_
     // NOTE: it would be more efficient to sample vec_nnz*vec_nnz elements without replacement from the index set
     // from 0 to m*n-1, then de-vectorize those indices (in either row-major or col-major interpretation) and
     // only sample the values of the nonzeros for these pre-determined structural nonzeros. The current implementation
-    // has to generate two dense m-by-n matrices whose entries are iid uniform [-1, 1].
+    // has to generate two dense m-by-n matrices whose entries are iid uniform [-sqrt(3), sqrt(3)].
     //
     using T = typename SpMat::scalar_t;
     using sint_t = typename SpMat::index_t;
diff --git a/rtd/source/tutorial/sampling_skops.rst b/rtd/source/tutorial/sampling_skops.rst
index d101510a..ee549bdb 100644
--- a/rtd/source/tutorial/sampling_skops.rst
+++ b/rtd/source/tutorial/sampling_skops.rst
@@ -50,7 +50,7 @@ However, the *recommended* constructors for these classes just accept two parameters:
 a representation of a distribution (i.e., a DenseDist or a SparseDist) and an RNGState.

 For example, the following code produces a :math:`10000 \times 50` dense sketching operator
-whose entries are iid samples from the uniform distribution over :math:`[-1, 1]`.
+whose entries are iid samples from the standard normal distribution.

 ..
code:: c++ diff --git a/test/test_basic_rng/test_continuous.cc b/test/test_basic_rng/test_continuous.cc index fec6ba4a..03b5d87d 100644 --- a/test/test_basic_rng/test_continuous.cc +++ b/test/test_basic_rng/test_continuous.cc @@ -119,7 +119,7 @@ class TestScalarDistributions : public ::testing::Test { } }; -TEST_F(TestScalarDistributions, uneg11_ks_generous) { +TEST_F(TestScalarDistributions, uniform_ks_generous) { double s = 1e-6; for (uint32_t i = 999; i < 1011; ++i) { run(s, 100000, DenseDistName::Uniform, i); @@ -128,14 +128,14 @@ TEST_F(TestScalarDistributions, uneg11_ks_generous) { } } -TEST_F(TestScalarDistributions, uneg11_ks_moderate) { +TEST_F(TestScalarDistributions, uniform_ks_moderate) { double s = 1e-4; run(s, 100000, DenseDistName::Uniform, 0); run(s, 10000, DenseDistName::Uniform, 0); run(s, 1000, DenseDistName::Uniform, 0); } -TEST_F(TestScalarDistributions, uneg11_ks_skeptical) { +TEST_F(TestScalarDistributions, uniform_ks_skeptical) { double s = 1e-2; run(s, 100000, DenseDistName::Uniform, 0); run(s, 10000, DenseDistName::Uniform, 0); diff --git a/test/test_basic_rng/test_distortion.cc b/test/test_basic_rng/test_distortion.cc index 43429bab..dfbabedf 100644 --- a/test/test_basic_rng/test_distortion.cc +++ b/test/test_basic_rng/test_distortion.cc @@ -116,7 +116,7 @@ class TestSubspaceDistortion : public ::testing::Test { } }; -TEST_F(TestSubspaceDistortion, gaussian_float_rate_100_fail_00001) { +TEST_F(TestSubspaceDistortion, gaussian_rate_100_fail_0001) { uint32_t key = 8673309; float p_fail = 1e-3; for (uint32_t i = 0; i < 3; ++i ) { @@ -126,7 +126,7 @@ TEST_F(TestSubspaceDistortion, gaussian_float_rate_100_fail_00001) { } } -TEST_F(TestSubspaceDistortion, gaussian_float_rate_002_fail_00001) { +TEST_F(TestSubspaceDistortion, gaussian_rate_004_fail_0001) { uint32_t key = 8673309; float p_fail = 1e-3; float tau = 0.2f; // the convergence rate depends on tau^2. @@ -137,7 +137,7 @@ TEST_F(TestSubspaceDistortion, gaussian_float_rate_002_fail_00001) { } } -TEST_F(TestSubspaceDistortion, uniform_float_rate_100_fail_00001) { +TEST_F(TestSubspaceDistortion, uniform_rate_100_fail_0001) { uint32_t key = 8673309; float p_fail = 1e-3; for (uint32_t i = 0; i < 3; ++i ) { @@ -147,7 +147,7 @@ TEST_F(TestSubspaceDistortion, uniform_float_rate_100_fail_00001) { } } -TEST_F(TestSubspaceDistortion, uniform_float_rate_004_fail_00001) { +TEST_F(TestSubspaceDistortion, uniform_rate_004_fail_0001) { uint32_t key = 8673309; float p_fail = 1e-3; float rate = 0.04; From d5abae506a648a12a22b04341f2f3762a2f678ae Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 21 Aug 2024 17:14:03 -0400 Subject: [PATCH 34/37] fix segfault arising from calling the KS test critical value mutator after arrays had been allocated. 
---
 test/test_basic_rng/test_discrete.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/test_basic_rng/test_discrete.cc b/test/test_basic_rng/test_discrete.cc
index 69382e07..68e80162 100644
--- a/test/test_basic_rng/test_discrete.cc
+++ b/test/test_basic_rng/test_discrete.cc
@@ -217,6 +217,8 @@ class TestSampleIndices : public ::testing::Test
         using RandBLAS::util::weights_to_cdf;
         using RandBLAS_StatTests::KolmogorovSmirnovConstants::critical_value_rep_mutator;

+        auto critical_value = critical_value_rep_mutator(num_samples, significance);
+
         // Initialize arguments for repeated_fisher_yates
         std::vector idxs_major(K * num_samples);
         std::vector idxs_minor(K * num_samples);
@@ -228,13 +230,11 @@ class TestSampleIndices : public ::testing::Test
             state, K, N, num_samples, // K=vec_nnz, N=dim_major, num_samples=dim_minor
             idxs_major.data(), idxs_minor.data(), vals.data());

-        // Generate the true hypergeometric cdf
-        std::vector true_pmf = hypergeometric_pmf_arr(N, K, K);
-        weights_to_cdf(true_pmf.size(), true_pmf.data());
-        std::vector true_cdf = true_pmf; // Rename for clarity
+        // Generate the true hypergeometric cdf (get the pdf first)
+        std::vector true_cdf = hypergeometric_pmf_arr(N, K, K);
+        weights_to_cdf(true_cdf.size(), true_cdf.data());

         // Compute the critval and check against it
-        auto critical_value = critical_value_rep_mutator(num_samples, significance);
         fisher_yates_kolmogorov_smirnov_tester(idxs_major, true_cdf, critical_value, N, K, num_samples);

         return;

From bbfa43c960d542d779cd021071c90539d7489328 Mon Sep 17 00:00:00 2001
From: Riley Murray
Date: Wed, 21 Aug 2024 18:02:57 -0400
Subject: [PATCH 35/37] remove rogue character

---
 RandBLAS/dense_skops.hh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh
index a751af9b..d83e84d7 100644
--- a/RandBLAS/dense_skops.hh
+++ b/RandBLAS/dense_skops.hh
@@ -358,7 +358,7 @@ struct DenseSkOp {
     T *buff = nullptr;       // memory
     blas::Layout layout;     // matrix storage order
-    bool del_buff_on_destruct = false;   // only applies if fill_dense(S) has been called.å
+    bool del_buff_on_destruct = false;   // only applies if fill_dense(S) has been called.

From 0718d2bdb2f2df66d8fe8d4d1c4d1f80d2fca1cf Mon Sep 17 00:00:00 2001
From: Riley Murray
Date: Wed, 21 Aug 2024 22:06:11 -0400
Subject: [PATCH 36/37] Introduce (mostly empty) SketchingDistribution and
 SketchingOperator C++20 concepts. Bring DenseSkOp and SparseSkOp into closer
 alignment with one another. Simplify some unnecessarily verbose templating
 for functions that accepted sketching operators or sparse matrices as inputs.
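
As a quick illustration of how the new concepts are meant to be used, here is a minimal
sketch (not taken from this patch; the function name total_entries is ours, and we assume
the concept is reachable as RandBLAS::SketchingDistribution, as in the rtd docs below):

    #include <RandBLAS.hh>
    #include <cstdint>

    // Any type exposing n_rows, n_cols, and major_axis members satisfies the
    // concept, so this template accepts e.g. DenseDist without extra boilerplate.
    template <RandBLAS::SketchingDistribution SkDist>
    int64_t total_entries(const SkDist &D) {
        // Number of entries in an operator sampled from D.
        return D.n_rows * D.n_cols;
    }

    // Usage sketch:
    //   RandBLAS::DenseDist D(10000, 50);
    //   int64_t n = total_entries(D);   // 500000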
--- RandBLAS/base.hh | 86 ++++++++++--- RandBLAS/dense_skops.hh | 23 ++-- RandBLAS/skge.hh | 11 +- RandBLAS/sparse_skops.hh | 27 ++-- rtd/source/api_reference/skops_and_dists.rst | 38 +++++- rtd/source/index.rst | 5 +- rtd/source/tutorial/index.rst | 5 +- rtd/source/tutorial/updates.rst | 128 +++++++++++++++++++ test/test_datastructures/test_sparseskop.cc | 4 +- 9 files changed, 259 insertions(+), 68 deletions(-) create mode 100644 rtd/source/tutorial/updates.rst diff --git a/RandBLAS/base.hh b/RandBLAS/base.hh index d4c00a30..9e37b57f 100644 --- a/RandBLAS/base.hh +++ b/RandBLAS/base.hh @@ -119,7 +119,7 @@ concept SignedInteger = (std::numeric_limits::is_signed && std::numeric_limit template -inline TO safe_signed_int_product(TI a, TI b) { +inline TO safe_int_product(TI a, TI b) { if (a == 0 || b == 0) { return 0; } @@ -155,27 +155,38 @@ enum class MajorAxis : char { * the counter and the key. The arrays' types are statically sized, small * (typically of length 2 or 4), and can be distinct from one another. * - * @tparam RNG A CBRNG type in defined in Random123. We've found that Philox-based - * CBRNGs work best for our purposes, but we also support Threefry-based CBRNGS. + * The template parameter RNG is a CBRNG type in defined in Random123. We've found + * that Philox-based CBRNGs work best for our purposes, but we also support Threefry-based CBRNGS. */ template struct RNGState { using generator = RNG; - // The unsigned integer type used in this RNGState's counter array. - using ctr_uint_type = typename RNG::ctr_type::value_type; - /// @brief The unsigned integer type used in this RNGState's key array. - /// This is typically std::uint32_t, but it can be std::uint64_t. - using key_uint_type = typename RNG::key_type::value_type; - // An array type defined in Random123. + using ctr_type = typename RNG::ctr_type; - // An array type defined in Random123. + // ^ An array type defined in Random123. using key_type = typename RNG::key_type; + // ^ An array type defined in Random123. + using ctr_uint = typename RNG::ctr_type::value_type; + // ^ The unsigned integer type used in this RNGState's counter array. + + /// ------------------------------------------------------------------- + /// @brief The unsigned integer type used in this RNGState's key array. + /// This is typically std::uint32_t, but it can be std::uint64_t. + using key_uint = typename RNG::key_type::value_type; + const static int len_c = RNG::ctr_type::static_size; const static int len_k = RNG::key_type::static_size; - typename RNG::ctr_type counter; ///< This RNGState's counter array. - typename RNG::key_type key; ///< This RNGState's key array. + typename RNG::ctr_type counter; + // ^ This RNGState's counter array. + + /// ------------------------------------------------------------------ + /// This RNGState's key array. If you want to manually advance the key + /// by an integer increment of size "step," then you do so by calling + /// this->key.incr(step). + typename RNG::key_type key; + /// Initialize the counter and key arrays to all zeros. RNGState() : counter{{0}}, key(key_type{{}}) {} @@ -191,7 +202,7 @@ struct RNGState /// Initialize the counter array to all zeros. Initialize the key array to have first /// element equal to k and all other elements equal to zero. 
- RNGState(key_uint_type k) : counter{{0}}, key{{k}} {} + RNGState(key_uint k) : counter{{0}}, key{{k}} {} ~RNGState() {}; @@ -207,16 +218,16 @@ template RNGState::RNGState( const RNGState &s ) { - std::memcpy(this->counter.v, s.counter.v, this->len_c * sizeof(ctr_uint_type)); - std::memcpy(this->key.v, s.key.v, this->len_k * sizeof(key_uint_type)); + std::memcpy(this->counter.v, s.counter.v, this->len_c * sizeof(ctr_uint)); + std::memcpy(this->key.v, s.key.v, this->len_k * sizeof(key_uint)); } template RNGState &RNGState::operator=( const RNGState &s ) { - std::memcpy(this->counter.v, s.counter.v, this->len_c * sizeof(ctr_uint_type)); - std::memcpy(this->key.v, s.key.v, this->len_k * sizeof(key_uint_type)); + std::memcpy(this->counter.v, s.counter.v, this->len_c * sizeof(ctr_uint)); + std::memcpy(this->key.v, s.key.v, this->len_k * sizeof(key_uint)); return *this; } @@ -239,4 +250,45 @@ std::ostream &operator<<( return out; } +// ============================================================================= +/// @verbatim embed:rst:leading-slashes +/// +/// .. NOTE: \ttt expands to \texttt (its definition is given in an rst file) +/// +/// Words. Hello! +/// +/// @endverbatim +template +concept SketchingDistribution = requires(SkDist D) { + { D.n_rows } -> std::convertible_to; + { D.n_cols } -> std::convertible_to; + { D.major_axis } -> std::convertible_to; +}; + +// ============================================================================= +/// \fn isometry_scale_factor(SkDist D) +/// @verbatim embed:rst:leading-slashes +/// Words here ... +/// @endverbatim +template +inline T isometry_scale_factor(SkDist D); + + +// ============================================================================= +/// @verbatim embed:rst:leading-slashes +/// +/// .. NOTE: \ttt expands to \texttt (its definition is given in an rst file) +/// +/// Words. Hello! 
+/// +/// @endverbatim +template +concept SketchingOperator = requires(SkOp S) { + { S.n_rows } -> std::convertible_to; + { S.n_cols } -> std::convertible_to; + { S.seed_state } -> std::convertible_to; + { S.seed_state } -> std::convertible_to; +}; + + } // end namespace RandBLAS::base diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index d83e84d7..5cef11f1 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -135,7 +135,7 @@ static RNGState fill_dense_submat_impl(int64_t n_cols, T* smat, int64_t n_s #pragma omp for schedule(static) for (int64_t row = 0; row < n_srows; row++) { - int64_t incr_from_c = safe_signed_int_product(ctr_inter_row_stride, row); + int64_t incr_from_c = safe_int_product(ctr_inter_row_stride, row); auto c_row = c; c_row.incr(incr_from_c); @@ -185,7 +185,7 @@ RNGState compute_next_state(DD dist, RNGState state) { pad = ctr_size - major_len % ctr_size; } int64_t ctr_major_axis_stride = (major_len + pad) / ctr_size; - int64_t full_incr = safe_signed_int_product(ctr_major_axis_stride, minor_len); + int64_t full_incr = safe_int_product(ctr_major_axis_stride, minor_len); state.counter.incr(full_incr); return state; } @@ -332,9 +332,8 @@ inline T isometry_scale_factor(DenseDist D) { template struct DenseSkOp { - using generator = RNG; - using state_type = RNGState; - using buffer_type = T; + using state_t = RNGState; + using scalar_t = T; const int64_t n_rows; const int64_t n_cols; @@ -404,9 +403,8 @@ struct DenseSkOp { seed_state(state), next_state(dense::compute_next_state(dist, state)), buff(nullptr), - layout(dist_to_layout(dist) - ) { - // sanity checks + layout(dist_to_layout(dist)) + { // sanity checks randblas_require(this->dist.n_rows > 0); randblas_require(this->dist.n_cols > 0); if (dist.family == DenseDistName::BlackBox) @@ -497,11 +495,11 @@ RNGState fill_dense(blas::Layout layout, const DenseDist &D, int64_t n_rows // operate on the transpose in row-major n_rows_ = n_cols; n_cols_ = n_rows; - ptr = ro_s + safe_signed_int_product(co_s, ma_len); + ptr = ro_s + safe_int_product(co_s, ma_len); } else { n_rows_ = n_rows; n_cols_ = n_cols; - ptr = safe_signed_int_product(ro_s, ma_len) + co_s; + ptr = safe_int_product(ro_s, ma_len) + co_s; } RNGState next_state{}; switch (D.family) { @@ -582,8 +580,9 @@ RNGState fill_dense(const DenseDist &D, T *buff, const RNGState &seed) /// @param[in] S /// A DenseSkOp object. 
/// -template -void fill_dense(DenseSkOp &S) { +template +void fill_dense(DenseSkOp &S) { + using T = typename DenseSkOp::scalar_t; randblas_require(S.buff == nullptr); randblas_require(S.dist.family != DenseDistName::BlackBox); S.buff = new T[S.dist.n_rows * S.dist.n_cols]; diff --git a/RandBLAS/skge.hh b/RandBLAS/skge.hh index 316a8863..89ff455d 100644 --- a/RandBLAS/skge.hh +++ b/RandBLAS/skge.hh @@ -502,12 +502,9 @@ inline void lskges( ) { if (!S.known_filled) fill_sparse(S); - using RNG = typename SKOP::RNG_t; - using sint_t = typename SKOP::index_t; - auto Scoo = coo_view_of_skop(S); + auto Scoo = coo_view_of_skop(S); left_spmm( - layout, opS, opA, d, n, m, alpha, Scoo, ro_s, co_s, - A, lda, beta, B, ldb + layout, opS, opA, d, n, m, alpha, Scoo, ro_s, co_s, A, lda, beta, B, ldb ); return; } @@ -636,9 +633,7 @@ inline void rskges( ) { if (!S.known_filled) fill_sparse(S); - using RNG = typename SKOP::RNG_t; - using sint = typename SKOP::index_t; - auto Scoo = coo_view_of_skop(S); + auto Scoo = coo_view_of_skop(S); right_spmm( layout, opA, opS, m, d, n, alpha, A, lda, Scoo, ro_s, co_s, beta, B, ldb ); diff --git a/RandBLAS/sparse_skops.hh b/RandBLAS/sparse_skops.hh index 997d11d1..6e90651e 100644 --- a/RandBLAS/sparse_skops.hh +++ b/RandBLAS/sparse_skops.hh @@ -172,9 +172,9 @@ inline T isometry_scale_factor(SparseDist D) { template struct SparseSkOp { - using RNG_t = RNG; - using T_t = T; using index_t = sint_t; + using state_t = RNGState; + using scalar_t = T; const int64_t n_rows; const int64_t n_cols; @@ -375,16 +375,16 @@ struct SparseSkOp { /// @param[in] S /// SparseSkOp object. /// -template -void fill_sparse( - SparseSkOp & S -) { +template +void fill_sparse(SparseSkOp &S) { + int64_t long_ax_len = MAX(S.dist.n_rows, S.dist.n_cols); int64_t short_ax_len = MIN(S.dist.n_rows, S.dist.n_cols); - bool is_wide = S.dist.n_rows == short_ax_len; + + using sint_t = typename SparseSkOp::index_t; sint_t *short_ax_idxs = (is_wide) ? S.rows : S.cols; - sint_t *long_ax_idxs = (is_wide) ? S.cols : S.rows; + sint_t *long_ax_idxs = (is_wide) ? S.cols : S.rows; if (S.dist.major_axis == MajorAxis::Short) { sparse::repeated_fisher_yates( @@ -397,7 +397,6 @@ void fill_sparse( long_ax_idxs, short_ax_idxs, S.vals ); } - // TODO: add check that S.next_state == output from the repeated_fisher_yates function. S.known_filled = true; return; } @@ -470,16 +469,12 @@ static int64_t nnz( } } - -template -COOMatrix coo_view_of_skop(SparseSkOp &S) { +template +COOMatrix coo_view_of_skop(SkOp &S) { if (!S.known_filled) fill_sparse(S); int64_t nnz = RandBLAS::sparse::nnz(S); - COOMatrix A( - S.dist.n_rows, S.dist.n_cols, nnz, - S.vals, S.rows, S.cols - ); + COOMatrix A(S.dist.n_rows, S.dist.n_cols, nnz, S.vals, S.rows, S.cols); return A; } diff --git a/rtd/source/api_reference/skops_and_dists.rst b/rtd/source/api_reference/skops_and_dists.rst index 72508826..910f7805 100644 --- a/rtd/source/api_reference/skops_and_dists.rst +++ b/rtd/source/api_reference/skops_and_dists.rst @@ -4,12 +4,36 @@ Distributions and sketching operators .. _rngstate_api: -The state of a random number generator -================================================ -.. doxygenstruct:: RandBLAS::RNGState - :project: RandBLAS - :members: +Sketching distributions +======================= + +.. doxygenconcept:: RandBLAS::SketchingDistribution + :project: RandBLAS + +.. doxygenfunction:: RandBLAS::isometry_scale_factor(SkDist D) + :project: RandBLAS + + +Sketching operators and random states +===================================== + +.. 
dropdown:: Sketching operators + :animate: fade-in-slide-down + :color: light + + .. doxygenconcept:: RandBLAS::SketchingOperator + :project: RandBLAS + + +.. dropdown:: The state of a random number generator + :animate: fade-in-slide-down + :color: light + + .. doxygenstruct:: RandBLAS::RNGState + :project: RandBLAS + :members: + .. _densedist_and_denseskop_api: @@ -28,7 +52,7 @@ DenseDist and DenseSkOp :project: RandBLAS :members: -.. doxygenfunction:: RandBLAS::fill_dense(DenseSkOp &S) +.. doxygenfunction:: RandBLAS::fill_dense(DenseSkOp &S) :project: RandBLAS @@ -45,7 +69,7 @@ SparseDist and SparseSkOp :project: RandBLAS :members: -.. doxygenfunction:: RandBLAS::fill_sparse(SparseSkOp &S) +.. doxygenfunction:: RandBLAS::fill_sparse(SparseSkOp &S) :project: RandBLAS diff --git a/rtd/source/index.rst b/rtd/source/index.rst index 16a09d25..0efd91d9 100644 --- a/rtd/source/index.rst +++ b/rtd/source/index.rst @@ -18,9 +18,10 @@ We built RandBLAS to make it easier to write, debug, and deploy high-performance RandBLAS is efficient, flexible, and reliable. It uses CPU-based OpenMP acceleration to apply its sketching operators to dense or sparse data matrices stored in main memory. +It includes dense and sparse sketching operators (e.g., Gaussian operators, CountSketch, OSNAPs, etc..), which can +be applied to dense or sparse data in any combination that leads to a dense sketch. -All sketches produced by RandBLAS are dense. -If you have RandBLAS and an LAPACK-like library at your disposal, you can implement +With RandBLAS and an LAPACK-like library at your disposal, you can implement a huge range of shared-memory randomized algorithms for matrix computations. RandBLAS can be used in distributed environments through its ability to compute products with *submatrices* of sketching operators, without ever realizing the entire sketching operator in memory. diff --git a/rtd/source/tutorial/index.rst b/rtd/source/tutorial/index.rst index 2b677809..aa0662fb 100644 --- a/rtd/source/tutorial/index.rst +++ b/rtd/source/tutorial/index.rst @@ -45,15 +45,12 @@ For example, it lets you set an integer-valued the seed when defining :math:`\te It even lets you compute products against *submatrices* of sketching operators without ever forming the full operator in memory. -.. note:: - This tutorial is very much incomplete! Bear with us for the time being. - - .. toctree:: :maxdepth: 4 Background on GEMM Defining a sketching distribution Sampling a sketching operator + Updating a sketch The meaning of "submat(・)" in RandBLAS documentation diff --git a/rtd/source/tutorial/updates.rst b/rtd/source/tutorial/updates.rst new file mode 100644 index 00000000..a553d475 --- /dev/null +++ b/rtd/source/tutorial/updates.rst @@ -0,0 +1,128 @@ +********************************************************************************************* +Updating sketches with dense sketching operators +********************************************************************************************* + + .. |denseskop| mathmacro:: \texttt{DenseSkOp} + .. |seedstate| mathmacro:: \texttt{seed_state} + .. |nextstate| mathmacro:: \texttt{next_state} + .. |mtx| mathmacro:: {} + + +RandBLAS makes it easy to *implicitly* extend an initial sketching +operator :math:`\mtx{S}_1` into an augmented operator :math:`\mtx{S} = [\mtx{S}_1; \mtx{S}_2]` or :math:`\mtx{S} = [\mtx{S}_1, \mtx{S}_2]`. +There are four scenarios that you can find yourself in where +this can be done without generating :math:`S` from scratch. 
+
+In all four scenarios, the idea is to
+use the :math:`\nextstate` of :math:`\mtx{S}_1` as the
+:math:`\seedstate` of :math:`\mtx{S}_2`.
+
+There are two reasons why you'd want to
+extend a sketching operator: you might be trying to improve statistical
+quality by increasing sketch size (Scenarios 1 and 4), or you might be
+incorporating new data into a sketch of fixed size (Scenarios 2 and 3).
+The unifying perspective on Scenarios 1 and 4 is that they both add
+*long-axis vectors* to the sketching operator.
+The unifying perspective on
+Scenarios 2 and 3 is that they both add *short-axis vectors* to the
+sketching operator.
+
+:math:`\texttt{DenseDist}` objects have a :math:`\texttt{major_axis}` member, which states
+whether operators sampled from that distribution are short-axis or
+long-axis major. So when you specify the major axis for a sketching
+operator, you're choosing whether to keep open the possibility of
+improving the statistical quality of a sketch or of updating a sketch to
+incorporate more data.
+
+
+Scenario 1
+==========
+
+  Suppose :math:`\mtx{S}_1` is a *wide* :math:`d_1 \times m` row-wise
+  :math:`\denseskop` with seed :math:`c`. It's easy to generate a
+  :math:`d_2\times m` row-wise :math:`\denseskop` :math:`\mtx{S}_2` in such a way that
+  :math:`\mtx{S} = [\mtx{S}_1; \mtx{S}_2]` is the same as the :math:`(d_1 + d_2) \times m` row-wise
+  :math:`\denseskop` with seed :math:`c`.
+
+This scenario arises if we have a fixed data matrix :math:`\mtx{A}`, an initial
+sketch :math:`\mtx{B}_1 = \mtx{S}_1 \mtx{A}`, and we decide we want a larger sketch for
+statistical reasons. The updated sketch :math:`\mtx{B} = \mtx{S} \mtx{A}` can be expressed as
+
+.. math::
+
+   \mtx{B} = \begin{bmatrix} \mtx{S}_1 \mtx{A} \\ \mtx{S}_2 \mtx{A} \end{bmatrix}.
+
+If :math:`(\mtx{S}_1, \mtx{S}_2, \mtx{S})` satisfy the assumptions above, then the final sketch
+:math:`\mtx{B} = \mtx{S}\mtx{A}` will be the same regardless of whether we computed the sketch
+in one step or two steps. This is desirable for benchmarking and
+debugging RandNLA algorithms where the sketch size is a tuning parameter.
+
+Scenario 2
+==========
+
+  Suppose :math:`\mtx{S}_1` is a *wide* :math:`d \times m_1` column-wise
+  :math:`\denseskop` with seed :math:`c`. It's easy to generate a
+  :math:`d \times m_2` column-wise :math:`\denseskop` :math:`\mtx{S}_2` so that
+  :math:`\mtx{S} = [\mtx{S}_1, \mtx{S}_2]` is the same as the :math:`d \times (m_1 + m_2)` column-wise
+  :math:`\denseskop` with seed :math:`c`.
+
+This situation arises if we have an initial data matrix :math:`\mtx{A}_1`, an
+initial sketch :math:`\mtx{B}_1 = \mtx{S}_1 \mtx{A}_1`, and then a new matrix :math:`\mtx{A}_2` arrives in
+such a way that we want a sketch of :math:`\mtx{A} = [\mtx{A}_1; \mtx{A}_2]`. To compute :math:`\mtx{B} = \mtx{S}\mtx{A}`,
+we update :math:`\mtx{B}_1` according to the formula
+
+.. math::
+
+   \mtx{B} = \begin{bmatrix} \mtx{S}_1 & \mtx{S}_2 \end{bmatrix} \begin{bmatrix} \mtx{A}_1 \\ \mtx{A}_2 \end{bmatrix} = \mtx{B}_1 + \mtx{S}_2 \mtx{A}_2.
+
+If :math:`(\mtx{S}_1, \mtx{S}_2, \mtx{S})` satisfy the assumptions above, then :math:`\mtx{B}` will be the
+same as though we started with all of :math:`\mtx{A}` from the very beginning. This
+is useful for benchmarking and debugging RandNLA algorithms that involve
+operating on data matrices that increase in size over some number of iterations.
+
+.. 
_scenario-3:
+
+Scenario 3
+==========
+
+  Let :math:`\mtx{S}_1` be a *tall* :math:`n \times d_1` column-wise :math:`\denseskop`
+  with seed :math:`c`. We can easily generate an :math:`n\times d_2` column-wise
+  :math:`\denseskop` :math:`\mtx{S}_2` so that :math:`\mtx{S} = [\mtx{S}_1, \mtx{S}_2]` is the same
+  as the :math:`n \times (d_1 + d_2)` column-wise :math:`\denseskop` with seed :math:`c`.
+
+This arises if we have a fixed data matrix :math:`\mtx{A}`, an initial sketch :math:`\mtx{B}_1 = \mtx{A} \mtx{S}_1`,
+and we decide we want a larger sketch for statistical reasons. The
+updated sketch :math:`\mtx{B} = \mtx{A}\mtx{S}` can be expressed as
+
+.. math::
+
+   \mtx{B} = \begin{bmatrix} \mtx{A} \mtx{S}_1 & \mtx{A} \mtx{S}_2 \end{bmatrix}.
+
+If :math:`(\mtx{S}_1, \mtx{S}_2, \mtx{S})` satisfy the assumptions above, then the final sketch
+:math:`\mtx{B}` will be the same regardless of whether we computed the sketch in one
+step or two steps. This is desirable for benchmarking and debugging
+RandNLA algorithms where the sketch size is a tuning parameter.
+
+.. _scenario-4:
+
+Scenario 4
+==========
+
+  Suppose :math:`\mtx{S}_1` is a *tall* :math:`n_1 \times d` row-wise
+  :math:`\denseskop` with seed :math:`c`. It's easy to generate an :math:`n_2\times d`
+  row-wise :math:`\denseskop` :math:`\mtx{S}_2` in such a way that
+  :math:`\mtx{S} = [\mtx{S}_1; \mtx{S}_2]` is the same as the :math:`(n_1 + n_2) \times d` row-wise
+  :math:`\denseskop` with seed :math:`c`.
+
+This situation arises if we have an initial data matrix :math:`\mtx{A}_1`, an initial sketch
+:math:`\mtx{B}_1 = \mtx{A}_1 \mtx{S}_1`, and then a new matrix :math:`\mtx{A}_2` arrives in such a way that we
+want a sketch of :math:`\mtx{A} = [\mtx{A}_1, \mtx{A}_2]`. To compute :math:`\mtx{B} = \mtx{A}\mtx{S}`, we update :math:`\mtx{B}_1`
+according to the formula
+
+.. math::
+
+   \mtx{B} = \begin{bmatrix} \mtx{A}_1 & \mtx{A}_2 \end{bmatrix} \begin{bmatrix} \mtx{S}_1 \\ \mtx{S}_2 \end{bmatrix} = \mtx{B}_1 + \mtx{A}_2 \mtx{S}_2.
+
+If :math:`(\mtx{S}_1, \mtx{S}_2, \mtx{S})` satisfy the assumptions above, then :math:`\mtx{B}` will be the same as though
+we started with all of :math:`\mtx{A}` from the very beginning. This is useful for benchmarking and
+debugging RandNLA algorithms that involve operating on data matrices that increase in size over
+some number of iterations.
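+
+The state-threading idea shared by all four scenarios is easiest to see in code. The snippet
+below is an illustrative sketch of Scenario 1, not a normative recipe: the dimensions, seed,
+and function name are hypothetical, the single-header ``RandBLAS.hh`` include and the
+``DenseDist(n_rows, n_cols)`` / ``DenseSkOp(dist, state)`` constructors are assumed from the
+usage shown in RandBLAS's examples, and the example leans on ``DenseDist``'s defaults. In
+practice you would also set the :math:`\texttt{major_axis}` that your scenario calls for.
+
+.. code-block:: c++
+
+    #include <RandBLAS.hh>  // assumed single-header include
+
+    // Hypothetical routine: extend a d1 x m operator by d2 additional rows.
+    void extend_operator_example() {
+        int64_t d1 = 64, d2 = 32, m = 10000;  // illustrative dimensions
+        uint32_t seed = 99;                   // illustrative seed
+
+        RandBLAS::RNGState state(seed);
+        RandBLAS::DenseDist D1(d1, m);
+        RandBLAS::DenseSkOp<double> S1(D1, state);
+        RandBLAS::fill_dense(S1);
+
+        // Seed the second block with the first block's next_state, so that stacking
+        // [S1; S2] agrees with the (d1 + d2) x m operator drawn directly from `state`.
+        RandBLAS::DenseDist D2(d2, m);
+        RandBLAS::DenseSkOp<double> S2(D2, S1.next_state);
+        RandBLAS::fill_dense(S2);
+    }
+
+The same threading applies to Scenarios 2 through 4; only the shapes of the two distributions
+change.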
diff --git a/test/test_datastructures/test_sparseskop.cc b/test/test_datastructures/test_sparseskop.cc index 9843cb51..7709de52 100644 --- a/test/test_datastructures/test_sparseskop.cc +++ b/test/test_datastructures/test_sparseskop.cc @@ -77,7 +77,7 @@ class TestSparseSkOpConstruction : public ::testing::Test template void proper_saso_construction(int64_t d, int64_t m, int64_t key_index, int64_t nnz_index) { - using RNG = RandBLAS::SparseSkOp::RNG_t; + using RNG = RandBLAS::SparseSkOp::state_t::generator; RandBLAS::SparseSkOp S0( {d, m, vec_nnzs[nnz_index], RandBLAS::MajorAxis::Short}, keys[key_index] ); @@ -91,7 +91,7 @@ class TestSparseSkOpConstruction : public ::testing::Test template void proper_laso_construction(int64_t d, int64_t m, int64_t key_index, int64_t nnz_index) { - using RNG = RandBLAS::SparseSkOp::RNG_t; + using RNG = RandBLAS::SparseSkOp::state_t::generator; RandBLAS::SparseSkOp S0( {d, m, vec_nnzs[nnz_index], RandBLAS::MajorAxis::Long}, keys[key_index] ); From 973e32741fd5a27178718f9d9ef76c5f6ddc776f Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 22 Aug 2024 09:25:11 -0400 Subject: [PATCH 37/37] create an easier-to-understand overload of repeated_fisher_yates. Make meta-tests for power iteration implementation cheaper (by virtue of testing against the same tolerance but running fewer iterations, so the test is now more adversarial). --- RandBLAS/sparse_skops.hh | 15 +++++++++++++-- test/test_basic_rng/test_discrete.cc | 12 ++++-------- test/test_handrolled_lapack.cc | 6 +++++- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/RandBLAS/sparse_skops.hh b/RandBLAS/sparse_skops.hh index 6e90651e..b6c85fcc 100644 --- a/RandBLAS/sparse_skops.hh +++ b/RandBLAS/sparse_skops.hh @@ -60,6 +60,8 @@ static RNGState repeated_fisher_yates( sint_t *idxs_minor, T *vals ) { + bool write_vals = vals != nullptr; + bool write_idxs_minor = idxs_minor != nullptr; randblas_error_if(vec_nnz > dim_major); std::vector vec_work(dim_major); for (sint_t j = 0; j < dim_major; ++j) @@ -81,8 +83,10 @@ static RNGState repeated_fisher_yates( vec_work[j] = swap; // update (rows, cols, vals) idxs_major[j + offset] = (sint_t) swap; - vals[j + offset] = (rv[1] % 2 == 0) ? 1.0 : -1.0; - idxs_minor[j + offset] = (sint_t) i; + if (write_vals) + vals[j + offset] = (rv[1] % 2 == 0) ? 
1.0 : -1.0; + if (write_idxs_minor) + idxs_minor[j + offset] = (sint_t) i; // increment counter ctr_work.incr(); } @@ -101,6 +105,13 @@ static RNGState repeated_fisher_yates( return RNGState {ctr, key}; } +template +inline RNGState repeated_fisher_yates( + const RNGState &state, int64_t k, int64_t n, int64_t r, sint_t *indices +) { + return repeated_fisher_yates(state, k, n, r, indices, (sint_t*) nullptr, (double*) nullptr); +} + template RNGState compute_next_state(SD dist, RNGState state) { int64_t minor_len; diff --git a/test/test_basic_rng/test_discrete.cc b/test/test_basic_rng/test_discrete.cc index 68e80162..92014d29 100644 --- a/test/test_basic_rng/test_discrete.cc +++ b/test/test_basic_rng/test_discrete.cc @@ -212,7 +212,7 @@ class TestSampleIndices : public ::testing::Test } static void single_test_fisher_yates_kolmogorov_smirnov(int64_t N, int64_t K, double significance, int64_t num_samples, uint32_t seed) { - using RandBLAS::repeated_fisher_yates; + using RandBLAS::sparse::repeated_fisher_yates; using RandBLAS_StatTests::hypergeometric_pmf_arr; using RandBLAS::util::weights_to_cdf; using RandBLAS_StatTests::KolmogorovSmirnovConstants::critical_value_rep_mutator; @@ -220,22 +220,18 @@ class TestSampleIndices : public ::testing::Test auto critical_value = critical_value_rep_mutator(num_samples, significance); // Initialize arguments for repeated_fisher_yates - std::vector idxs_major(K * num_samples); - std::vector idxs_minor(K * num_samples); - std::vector vals(K * num_samples); + std::vector indices(K * num_samples); RNGState state(seed); // Generate repeated Fisher-Yates in idxs_major - state = repeated_fisher_yates( - state, K, N, num_samples, // K=vec_nnz, N=dim_major, num_samples=dim_minor - idxs_major.data(), idxs_minor.data(), vals.data()); + state = repeated_fisher_yates(state, K, N, num_samples, indices.data()); // Generate the true hypergeometric cdf (get the pdf first) std::vector true_cdf = hypergeometric_pmf_arr(N, K, K); weights_to_cdf(true_cdf.size(), true_cdf.data()); // Compute the critval and check against it - fisher_yates_kolmogorov_smirnov_tester(idxs_major, true_cdf, critical_value, N, K, num_samples); + fisher_yates_kolmogorov_smirnov_tester(indices, true_cdf, critical_value, N, K, num_samples); return; } diff --git a/test/test_handrolled_lapack.cc b/test/test_handrolled_lapack.cc index 3e324684..3e1aa2eb 100644 --- a/test/test_handrolled_lapack.cc +++ b/test/test_handrolled_lapack.cc @@ -318,7 +318,11 @@ class TestHandrolledEigvals : public ::testing::Test { std::vector subwork{}; T tol = 1e-3; RNGState state(key + 1); - auto [lambda_max, lambda_min, ignore] = hr_lapack::exeigs_powermethod(n, A.data(), ourwork.data(), tol, (T) 1e-6, state, subwork); + T p_fail = (n < 500) ? (T) 1e-6 : (T) 0.5; + // ^ This affects the number of iterations that will be used in the power method. That number is + // max{#iterations to succeed in expectation, #iterations to succeed with some probability}. + // Large values of p_fail (like p_fail = 0.5) just say "succeed in expectation." + auto [lambda_max, lambda_min, ignore] = hr_lapack::exeigs_powermethod(n, A.data(), ourwork.data(), tol, p_fail, state, subwork); std::cout << "min_comp / min_actual = " << lambda_min / eigvals_expect[n-1] << std::endl; std::cout << "max_comp / max_actual = " << lambda_max / eigvals_expect[0] << std::endl;