From 2872d1f5b811133a20103831df9e45306e611746 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Tue, 1 Oct 2024 10:14:17 -0700
Subject: [PATCH 01/26] init-commit, initial `TrigSkOp` structure

---
 RandBLAS/trig_skops.hh | 318 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 318 insertions(+)
 create mode 100644 RandBLAS/trig_skops.hh
diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
new file mode 100644
index 0000000..896a64f
--- /dev/null
+++ b/RandBLAS/trig_skops.hh
@@ -0,0 +1,318 @@
+#include "RandBLAS/base.hh"
+#include "RandBLAS/exceptions.hh"
+#include "RandBLAS/random_gen.hh"
+#include <RandBLAS/sparse_skops.hh>
+
+#include <Random123/philox.h>
+#include <blas.hh>
+#include <lapack.hh>
+#include <omp.h>
+
+#include <iostream>
+#include <stdio.h>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <random>
+
+#include <math.h>
+#include <typeinfo>
+
+#define MAX(a, b) (((a) < (b)) ? (b) : (a))
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+namespace RandBLAS {
+
+    // =============================================================================
+    /// WARNING: None of the following functions or overloads thereof are part of the
+    /// public API
+    ///
+    void generateRademacherVector_r123(std::vector<int64_t> &buff, uint32_t key_seed, uint32_t ctr_seed, int n) {
+        typedef r123::Philox2x32 RNG;
+        // std::vector<int64_t> rademacherVector(n);
+
+        // Use OpenMP to parallelize the Rademacher vector generation
+        #pragma omp parallel
+        {
+            // Each thread has its own RNG instance
+            RNG rng;
+
+            // You can use any seed value for the key
+            RNG::key_type k = {{key_seed + omp_get_thread_num()}}; // Unique key for each thread
+
+            // Thread-local counter
+            RNG::ctr_type c;
+
+            // Parallel for loop
+            #pragma omp for
+            for (int i = 0; i < n; ++i) {
+                // Set the counter for each random number (unique per thread)
+                c[0] = ctr_seed + i; // Ensure the counter is unique across threads
+
+                // Generate a 2x32-bit random number using the Philox generator
+                RNG::ctr_type r = rng(c, k);
+
+                // Convert the random number into a float in [0, 1) using u01fixedpt
+                float randValue = r123::u01fixedpt<float>(r.v[0]);
+
+                // Convert the float into a Rademacher entry (-1 or 1)
+                buff[i] = randValue < 0.5 ? -1 : 1;
+            }
+        }
+    }
+
+    std::vector<int64_t> generateRademacherVector_parallel(int64_t n) {
+        std::vector<int64_t> rademacherVec(n);
+
+        #pragma omp parallel
+        {
+            // Each thread gets its own random number generator
+            std::random_device rd;
+            std::mt19937 gen(rd());
+            std::bernoulli_distribution bernoulli(0.5);
+
+            #pragma omp for
+            for (int64_t i = 0; i < n; ++i) {
+                rademacherVec[i] = bernoulli(gen) ? 1 : -1;
+            }
+        }
+
+        return rademacherVec;
+    }
+
+    template<typename T>
+    void applyDiagonalRademacher(int64_t rows, int64_t cols, T* A) {
+        std::vector<int64_t> diag = generateRademacherVector_parallel(cols);
+
+        for(int col=0; col < cols; col++) {
+        if(diag[col] > 0)
+            continue;
+        blas::scal(rows, diag[col], &A[col * rows], 1);
+    }
+    }
+
+    template<typename T>
+    void applyDiagonalRademacher(int64_t rows, int64_t cols, T* A, std::vector<int64_t> &diag) {
+        //NOTE: Only considers sketching from the left + ColMajor format as of now
+        for(int col=0; col < cols; col++) {
+        if(diag[col] > 0)
+            continue;
+        blas::scal(rows, diag[col], &A[col * rows], 1);
+    }
+    }
+
+    // `applyDiagonalRademacher`should have a similar API as `sketch_general`
+    // Will be called inside of `lskget`
+    // B <- alpha * diag(Rademacher) * op(A) + beta * B
+    // `alpha`, `beta` aren't needed and shape(A) == shape(B)
+    // lda == ldb
+    template<typename T>
+    void applyDiagonalRademacher(
+        blas::Layout layout,
+        blas::Op opA, // taken from `sketch_general/lskget`
+        int64_t n, // everything is the same size as `op(A)`: m-by-n
+        int64_t m,
+        std::vector<int64_t> &diag,
+        const T* A, // The data matrix that won't be modified
+        // int64_t lda,
+        T* B // The destination matrix
+        // int64_t ldb
+    ) {
+        // B <- alpha * diag(Rademacher) * op(A) + beta * B
+        // `alpha`, `beta` aren't needed
+        // && shape(A) == shape(B) && lda == ldb && shape({A | B}) = m-by-n
+        //NOTE: Use `layout` and `opA` for working with RowMajor data
+        int64_t lda = m;
+        int64_t ldb = m;
+
+        //NOTE: Should the copy be made inside of `applyDiagonalRademacher` or
+        // should it be inside of `lskget`?
+        lapack::lacpy(lapack::MatrixType::General, m, n, A, lda, B, ldb);
+
+        applyDiagonalRademacher(m, n, B, diag);
+    }
+
+
+    // `permuteRowsToTop` should just take in `B` as an input
+    // `B` will be passed in by the user to `sketch_general/lskget` and `permuteRowsToTop` will take in
+    // the `B` that has been modified by `applyRademacherDiagonal`
+    // Will also be called inside of `lskget`
+    template<typename T>
+    void permuteRowsToTop(int rows, int cols, std::vector<int64_t> selectedRows, T* B, int ldb) {
+        //NOTE: There should be a similar `permuteColsToLeft`
+        int top = 0;  // Keeps track of the topmost unselected row
+
+        // Compute `r` internally
+        // int64_t r = 100;
+        // std::vector<int64_t> selectedRows = generateRademacherVector_parallel(r);
+
+        for (int selected : selectedRows) {
+            if (selected != top) {
+                // Use BLAS swap to swap the entire rows
+                // Swapping row 'selected' with row 'top'
+                blas::swap(cols, &B[selected], ldb, &B[top], ldb);
+            }
+            top++;
+        }
+    }
+
+    enum class TrigDistName: char {
+        Fourier = 'F',
+
+        // ---------------------------------------------------------------------------
+
+        Hadamard = 'H'
+    };
+
+    struct TrigDist {
+        const int64_t n_rows;
+        const int64_t n_cols;
+
+        int64_t dim_short;
+        int64_t dim_long;
+
+        const TrigDistName family;
+
+        TrigDist(
+            int64_t n_rows,
+            int64_t n_cols,
+            TrigDistName tn = TrigDistName::Hadamard
+        ) : n_rows(n_rows), n_cols(n_cols), family(tn) {
+            dim_short = MIN(n_rows, n_cols);
+            dim_long = MAX(n_rows, n_cols);
+}
+};
+
+    template<typename T, SignedInteger sint_t, typename RNG = r123::Philox4x32>
+    struct TrigSkOp {
+        using generator = RNG;
+        using state_type = RNGState<RNG>;
+        using buffer_type = T;
+
+        const int64_t n_rows;
+        const int64_t n_cols;
+        int64_t dim_short;
+        int64_t dim_long;
+        // int64_t n_sampled;
+
+        const TrigDist dist;
+
+
+        const RNGState<RNG> seed_state;
+
+        RNGState<RNG> next_state;
+
+        const blas::Layout layout;
+        const bool sketchFromLeft = true;
+        bool known_filled = false;
+
+        T* DiagonalRademacher = nullptr;
+        T* SampledRows = nullptr;
+
+        TrigSkOp(
+            TrigDist dist,
+            RNGState<RNG> const &state,
+            bool known_filled = true
+        ) : n_rows(dist.n_rows), n_cols(dist.n_cols), dist(dist), seed_state(state), known_filled(known_filled), dim_short(dist.dim_short), dim_long(dist.dim_long){
+            // Memory for Rademacher diagonal gets allocated here
+            // Number of rows/cols to be sampled gets decided here
+            // i.e. `n_sampled` gets set
+            if(sketchFromLeft)
+                DiagonalRademacher = new T[n_rows];
+            else
+                DiagonalRademacher = new T[n_cols];
+
+            SampledRows = new sint_t[dim_short];
+
+            //TODO: Logic to compute the number of samples that we require `r`
+            // this can be shown to depend on the maximal row norm of the data matrix
+            //NOTE: Do not have access to this in the data-oblivious regime --- how
+            // do we get access?
+};
+
+        TrigSkOp(
+            TrigDistName family,
+            int64_t n_rows,
+            int64_t n_cols,
+            uint32_t key
+        ) : n_rows(n_rows), n_cols(n_cols), dist(TrigDist{n_rows, n_cols, family}), seed_state() {};
+
+        ~TrigSkOp();
+};
+
+// Populates the Rademacher diagonal entries and decides the rows/cols to
+// be sampled at the end
+template<typename T, typename RNG, SignedInteger sint_t>
+RandBLAS::RNGState<RNG> fill_trig(
+    TrigSkOp<T, RNG> &Tr
+) {
+    /**
+     * Will do the work of filling in the diagonal Rademacher entries
+     * and selecting the rows/cols to be sampled
+     */
+
+    auto [ctr, key] = Tr.seed_state;
+
+    // Fill in the Rademacher diagonal
+    if(Tr.sketchFromLeft)
+        generateRademacherVector_r123(Tr.DiagonalRademacher, key, ctr, Tr.n_rows);
+    else
+        generateRademacherVector_r123(Tr.DiagonalRademacher, key, ctr, Tr.n_cols);
+
+    //NOTE: Select the rows/cols to be sampled --- use the `repeated_fisher_yates` function
+    int64_t r = Tr.dim_short;
+    int64_t d = Tr.dim_long;
+
+    // Output containers
+    std::vector<sint_t> idxs_major(r); // Stores the sampled integers
+    std::vector<sint_t> idxs_minor(r); // Not needed in your case
+    std::vector<T> vals(r); // Not needed in your case, but required by the function
+
+    RandBLAS::repeated_fisher_yates<T, RNG, sint_t>(
+        Tr.seed_state,
+        r,         // Number of samples (vec_nnz)
+        d,         // Total number of elements (dim_major)
+        1,         // Single sample round (dim_minor)
+        Tr.SampledRows,  // Holds the required output
+        idxs_minor.data(),  // Placeholder
+        vals.data()         // Placeholder
+    );
+}
+}
+
+
+namespace RandBLAS::trig {
+
+// Performs the actual application of the fast trigonometric transform
+// Only called after calling `fill_trig`
+template <typename T, typename RNG>
+inline void lskget(
+    blas::Layout layout,
+    blas::Op opS,
+    blas::Op opA,
+    int64_t d, // B is d-by-n
+    int64_t n, // op(A) is m-by-n
+    int64_t m, // op(S) is d-by-m
+    T alpha,
+    TrigSkOp<T, RNG> &Tr,
+    int64_t ro_s,
+    int64_t co_s,
+    const T* A, // data-matrix
+    int64_t lda,
+    T beta,
+    T* B, // output matrix
+    int64_t ldb
+) {
+    if (!Tr.known_filled)
+        fill_trig(Tr);
+
+    // Applying the diagonal transformation
+    applyDiagonalRademacher(layout, opA, n, m, Tr.DiagonalRademacher, A, B);
+    // applyDiagonalRademacher(m, n, A, Tr.DiagonalRademacher);
+
+    //TODO: Apply the Hadamard transform
+
+    //... and finally permute the rows
+    permuteRowsToTop(m, n, Tr.SampledRows, B, ldb);
+}
+}

From 7103a00cd8ba90baabaeace2b6fe68513acb0f4c Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Tue, 1 Oct 2024 10:44:42 -0700
Subject: [PATCH 02/26] Cleaning

---
 RandBLAS/trig_skops.hh | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 896a64f..696798f 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -37,7 +37,6 @@ namespace RandBLAS {
             // Each thread has its own RNG instance
             RNG rng;
 
-            // You can use any seed value for the key
             RNG::key_type k = {{key_seed + omp_get_thread_num()}}; // Unique key for each thread
 
             // Thread-local counter
@@ -263,10 +262,8 @@ RandBLAS::RNGState<RNG> fill_trig(
     int64_t r = Tr.dim_short;
     int64_t d = Tr.dim_long;
 
-    // Output containers
-    std::vector<sint_t> idxs_major(r); // Stores the sampled integers
-    std::vector<sint_t> idxs_minor(r); // Not needed in your case
-    std::vector<T> vals(r); // Not needed in your case, but required by the function
+    std::vector<sint_t> idxs_minor(r); // Placeholder
+    std::vector<T> vals(r); // Placeholder
 
     RandBLAS::repeated_fisher_yates<T, RNG, sint_t>(
         Tr.seed_state,

From 5c1572dde9a75a3c7fb60539f05cfdd7fe473039 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Mon, 7 Oct 2024 13:50:30 -0700
Subject: [PATCH 03/26] Commiting old changes for documenting

---
 RandBLAS/trig_skops.hh | 56 ++++++++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 21 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 696798f..5b6a747 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -27,7 +27,9 @@ namespace RandBLAS {
     /// WARNING: None of the following functions or overloads thereof are part of the
     /// public API
     ///
-    void generateRademacherVector_r123(std::vector<int64_t> &buff, uint32_t key_seed, uint32_t ctr_seed, int n) {
+    template<SignedInteger sint_t>
+    // void generateRademacherVector_r123(std::vector<int64_t> &buff, uint32_t key_seed, uint32_t ctr_seed, int64_t n) {
+    void generateRademacherVector_r123(sint_t* buff, uint32_t key_seed, uint32_t ctr_seed, int64_t n) {
         typedef r123::Philox2x32 RNG;
         // std::vector<int64_t> rademacherVector(n);
 
@@ -90,8 +92,9 @@ namespace RandBLAS {
     }
     }
 
-    template<typename T>
-    void applyDiagonalRademacher(int64_t rows, int64_t cols, T* A, std::vector<int64_t> &diag) {
+    template<typename T, SignedInteger sint_t>
+    // void applyDiagonalRademacher(int64_t rows, int64_t cols, T* A, std::vector<int64_t> &diag) {
+    void applyDiagonalRademacher(int64_t rows, int64_t cols, T* A, sint_t* diag) {
         //NOTE: Only considers sketching from the left + ColMajor format as of now
         for(int col=0; col < cols; col++) {
         if(diag[col] > 0)
@@ -105,13 +108,14 @@ namespace RandBLAS {
     // B <- alpha * diag(Rademacher) * op(A) + beta * B
     // `alpha`, `beta` aren't needed and shape(A) == shape(B)
     // lda == ldb
-    template<typename T>
+    template<typename T, SignedInteger sint_t>
     void applyDiagonalRademacher(
         blas::Layout layout,
         blas::Op opA, // taken from `sketch_general/lskget`
         int64_t n, // everything is the same size as `op(A)`: m-by-n
         int64_t m,
-        std::vector<int64_t> &diag,
+        // std::vector<int64_t> &diag,
+        sint_t* diag,
         const T* A, // The data matrix that won't be modified
         // int64_t lda,
         T* B // The destination matrix
@@ -182,12 +186,15 @@ namespace RandBLAS {
 }
 };
 
-    template<typename T, SignedInteger sint_t, typename RNG = r123::Philox4x32>
+    template<typename T, typename RNG = r123::Philox4x32, SignedInteger sint_t = int64_t>
     struct TrigSkOp {
         using generator = RNG;
         using state_type = RNGState<RNG>;
         using buffer_type = T;
 
+        //TODO: Where should the logic for deciding the size of `H` to use go?
+        // Since that will be accompanied by padding of (DA) maybe it should
+        // go inside of `lskget`?
         const int64_t n_rows;
         const int64_t n_cols;
         int64_t dim_short;
@@ -198,28 +205,28 @@ namespace RandBLAS {
 
 
         const RNGState<RNG> seed_state;
-
         RNGState<RNG> next_state;
 
         const blas::Layout layout;
         const bool sketchFromLeft = true;
         bool known_filled = false;
 
-        T* DiagonalRademacher = nullptr;
-        T* SampledRows = nullptr;
+        sint_t* DiagonalRademacher = nullptr;
+        sint_t* SampledRows = nullptr;
 
         TrigSkOp(
             TrigDist dist,
             RNGState<RNG> const &state,
-            bool known_filled = true
-        ) : n_rows(dist.n_rows), n_cols(dist.n_cols), dist(dist), seed_state(state), known_filled(known_filled), dim_short(dist.dim_short), dim_long(dist.dim_long){
+            blas::Layout layout,
+            bool known_filled = false
+        ) : n_rows(dist.n_rows), n_cols(dist.n_cols), dist(dist), seed_state(state), known_filled(known_filled), dim_short(dist.dim_short), dim_long(dist.dim_long), layout(layout){
             // Memory for Rademacher diagonal gets allocated here
             // Number of rows/cols to be sampled gets decided here
             // i.e. `n_sampled` gets set
             if(sketchFromLeft)
-                DiagonalRademacher = new T[n_rows];
+                DiagonalRademacher = new sint_t[n_rows];
             else
-                DiagonalRademacher = new T[n_cols];
+                DiagonalRademacher = new sint_t[n_cols];
 
             SampledRows = new sint_t[dim_short];
 
@@ -236,27 +243,32 @@ namespace RandBLAS {
             uint32_t key
         ) : n_rows(n_rows), n_cols(n_cols), dist(TrigDist{n_rows, n_cols, family}), seed_state() {};
 
+        //TODO: Write a proper deconstructor
+        //Free up DiagonalRademacher && SampledRows
         ~TrigSkOp();
 };
 
-// Populates the Rademacher diagonal entries and decides the rows/cols to
-// be sampled at the end
+template <typename T, typename RNG, SignedInteger sint_t>
+TrigSkOp<T, RNG, sint_t>::~TrigSkOp() {
+    delete [] this->DiagonalRademacher;
+    delete [] this->SampledRows;
+};
+
 template<typename T, typename RNG, SignedInteger sint_t>
 RandBLAS::RNGState<RNG> fill_trig(
-    TrigSkOp<T, RNG> &Tr
+    TrigSkOp<T, RNG, sint_t> &Tr
 ) {
     /**
      * Will do the work of filling in the diagonal Rademacher entries
      * and selecting the rows/cols to be sampled
      */
-
     auto [ctr, key] = Tr.seed_state;
 
     // Fill in the Rademacher diagonal
     if(Tr.sketchFromLeft)
-        generateRademacherVector_r123(Tr.DiagonalRademacher, key, ctr, Tr.n_rows);
+        generateRademacherVector_r123(Tr.DiagonalRademacher, key[0], ctr[0], Tr.n_rows);
     else
-        generateRademacherVector_r123(Tr.DiagonalRademacher, key, ctr, Tr.n_cols);
+        generateRademacherVector_r123(Tr.DiagonalRademacher, key[0], ctr[0], Tr.n_cols);
 
     //NOTE: Select the rows/cols to be sampled --- use the `repeated_fisher_yates` function
     int64_t r = Tr.dim_short;
@@ -265,7 +277,7 @@ RandBLAS::RNGState<RNG> fill_trig(
     std::vector<sint_t> idxs_minor(r); // Placeholder
     std::vector<T> vals(r); // Placeholder
 
-    RandBLAS::repeated_fisher_yates<T, RNG, sint_t>(
+    Tr.next_state = RandBLAS::repeated_fisher_yates<T, RNG, sint_t>(
         Tr.seed_state,
         r,         // Number of samples (vec_nnz)
         d,         // Total number of elements (dim_major)
@@ -274,6 +286,8 @@ RandBLAS::RNGState<RNG> fill_trig(
         idxs_minor.data(),  // Placeholder
         vals.data()         // Placeholder
     );
+    Tr.known_filled = true;
+    return Tr.next_state;
 }
 }
 
@@ -310,6 +324,6 @@ inline void lskget(
     //TODO: Apply the Hadamard transform
 
     //... and finally permute the rows
-    permuteRowsToTop(m, n, Tr.SampledRows, B, ldb);
+    // permuteRowsToTop(m, n, Tr.SampledRows, B, ldb);
 }
 }

From a60490942f6d48da6026e0c7b0fc61451bb6108b Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Wed, 9 Oct 2024 00:18:23 -0700
Subject: [PATCH 04/26] `A <- (\Pi D) A`, for `ColMajor` A

---
 RandBLAS/trig_skops.hh | 104 +++++++++++++++++++++++++++++++++++------
 1 file changed, 89 insertions(+), 15 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 5b6a747..345604e 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -94,13 +94,26 @@ namespace RandBLAS {
 
     template<typename T, SignedInteger sint_t>
     // void applyDiagonalRademacher(int64_t rows, int64_t cols, T* A, std::vector<int64_t> &diag) {
-    void applyDiagonalRademacher(int64_t rows, int64_t cols, T* A, sint_t* diag) {
-        //NOTE: Only considers sketching from the left + ColMajor format as of now
-        for(int col=0; col < cols; col++) {
-        if(diag[col] > 0)
-            continue;
-        blas::scal(rows, diag[col], &A[col * rows], 1);
-    }
+    void applyDiagonalRademacher(
+                                blas::Layout layout,
+                                int64_t rows,
+                                int64_t cols,
+                                T* A,
+                                sint_t* diag
+                                )
+    {
+        //TODO: Only considers sketching from the left + ColMajor format as of now
+        if(layout == blas::Layout::ColMajor) {
+            // In a `ColMajor` setting we parallelize over columns
+            for(int col=0; col < cols; col++) {
+                if(diag[col] > 0)
+                    continue;
+                blas::scal(rows, diag[col], &A[col * rows], 1);
+            }
+        }
+        else {
+            // In a `RowMajor` setting we vectorize over rows
+        }
     }
 
     // `applyDiagonalRademacher`should have a similar API as `sketch_general`
@@ -140,20 +153,27 @@ namespace RandBLAS {
     // `B` will be passed in by the user to `sketch_general/lskget` and `permuteRowsToTop` will take in
     // the `B` that has been modified by `applyRademacherDiagonal`
     // Will also be called inside of `lskget`
-    template<typename T>
-    void permuteRowsToTop(int rows, int cols, std::vector<int64_t> selectedRows, T* B, int ldb) {
+    template<typename T, SignedInteger sint_t>
+    void permuteRowsToTop(
+                          blas::Layout layout,
+                          int64_t rows,
+                          int64_t cols,
+                          sint_t* selectedRows,
+                          int64_t d, // size of `selectedRows`
+                          T* A
+                          ) {
         //NOTE: There should be a similar `permuteColsToLeft`
         int top = 0;  // Keeps track of the topmost unselected row
 
-        // Compute `r` internally
-        // int64_t r = 100;
-        // std::vector<int64_t> selectedRows = generateRademacherVector_parallel(r);
+        int64_t lda = rows;
+        if(layout == blas::Layout::RowMajor)
+            lda = cols;
 
-        for (int selected : selectedRows) {
-            if (selected != top) {
+        for (int i=0; i < d; i++) {
+            if (selectedRows[i] != top) {
                 // Use BLAS swap to swap the entire rows
                 // Swapping row 'selected' with row 'top'
-                blas::swap(cols, &B[selected], ldb, &B[top], ldb);
+                blas::swap(cols, &A[selectedRows[i]], lda, &A[top], lda);
             }
             top++;
         }
@@ -326,4 +346,58 @@ inline void lskget(
     //... and finally permute the rows
     // permuteRowsToTop(m, n, Tr.SampledRows, B, ldb);
 }
+
+
+/*
+ * These functions apply an in-place, SRHT-like transform to the input matrix
+ * i.e. A <- (\Pi H D)A OR A <- A(D H \Pi) (which is equivalent to A <- A(\Pi H D)^{-1})
+ * layout: Layout of the input matrix (`ColMajor/RowMajor`)
+ * A: (m x n), input dimensions of `A`
+ * d: The number of rows/columns that will be permuted by the action of $\Pi$
+ */
+template <typename T, typename RNG = r123::Philox4x32, SignedInteger sint_t = int64_t>
+inline void lmiget(
+    blas::Layout layout,
+    RandBLAS::RNGState<RNG> random_state,
+    int64_t m, // `A` is `(m x n)`
+    int64_t n,
+    int64_t d, // `d` is the number of rows that have to be permuted by `\Pi`
+    T* A // data-matrix
+)
+{
+    // Size of the Rademacher entries = |A_cols|
+    //TODO: Change `diag` to float/doubles (same data type as the matrix)
+    sint_t* diag = new sint_t[n];
+    sint_t* selected_rows = new sint_t[d];
+
+    auto [ctr, key] = random_state;
+
+    //Step 1: Scale with `D`
+        //Populating `diag`
+    generateRademacherVector_r123(diag, key[0], ctr[0], n);
+    applyDiagonalRademacher(layout, m, n, A, diag);
+
+    //Step 2: Apply the Hadamard transform
+
+    //Step 3: Permute the rows
+    std::vector<sint_t> idxs_minor(d); // Placeholder
+    std::vector<T> vals(d); // Placeholder
+
+        // Populating `selected_rows`
+        //TODO: Do I return this at some point?
+    RandBLAS::RNGState<RNG> next_state = RandBLAS::repeated_fisher_yates<T, RNG, sint_t>(
+        random_state,
+        d,         // Number of samples (vec_nnz)
+        m,         // Total number of elements (dim_major)
+        1,         // Single sample round (dim_minor)
+        selected_rows,  // Holds the required output
+        idxs_minor.data(),  // Placeholder
+        vals.data()         // Placeholder
+    );
+
+    permuteRowsToTop(layout, m, n, selected_rows, d, A);
+
+    free(diag);
+    free(selected_rows);
+}
 }

From 1caaffeae443255a6aa0a9bf817131ccade60c3b Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Wed, 9 Oct 2024 11:04:18 -0700
Subject: [PATCH 05/26] `A <- (\Pi H D) A` for `A: (mxn)` and `ColMajor`

---
 RandBLAS/trig_skops.hh | 44 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 345604e..e548a3b 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -179,6 +179,45 @@ namespace RandBLAS {
         }
     }
 
+    void fht_left_col_major(double *buf, int log_n, int num_rows, int num_cols) {
+        // No Padding of the columns in this implementation,
+        // the #rows must exactly be a power of 2
+        // Padding would be straight-forward to address
+        int n = 1 << log_n;
+        std::cout << n << std::endl;
+
+        // Apply FHT to each column independently
+        for (int col = 0; col < num_cols; ++col) {
+            // Pointer to the beginning of the current column in the Column-Major order
+            double* col_buf = buf + col * num_rows;
+
+            // Apply the original FHT on this column
+            for (int i = 0; i < log_n; ++i) {
+                int s1 = 1 << i;
+                int s2 = s1 << 1;
+                for (int j = 0; j < n; j += s2) {
+                    for (int k = 0; k < s1; ++k) {
+                        // For implicitly padding the input we just have to make sure
+                        // we replace all out-of-bounds accesses with zeros
+                        bool b1 = j + k < num_rows;
+                        bool b2 = j + k + s1 < num_rows;
+                        double u = b1 ? col_buf[j + k] : 0;
+                        double v = b2 ? col_buf[j + k + s1] : 0;
+                        if(b1 && b2) {
+                            col_buf[j + k] = u + v;
+                            col_buf[j + k + s1] = u - v;
+                        }
+                        else if(!b2 && b1) {
+                            col_buf[j + k] = u + v;
+                        }
+                        else if(!b2 && !b1)
+                            continue;
+                    }
+                }
+            }
+        }
+    }
+
     enum class TrigDistName: char {
         Fourier = 'F',
 
@@ -375,9 +414,10 @@ inline void lmiget(
     //Step 1: Scale with `D`
         //Populating `diag`
     generateRademacherVector_r123(diag, key[0], ctr[0], n);
-    applyDiagonalRademacher(layout, m, n, A, diag);
+    // applyDiagonalRademacher(layout, m, n, A, diag);
 
     //Step 2: Apply the Hadamard transform
+    fht_left_col_major(A, std::log2(MAX(m, n)), m, n);
 
     //Step 3: Permute the rows
     std::vector<sint_t> idxs_minor(d); // Placeholder
@@ -395,7 +435,7 @@ inline void lmiget(
         vals.data()         // Placeholder
     );
 
-    permuteRowsToTop(layout, m, n, selected_rows, d, A);
+    // permuteRowsToTop(layout, m, n, selected_rows, d, A);
 
     free(diag);
     free(selected_rows);

From 30c32f313e9c026a0edb37c335408bb846ddaea3 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Wed, 9 Oct 2024 13:45:03 -0700
Subject: [PATCH 06/26] {L|R}`RowMajor`, {R}`RowMajor`, implicit padding

---
 RandBLAS/trig_skops.hh | 91 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 86 insertions(+), 5 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index e548a3b..7c29d27 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -180,11 +180,7 @@ namespace RandBLAS {
     }
 
     void fht_left_col_major(double *buf, int log_n, int num_rows, int num_cols) {
-        // No Padding of the columns in this implementation,
-        // the #rows must exactly be a power of 2
-        // Padding would be straight-forward to address
         int n = 1 << log_n;
-        std::cout << n << std::endl;
 
         // Apply FHT to each column independently
         for (int col = 0; col < num_cols; ++col) {
@@ -218,6 +214,88 @@ namespace RandBLAS {
         }
     }
 
+    void fht_left_row_major(double *buf, int log_n, int num_rows, int num_cols) {
+        int n = 1 << log_n;
+
+        // Apply FHT to each column independently
+        for (int col = 0; col < num_cols; ++col) {
+            // Apply the original FHT on this column
+            for (int i = 0; i < log_n; ++i) {
+                int s1 = 1 << i;
+                int s2 = s1 << 1;
+                for (int j = 0; j < n; j += s2) {
+                    for (int k = 0; k < s1; ++k) {
+                        // For implicitly padding the input we just have to make sure
+                        // we replace all out-of-bounds accesses with zeros
+                        bool b1 = j + k < num_rows;
+                        bool b2 = j + k + s1 < num_rows;
+                        double u = b1 ? buf[(j + k) * num_cols + col] : 0;
+                        double v = b2 ? buf[(j + k + s1) * num_cols + col] : 0;
+                        if(b1 && b2) {
+                            buf[(j + k) * num_cols + col] = u + v;
+                            buf[(j + k + s1) * num_cols + col] = u - v;
+                        }
+                        else if(!b2 && b1) {
+                            buf[(j + k) * num_cols + col] = u + v;
+                        }
+                        else if(!b2 && !b1)
+                            continue;
+                    }
+                }
+            }
+        }
+    }
+
+    void fht_right_row_major(double *buf, int log_n, int num_rows, int num_cols) {
+        int n = 1 << log_n;
+
+        // Apply FHT to each row independently
+        for (int row = 0; row < num_rows; ++row) {
+            // Pointer to the beginning of the current row in RowMajor order
+            double* row_buf = buf + row * num_cols;
+
+            // Apply the original FHT on this row
+            for (int i = 0; i < log_n; ++i) {
+                int s1 = 1 << i;
+                int s2 = s1 << 1;
+                for (int j = 0; j < n; j += s2) {
+                    for (int k = 0; k < s1; ++k) {
+                        // For implicitly padding the input we just have to make sure
+                        // we replace all out-of-bounds accesses with zeros
+                        bool b1 = j + k < num_cols;
+                        bool b2 = j + k + s1 < num_cols;
+                        double u = b1 ? row_buf[j + k] : 0;
+                        double v = b2 ? row_buf[j + k + s1] : 0;
+                        if(b1 && b2) {
+                            row_buf[j + k] = u + v;
+                            row_buf[j + k + s1] = u - v;
+                        }
+                        else if(!b2 && b1) {
+                            row_buf[j + k] = u + v;
+                        }
+                        else if(!b2 && !b1)
+                            continue;
+                    }
+                }
+            }
+        }
+    }
+
+    void fht_right_col_major(double *buf, int log_n, int num_rows, int num_cols) {
+        //TODO:
+    }
+
+    void fht_dispatch(
+        blas::Layout layout,
+        double* buff,
+        int64_t log_n,
+        int64_t num_rows,
+        int64_t num_cols
+        )
+    {
+        //TODO:
+    }
+
     enum class TrigDistName: char {
         Fourier = 'F',
 
@@ -417,7 +495,10 @@ inline void lmiget(
     // applyDiagonalRademacher(layout, m, n, A, diag);
 
     //Step 2: Apply the Hadamard transform
-    fht_left_col_major(A, std::log2(MAX(m, n)), m, n);
+    //TODO: Clean via `fht_dispatch`
+    // fht_left_col_major(A, std::log2(MAX(m, n)), m, n);
+    // fht_right_row_major(A, std::log2(MAX(m, n)), m, n);
+    fht_left_row_major(A, std::log2(MAX(m, n)), m, n);
 
     //Step 3: Permute the rows
     std::vector<sint_t> idxs_minor(d); // Placeholder

From e6013094aa9758d9ce527a8d5c3e2f44ab7cde00 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Wed, 9 Oct 2024 18:03:46 -0700
Subject: [PATCH 07/26] All variations of $H_n$ implemented

---
 RandBLAS/trig_skops.hh | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 7c29d27..56c113d 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -282,7 +282,35 @@ namespace RandBLAS {
     }
 
     void fht_right_col_major(double *buf, int log_n, int num_rows, int num_cols) {
-        //TODO:
+        int n = 1 << log_n;
+
+        // Apply FHT to each row independently
+        for (int row= 0; row < num_rows; ++row) {
+            // Apply the original FHT on this column
+            for (int i = 0; i < log_n; ++i) {
+                int s1 = 1 << i;
+                int s2 = s1 << 1;
+                for (int j = 0; j < n; j += s2) {
+                    for (int k = 0; k < s1; ++k) {
+                        // For implicitly padding the input we just have to make sure
+                        // we replace all out-of-bounds accesses with zeros
+                        bool b1 = j + k < num_cols;
+                        bool b2 = j + k + s1 < num_cols;
+                        double u = b1 ? buf[(j + k) * num_rows + row] : 0;
+                        double v = b2 ? buf[(j + k + s1) * num_rows + row] : 0;
+                        if(b1 && b2) {
+                            buf[(j + k) * num_rows + row] = u + v;
+                            buf[(j + k + s1) * num_rows + row] = u - v;
+                        }
+                        else if(!b2 && b1) {
+                            buf[(j + k) * num_rows + row] = u + v;
+                        }
+                        else if(!b2 && !b1)
+                            continue;
+                    }
+                }
+            }
+        }
     }
 
     void fht_dispatch(
@@ -498,7 +526,8 @@ inline void lmiget(
     //TODO: Clean via `fht_dispatch`
     // fht_left_col_major(A, std::log2(MAX(m, n)), m, n);
     // fht_right_row_major(A, std::log2(MAX(m, n)), m, n);
-    fht_left_row_major(A, std::log2(MAX(m, n)), m, n);
+    // fht_left_row_major(A, std::log2(MAX(m, n)), m, n);
+    fht_right_col_major(A, std::log2(MAX(m, n)), m, n);
 
     //Step 3: Permute the rows
     std::vector<sint_t> idxs_minor(d); // Placeholder

From 7b8b13457ac5ae89b53ea4560719b85ecca7f572 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Wed, 9 Oct 2024 18:47:35 -0700
Subject: [PATCH 08/26] Cleaned code with `fht_dispatch`

---
 RandBLAS/trig_skops.hh | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 56c113d..3466a36 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -314,6 +314,7 @@ namespace RandBLAS {
     }
 
     void fht_dispatch(
+        bool left,
         blas::Layout layout,
         double* buff,
         int64_t log_n,
@@ -322,6 +323,14 @@ namespace RandBLAS {
         )
     {
         //TODO:
+        if(left && layout == blas::Layout::ColMajor)
+            fht_left_col_major(buff, log_n, num_rows, num_cols);
+        else if(left && layout == blas::Layout::RowMajor)
+            fht_left_row_major(buff, log_n, num_rows, num_cols);
+        else if(!left && layout == blas::Layout::ColMajor)
+            fht_right_col_major(buff, log_n, num_rows, num_cols);
+        else
+            fht_right_row_major(buff, log_n, num_rows, num_cols);
     }
 
     enum class TrigDistName: char {
@@ -523,11 +532,7 @@ inline void lmiget(
     // applyDiagonalRademacher(layout, m, n, A, diag);
 
     //Step 2: Apply the Hadamard transform
-    //TODO: Clean via `fht_dispatch`
-    // fht_left_col_major(A, std::log2(MAX(m, n)), m, n);
-    // fht_right_row_major(A, std::log2(MAX(m, n)), m, n);
-    // fht_left_row_major(A, std::log2(MAX(m, n)), m, n);
-    fht_right_col_major(A, std::log2(MAX(m, n)), m, n);
+    fht_dispatch(true, layout, A, std::log2(MAX(m, n)), m, n);
 
     //Step 3: Permute the rows
     std::vector<sint_t> idxs_minor(d); // Placeholder

From 5f72995ffc65d18ef3bb3d3fe885878d3ad29601 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Thu, 10 Oct 2024 09:40:47 -0700
Subject: [PATCH 09/26] Generalizing `applyDiagonalRademacher`

---
 RandBLAS/trig_skops.hh | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 3466a36..5fd62d9 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -93,8 +93,8 @@ namespace RandBLAS {
     }
 
     template<typename T, SignedInteger sint_t>
-    // void applyDiagonalRademacher(int64_t rows, int64_t cols, T* A, std::vector<int64_t> &diag) {
     void applyDiagonalRademacher(
+                                bool left,
                                 blas::Layout layout,
                                 int64_t rows,
                                 int64_t cols,
@@ -102,17 +102,33 @@ namespace RandBLAS {
                                 sint_t* diag
                                 )
     {
-        //TODO: Only considers sketching from the left + ColMajor format as of now
-        if(layout == blas::Layout::ColMajor) {
-            // In a `ColMajor` setting we parallelize over columns
+        if(left && layout == blas::Layout::ColMajor) {
             for(int col=0; col < cols; col++) {
                 if(diag[col] > 0)
                     continue;
                 blas::scal(rows, diag[col], &A[col * rows], 1);
             }
         }
+        else if(left && layout == blas::Layout::RowMajor) {
+            for(int col=0; col < cols; col++) {
+                if(diag[col] > 0)
+                    continue;
+                blas::scal(rows, diag[col], &A[col], cols);
+            }
+        }
+        else if(!left && layout == blas::Layout::ColMajor) {
+            for(int row = 0; row < rows; row++) {
+                if(diag[row] > 0)
+                    continue;
+                blas::scal(cols, diag[row], &A[row], rows);
+            }
+        }
         else {
-            // In a `RowMajor` setting we vectorize over rows
+            for(int row = 0; row < rows; row++) {
+                if(diag[row] > 0)
+                    continue;
+                blas::scal(cols, diag[row], &A[row * cols], 1);
+            }
         }
     }
 
@@ -155,6 +171,7 @@ namespace RandBLAS {
     // Will also be called inside of `lskget`
     template<typename T, SignedInteger sint_t>
     void permuteRowsToTop(
+                          bool left,
                           blas::Layout layout,
                           int64_t rows,
                           int64_t cols,
@@ -162,7 +179,8 @@ namespace RandBLAS {
                           int64_t d, // size of `selectedRows`
                           T* A
                           ) {
-        //NOTE: There should be a similar `permuteColsToLeft`
+        //NOTE: There should be a similar `permuteColsToLeft` for sketching
+        //from the right
         int top = 0;  // Keeps track of the topmost unselected row
 
         int64_t lda = rows;
@@ -528,8 +546,8 @@ inline void lmiget(
 
     //Step 1: Scale with `D`
         //Populating `diag`
-    generateRademacherVector_r123(diag, key[0], ctr[0], n);
-    // applyDiagonalRademacher(layout, m, n, A, diag);
+    generateRademacherVector_r123(diag, key[3], ctr[0], n);
+    applyDiagonalRademacher(true, layout, m, n, A, diag);
 
     //Step 2: Apply the Hadamard transform
     fht_dispatch(true, layout, A, std::log2(MAX(m, n)), m, n);

From 3bd236100eb4aeb4831e5334597234c711c161cb Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Thu, 10 Oct 2024 10:01:41 -0700
Subject: [PATCH 10/26] Generalized `permuteRowsToTop`

---
 RandBLAS/trig_skops.hh | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 5fd62d9..f0a4909 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -171,7 +171,6 @@ namespace RandBLAS {
     // Will also be called inside of `lskget`
     template<typename T, SignedInteger sint_t>
     void permuteRowsToTop(
-                          bool left,
                           blas::Layout layout,
                           int64_t rows,
                           int64_t cols,
@@ -179,21 +178,28 @@ namespace RandBLAS {
                           int64_t d, // size of `selectedRows`
                           T* A
                           ) {
-        //NOTE: There should be a similar `permuteColsToLeft` for sketching
+        //TODO: There should be a similar `permuteColsToLeft` for sketching
         //from the right
-        int top = 0;  // Keeps track of the topmost unselected row
-
-        int64_t lda = rows;
-        if(layout == blas::Layout::RowMajor)
-            lda = cols;
-
-        for (int i=0; i < d; i++) {
-            if (selectedRows[i] != top) {
-                // Use BLAS swap to swap the entire rows
-                // Swapping row 'selected' with row 'top'
-                blas::swap(cols, &A[selectedRows[i]], lda, &A[top], lda);
+        int64_t top = 0;  // Keeps track of the topmost unselected row
+
+        if(layout == blas::Layout::ColMajor) {
+            for (int64_t i=0; i < d; i++) {
+                if (selectedRows[i] != top) {
+                    // Use BLAS swap to swap the entire rows
+                    // Swapping row 'selected' with row 'top'
+                    blas::swap(cols, &A[selectedRows[i]], rows, &A[top], rows);
+                }
+                top++;
+            }
+        }
+        else {
+            // For `RowMajor` ordering
+            for (int64_t i=0; i < d; i++) {
+                if (selectedRows[i] != top) {
+                    blas::swap(cols, &A[cols * selectedRows[i]], 1, &A[cols * top], 1);
+                }
+                top++;
             }
-            top++;
         }
     }
 
@@ -568,7 +574,7 @@ inline void lmiget(
         vals.data()         // Placeholder
     );
 
-    // permuteRowsToTop(layout, m, n, selected_rows, d, A);
+    permuteRowsToTop(layout, m, n, selected_rows, d, A);
 
     free(diag);
     free(selected_rows);

From eec181366441fc5f7cb63359cfebcd01e2e88ef5 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Thu, 10 Oct 2024 10:17:08 -0700
Subject: [PATCH 11/26] Added `permuteColsToLeft` && `rmiget`

---
 RandBLAS/trig_skops.hh | 83 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 2 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index f0a4909..ec6c8a3 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -203,6 +203,38 @@ namespace RandBLAS {
         }
     }
 
+    template<typename T, SignedInteger sint_t>
+    void permuteColsToLeft(
+                          blas::Layout layout,
+                          int64_t rows,
+                          int64_t cols,
+                          sint_t* selectedCols,
+                          int64_t d, // size of `selectedRows`
+                          T* A
+                          ) {
+        int64_t left = 0;  // Keeps track of the topmost unselected column
+
+        if(layout == blas::Layout::ColMajor) {
+            for (int64_t i=0; i < d; i++) {
+                if (selectedCols[i] != left) {
+                    // Use BLAS::swap to swap entire columns at once
+                    // Swapping col 'selected' with col 'top'
+                    blas::swap(rows, &A[rows * selectedCols[i]], 1, &A[rows * left], 1);
+                }
+                left++;
+            }
+        }
+        else {
+            // For `RowMajor` ordering
+            for (int64_t i=0; i < d; i++) {
+                if (selectedCols[i] != left) {
+                    blas::swap(rows, &A[selectedCols[i]], cols, &A[left], cols);
+                }
+                left++;
+            }
+        }
+    }
+
     void fht_left_col_major(double *buf, int log_n, int num_rows, int num_cols) {
         int n = 1 << log_n;
 
@@ -346,7 +378,6 @@ namespace RandBLAS {
         int64_t num_cols
         )
     {
-        //TODO:
         if(left && layout == blas::Layout::ColMajor)
             fht_left_col_major(buff, log_n, num_rows, num_cols);
         else if(left && layout == blas::Layout::RowMajor)
@@ -552,7 +583,7 @@ inline void lmiget(
 
     //Step 1: Scale with `D`
         //Populating `diag`
-    generateRademacherVector_r123(diag, key[3], ctr[0], n);
+    generateRademacherVector_r123(diag, key[0], ctr[0], n);
     applyDiagonalRademacher(true, layout, m, n, A, diag);
 
     //Step 2: Apply the Hadamard transform
@@ -579,4 +610,52 @@ inline void lmiget(
     free(diag);
     free(selected_rows);
 }
+
+
+template <typename T, typename RNG = r123::Philox4x32, SignedInteger sint_t = int64_t>
+inline void rmiget(
+    blas::Layout layout,
+    RandBLAS::RNGState<RNG> random_state,
+    int64_t m, // `A` is `(m x n)`
+    int64_t n,
+    int64_t d, // `d` is the number of cols that have to be permuted by `\Pi`
+    T* A // data-matrix
+)
+{
+    // Size of the Rademacher entries = |A_cols|
+    //TODO: Change `diag` to float/doubles (same data type as the matrix)
+    sint_t* diag = new sint_t[m];
+    sint_t* selected_cols = new sint_t[d];
+
+    auto [ctr, key] = random_state;
+
+    //Step 1: Scale with `D`
+        //Populating `diag`
+    generateRademacherVector_r123(diag, key[0], ctr[0], n);
+    applyDiagonalRademacher(false, layout, m, n, A, diag);
+
+    //Step 2: Apply the Hadamard transform
+    fht_dispatch(false, layout, A, std::log2(MAX(m, n)), m, n);
+
+    //Step 3: Permute the rows
+    std::vector<sint_t> idxs_minor(d); // Placeholder
+    std::vector<T> vals(d); // Placeholder
+
+        // Populating `selected_rows`
+        //TODO: Do I return this at some point?
+    RandBLAS::RNGState<RNG> next_state = RandBLAS::repeated_fisher_yates<T, RNG, sint_t>(
+        random_state,
+        d,         // Number of samples (vec_nnz)
+        m,         // Total number of elements (dim_major)
+        1,         // Single sample round (dim_minor)
+        selected_cols,  // Holds the required output
+        idxs_minor.data(),  // Placeholder
+        vals.data()         // Placeholder
+    );
+
+    permuteColsToLeft(layout, m, n, selected_cols, d, A);
+
+    free(diag);
+    free(selected_cols);
+}
 }

From 10470bb8f88d400f70f6db7bb431c6ffcd11be4a Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Thu, 10 Oct 2024 15:23:04 -0700
Subject: [PATCH 12/26] Removed old `TrigSkOp` code

---
 RandBLAS/trig_skops.hh | 172 +----------------------------------------
 1 file changed, 2 insertions(+), 170 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index ec6c8a3..6461360 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -28,10 +28,8 @@ namespace RandBLAS {
     /// public API
     ///
     template<SignedInteger sint_t>
-    // void generateRademacherVector_r123(std::vector<int64_t> &buff, uint32_t key_seed, uint32_t ctr_seed, int64_t n) {
     void generateRademacherVector_r123(sint_t* buff, uint32_t key_seed, uint32_t ctr_seed, int64_t n) {
         typedef r123::Philox2x32 RNG;
-        // std::vector<int64_t> rademacherVector(n);
 
         // Use OpenMP to parallelize the Rademacher vector generation
         #pragma omp parallel
@@ -387,176 +385,10 @@ namespace RandBLAS {
         else
             fht_right_row_major(buff, log_n, num_rows, num_cols);
     }
-
-    enum class TrigDistName: char {
-        Fourier = 'F',
-
-        // ---------------------------------------------------------------------------
-
-        Hadamard = 'H'
-    };
-
-    struct TrigDist {
-        const int64_t n_rows;
-        const int64_t n_cols;
-
-        int64_t dim_short;
-        int64_t dim_long;
-
-        const TrigDistName family;
-
-        TrigDist(
-            int64_t n_rows,
-            int64_t n_cols,
-            TrigDistName tn = TrigDistName::Hadamard
-        ) : n_rows(n_rows), n_cols(n_cols), family(tn) {
-            dim_short = MIN(n_rows, n_cols);
-            dim_long = MAX(n_rows, n_cols);
-}
-};
-
-    template<typename T, typename RNG = r123::Philox4x32, SignedInteger sint_t = int64_t>
-    struct TrigSkOp {
-        using generator = RNG;
-        using state_type = RNGState<RNG>;
-        using buffer_type = T;
-
-        //TODO: Where should the logic for deciding the size of `H` to use go?
-        // Since that will be accompanied by padding of (DA) maybe it should
-        // go inside of `lskget`?
-        const int64_t n_rows;
-        const int64_t n_cols;
-        int64_t dim_short;
-        int64_t dim_long;
-        // int64_t n_sampled;
-
-        const TrigDist dist;
-
-
-        const RNGState<RNG> seed_state;
-        RNGState<RNG> next_state;
-
-        const blas::Layout layout;
-        const bool sketchFromLeft = true;
-        bool known_filled = false;
-
-        sint_t* DiagonalRademacher = nullptr;
-        sint_t* SampledRows = nullptr;
-
-        TrigSkOp(
-            TrigDist dist,
-            RNGState<RNG> const &state,
-            blas::Layout layout,
-            bool known_filled = false
-        ) : n_rows(dist.n_rows), n_cols(dist.n_cols), dist(dist), seed_state(state), known_filled(known_filled), dim_short(dist.dim_short), dim_long(dist.dim_long), layout(layout){
-            // Memory for Rademacher diagonal gets allocated here
-            // Number of rows/cols to be sampled gets decided here
-            // i.e. `n_sampled` gets set
-            if(sketchFromLeft)
-                DiagonalRademacher = new sint_t[n_rows];
-            else
-                DiagonalRademacher = new sint_t[n_cols];
-
-            SampledRows = new sint_t[dim_short];
-
-            //TODO: Logic to compute the number of samples that we require `r`
-            // this can be shown to depend on the maximal row norm of the data matrix
-            //NOTE: Do not have access to this in the data-oblivious regime --- how
-            // do we get access?
-};
-
-        TrigSkOp(
-            TrigDistName family,
-            int64_t n_rows,
-            int64_t n_cols,
-            uint32_t key
-        ) : n_rows(n_rows), n_cols(n_cols), dist(TrigDist{n_rows, n_cols, family}), seed_state() {};
-
-        //TODO: Write a proper deconstructor
-        //Free up DiagonalRademacher && SampledRows
-        ~TrigSkOp();
-};
-
-template <typename T, typename RNG, SignedInteger sint_t>
-TrigSkOp<T, RNG, sint_t>::~TrigSkOp() {
-    delete [] this->DiagonalRademacher;
-    delete [] this->SampledRows;
-};
-
-template<typename T, typename RNG, SignedInteger sint_t>
-RandBLAS::RNGState<RNG> fill_trig(
-    TrigSkOp<T, RNG, sint_t> &Tr
-) {
-    /**
-     * Will do the work of filling in the diagonal Rademacher entries
-     * and selecting the rows/cols to be sampled
-     */
-    auto [ctr, key] = Tr.seed_state;
-
-    // Fill in the Rademacher diagonal
-    if(Tr.sketchFromLeft)
-        generateRademacherVector_r123(Tr.DiagonalRademacher, key[0], ctr[0], Tr.n_rows);
-    else
-        generateRademacherVector_r123(Tr.DiagonalRademacher, key[0], ctr[0], Tr.n_cols);
-
-    //NOTE: Select the rows/cols to be sampled --- use the `repeated_fisher_yates` function
-    int64_t r = Tr.dim_short;
-    int64_t d = Tr.dim_long;
-
-    std::vector<sint_t> idxs_minor(r); // Placeholder
-    std::vector<T> vals(r); // Placeholder
-
-    Tr.next_state = RandBLAS::repeated_fisher_yates<T, RNG, sint_t>(
-        Tr.seed_state,
-        r,         // Number of samples (vec_nnz)
-        d,         // Total number of elements (dim_major)
-        1,         // Single sample round (dim_minor)
-        Tr.SampledRows,  // Holds the required output
-        idxs_minor.data(),  // Placeholder
-        vals.data()         // Placeholder
-    );
-    Tr.known_filled = true;
-    return Tr.next_state;
-}
 }
 
 
 namespace RandBLAS::trig {
-
-// Performs the actual application of the fast trigonometric transform
-// Only called after calling `fill_trig`
-template <typename T, typename RNG>
-inline void lskget(
-    blas::Layout layout,
-    blas::Op opS,
-    blas::Op opA,
-    int64_t d, // B is d-by-n
-    int64_t n, // op(A) is m-by-n
-    int64_t m, // op(S) is d-by-m
-    T alpha,
-    TrigSkOp<T, RNG> &Tr,
-    int64_t ro_s,
-    int64_t co_s,
-    const T* A, // data-matrix
-    int64_t lda,
-    T beta,
-    T* B, // output matrix
-    int64_t ldb
-) {
-    if (!Tr.known_filled)
-        fill_trig(Tr);
-
-    // Applying the diagonal transformation
-    applyDiagonalRademacher(layout, opA, n, m, Tr.DiagonalRademacher, A, B);
-    // applyDiagonalRademacher(m, n, A, Tr.DiagonalRademacher);
-
-    //TODO: Apply the Hadamard transform
-
-    //... and finally permute the rows
-    // permuteRowsToTop(m, n, Tr.SampledRows, B, ldb);
-}
-
-
 /*
  * These functions apply an in-place, SRHT-like transform to the input matrix
  * i.e. A <- (\Pi H D)A OR A <- A(D H \Pi) (which is equivalent to A <- A(\Pi H D)^{-1})
@@ -593,7 +425,7 @@ inline void lmiget(
     std::vector<sint_t> idxs_minor(d); // Placeholder
     std::vector<T> vals(d); // Placeholder
 
-        // Populating `selected_rows`
+    // Populating `selected_rows`
         //TODO: Do I return this at some point?
     RandBLAS::RNGState<RNG> next_state = RandBLAS::repeated_fisher_yates<T, RNG, sint_t>(
         random_state,
@@ -641,7 +473,7 @@ inline void rmiget(
     std::vector<sint_t> idxs_minor(d); // Placeholder
     std::vector<T> vals(d); // Placeholder
 
-        // Populating `selected_rows`
+    // Populating `selected_rows`
         //TODO: Do I return this at some point?
     RandBLAS::RNGState<RNG> next_state = RandBLAS::repeated_fisher_yates<T, RNG, sint_t>(
         random_state,

From b2aabf8ffd48624ae67cc9726634f1fb9f8b2616 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Sat, 12 Oct 2024 15:03:38 -0700
Subject: [PATCH 13/26] First review

---
 RandBLAS/trig_skops.hh | 126 +++++++++++------------------------------
 1 file changed, 34 insertions(+), 92 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 6461360..a07af5c 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -6,14 +6,12 @@
 #include <Random123/philox.h>
 #include <blas.hh>
 #include <lapack.hh>
-#include <omp.h>
 
 #include <iostream>
 #include <stdio.h>
 #include <stdexcept>
 #include <string>
 #include <tuple>
-#include <random>
 
 #include <math.h>
 #include <typeinfo>
@@ -27,69 +25,52 @@ namespace RandBLAS {
     /// WARNING: None of the following functions or overloads thereof are part of the
     /// public API
     ///
-    template<SignedInteger sint_t>
-    void generateRademacherVector_r123(sint_t* buff, uint32_t key_seed, uint32_t ctr_seed, int64_t n) {
-        typedef r123::Philox2x32 RNG;
-
-        // Use OpenMP to parallelize the Rademacher vector generation
-        #pragma omp parallel
-        {
-            // Each thread has its own RNG instance
-            RNG rng;
 
-            RNG::key_type k = {{key_seed + omp_get_thread_num()}}; // Unique key for each thread
+    // Generates a vector of Rademacher entries using the Random123 library
+    template<SignedInteger sint_t, typename RNG = r123::Philox4x32>
+    void generateRademacherVector_r123(sint_t* buff, uint32_t key_seed, uint32_t ctr_seed, int64_t n) {
+        RNG rng;
 
-            // Thread-local counter
-            RNG::ctr_type c;
+        typename RNG::ctr_type c;
+        typename RNG::key_type key = {{key_seed}};
 
-            // Parallel for loop
-            #pragma omp for
-            for (int i = 0; i < n; ++i) {
-                // Set the counter for each random number (unique per thread)
-                c[0] = ctr_seed + i; // Ensure the counter is unique across threads
+        // Sequential loop for generating Rademacher entries
+        for (int i = 0; i < n; ++i) {
+            // Set the counter for each random number
+            c[0] = ctr_seed + i;  // Ensure each counter is unique
 
-                // Generate a 2x32-bit random number using the Philox generator
-                RNG::ctr_type r = rng(c, k);
+            // Generate a 2x32-bit random number using the Philox generator
+            typename RNG::ctr_type r = rng(c, key);
 
-                // Convert the random number into a float in [0, 1) using u01fixedpt
-                float randValue = r123::u01fixedpt<float>(r.v[0]);
+            // Convert the random number into a float in [0, 1) using u01fixedpt
+            float randValue = r123::u01fixedpt<float>(r.v[0]);
 
-                // Convert the float into a Rademacher entry (-1 or 1)
-                buff[i] = randValue < 0.5 ? -1 : 1;
-            }
+            // Convert the float into a Rademacher entry (-1 or 1)
+            buff[i] = randValue < 0.5 ? -1 : 1;
         }
     }
 
-    std::vector<int64_t> generateRademacherVector_parallel(int64_t n) {
-        std::vector<int64_t> rademacherVec(n);
+    template<SignedInteger sint_t, typename RNG = r123::Philox4x32>
+    RandBLAS::RNGState<RNG> generateRademacherVector_r123(sint_t* buff, int64_t n, RandBLAS::RNGState<RNG> seed_state) {
+        RNG rng;
+        auto [ctr, key] = seed_state;
 
-        #pragma omp parallel
-        {
-            // Each thread gets its own random number generator
-            std::random_device rd;
-            std::mt19937 gen(rd());
-            std::bernoulli_distribution bernoulli(0.5);
+        for (int64_t i = 0; i < n; ++i) {
+            typename RNG::ctr_type r = rng(ctr, key);
 
-            #pragma omp for
-            for (int64_t i = 0; i < n; ++i) {
-                rademacherVec[i] = bernoulli(gen) ? 1 : -1;
-            }
-        }
+            float randValue = r123::u01fixedpt<float>(r.v[0]);
 
-        return rademacherVec;
-    }
+            buff[i] = randValue < 0.5 ? -1 : 1;
 
-    template<typename T>
-    void applyDiagonalRademacher(int64_t rows, int64_t cols, T* A) {
-        std::vector<int64_t> diag = generateRademacherVector_parallel(cols);
+            ctr.incr();
+        }
 
-        for(int col=0; col < cols; col++) {
-        if(diag[col] > 0)
-            continue;
-        blas::scal(rows, diag[col], &A[col * rows], 1);
-    }
+        // Return the updated RNGState (with the incremented counter)
+        return RandBLAS::RNGState<RNG> {ctr, key};
     }
 
+    // Catch-all method for applying the diagonal Rademacher
+    // entries in-place to an input matrix, `A`
     template<typename T, SignedInteger sint_t>
     void applyDiagonalRademacher(
                                 bool left,
@@ -98,8 +79,9 @@ namespace RandBLAS {
                                 int64_t cols,
                                 T* A,
                                 sint_t* diag
-                                )
-    {
+                                ) {
+        //TODO: Investigate better schemes for performing the scaling
+        //TODO: Move to `RandBLAS/util.hh`
         if(left && layout == blas::Layout::ColMajor) {
             for(int col=0; col < cols; col++) {
                 if(diag[col] > 0)
@@ -130,43 +112,6 @@ namespace RandBLAS {
         }
     }
 
-    // `applyDiagonalRademacher`should have a similar API as `sketch_general`
-    // Will be called inside of `lskget`
-    // B <- alpha * diag(Rademacher) * op(A) + beta * B
-    // `alpha`, `beta` aren't needed and shape(A) == shape(B)
-    // lda == ldb
-    template<typename T, SignedInteger sint_t>
-    void applyDiagonalRademacher(
-        blas::Layout layout,
-        blas::Op opA, // taken from `sketch_general/lskget`
-        int64_t n, // everything is the same size as `op(A)`: m-by-n
-        int64_t m,
-        // std::vector<int64_t> &diag,
-        sint_t* diag,
-        const T* A, // The data matrix that won't be modified
-        // int64_t lda,
-        T* B // The destination matrix
-        // int64_t ldb
-    ) {
-        // B <- alpha * diag(Rademacher) * op(A) + beta * B
-        // `alpha`, `beta` aren't needed
-        // && shape(A) == shape(B) && lda == ldb && shape({A | B}) = m-by-n
-        //NOTE: Use `layout` and `opA` for working with RowMajor data
-        int64_t lda = m;
-        int64_t ldb = m;
-
-        //NOTE: Should the copy be made inside of `applyDiagonalRademacher` or
-        // should it be inside of `lskget`?
-        lapack::lacpy(lapack::MatrixType::General, m, n, A, lda, B, ldb);
-
-        applyDiagonalRademacher(m, n, B, diag);
-    }
-
-
-    // `permuteRowsToTop` should just take in `B` as an input
-    // `B` will be passed in by the user to `sketch_general/lskget` and `permuteRowsToTop` will take in
-    // the `B` that has been modified by `applyRademacherDiagonal`
-    // Will also be called inside of `lskget`
     template<typename T, SignedInteger sint_t>
     void permuteRowsToTop(
                           blas::Layout layout,
@@ -176,8 +121,6 @@ namespace RandBLAS {
                           int64_t d, // size of `selectedRows`
                           T* A
                           ) {
-        //TODO: There should be a similar `permuteColsToLeft` for sketching
-        //from the right
         int64_t top = 0;  // Keeps track of the topmost unselected row
 
         if(layout == blas::Layout::ColMajor) {
@@ -404,8 +347,7 @@ inline void lmiget(
     int64_t n,
     int64_t d, // `d` is the number of rows that have to be permuted by `\Pi`
     T* A // data-matrix
-)
-{
+) {
     // Size of the Rademacher entries = |A_cols|
     //TODO: Change `diag` to float/doubles (same data type as the matrix)
     sint_t* diag = new sint_t[n];

From bc33741fe17f8dfa00353889386a708eaeb0be80 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Sat, 12 Oct 2024 17:10:12 -0700
Subject: [PATCH 14/26] Some stylistic cleanups

---
 RandBLAS/trig_skops.hh | 44 +++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index a07af5c..8b83cee 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -28,7 +28,7 @@ namespace RandBLAS {
 
     // Generates a vector of Rademacher entries using the Random123 library
     template<SignedInteger sint_t, typename RNG = r123::Philox4x32>
-    void generateRademacherVector_r123(sint_t* buff, uint32_t key_seed, uint32_t ctr_seed, int64_t n) {
+    void generate_rademacher_vector_r123(sint_t* buff, uint32_t key_seed, uint32_t ctr_seed, int64_t n) {
         RNG rng;
 
         typename RNG::ctr_type c;
@@ -43,24 +43,24 @@ namespace RandBLAS {
             typename RNG::ctr_type r = rng(c, key);
 
             // Convert the random number into a float in [0, 1) using u01fixedpt
-            float randValue = r123::u01fixedpt<float>(r.v[0]);
+            float rand_value = r123::u01fixedpt<float>(r.v[0]);
 
             // Convert the float into a Rademacher entry (-1 or 1)
-            buff[i] = randValue < 0.5 ? -1 : 1;
+            buff[i] = rand_value < 0.5 ? -1 : 1;
         }
     }
 
     template<SignedInteger sint_t, typename RNG = r123::Philox4x32>
-    RandBLAS::RNGState<RNG> generateRademacherVector_r123(sint_t* buff, int64_t n, RandBLAS::RNGState<RNG> seed_state) {
+    RandBLAS::RNGState<RNG> generate_rademacher_vector_r123(sint_t* buff, int64_t n, RandBLAS::RNGState<RNG> seed_state) {
         RNG rng;
         auto [ctr, key] = seed_state;
 
         for (int64_t i = 0; i < n; ++i) {
             typename RNG::ctr_type r = rng(ctr, key);
 
-            float randValue = r123::u01fixedpt<float>(r.v[0]);
+            float rand_value = r123::u01fixedpt<float>(r.v[0]);
 
-            buff[i] = randValue < 0.5 ? -1 : 1;
+            buff[i] = rand_value < 0.5 ? -1 : 1;
 
             ctr.incr();
         }
@@ -72,7 +72,7 @@ namespace RandBLAS {
     // Catch-all method for applying the diagonal Rademacher
     // entries in-place to an input matrix, `A`
     template<typename T, SignedInteger sint_t>
-    void applyDiagonalRademacher(
+    void apply_diagonal_rademacher(
                                 bool left,
                                 blas::Layout layout,
                                 int64_t rows,
@@ -117,18 +117,18 @@ namespace RandBLAS {
                           blas::Layout layout,
                           int64_t rows,
                           int64_t cols,
-                          sint_t* selectedRows,
-                          int64_t d, // size of `selectedRows`
+                          sint_t* selected_rows,
+                          int64_t d, // size of `selected_rows`
                           T* A
                           ) {
         int64_t top = 0;  // Keeps track of the topmost unselected row
 
         if(layout == blas::Layout::ColMajor) {
             for (int64_t i=0; i < d; i++) {
-                if (selectedRows[i] != top) {
+                if (selected_rows[i] != top) {
                     // Use BLAS swap to swap the entire rows
                     // Swapping row 'selected' with row 'top'
-                    blas::swap(cols, &A[selectedRows[i]], rows, &A[top], rows);
+                    blas::swap(cols, &A[selected_rows[i]], rows, &A[top], rows);
                 }
                 top++;
             }
@@ -136,8 +136,8 @@ namespace RandBLAS {
         else {
             // For `RowMajor` ordering
             for (int64_t i=0; i < d; i++) {
-                if (selectedRows[i] != top) {
-                    blas::swap(cols, &A[cols * selectedRows[i]], 1, &A[cols * top], 1);
+                if (selected_rows[i] != top) {
+                    blas::swap(cols, &A[cols * selected_rows[i]], 1, &A[cols * top], 1);
                 }
                 top++;
             }
@@ -149,7 +149,7 @@ namespace RandBLAS {
                           blas::Layout layout,
                           int64_t rows,
                           int64_t cols,
-                          sint_t* selectedCols,
+                          sint_t* selected_cols,
                           int64_t d, // size of `selectedRows`
                           T* A
                           ) {
@@ -157,10 +157,10 @@ namespace RandBLAS {
 
         if(layout == blas::Layout::ColMajor) {
             for (int64_t i=0; i < d; i++) {
-                if (selectedCols[i] != left) {
+                if (selected_cols[i] != left) {
                     // Use BLAS::swap to swap entire columns at once
                     // Swapping col 'selected' with col 'top'
-                    blas::swap(rows, &A[rows * selectedCols[i]], 1, &A[rows * left], 1);
+                    blas::swap(rows, &A[rows * selected_cols[i]], 1, &A[rows * left], 1);
                 }
                 left++;
             }
@@ -168,8 +168,8 @@ namespace RandBLAS {
         else {
             // For `RowMajor` ordering
             for (int64_t i=0; i < d; i++) {
-                if (selectedCols[i] != left) {
-                    blas::swap(rows, &A[selectedCols[i]], cols, &A[left], cols);
+                if (selected_cols[i] != left) {
+                    blas::swap(rows, &A[selected_cols[i]], cols, &A[left], cols);
                 }
                 left++;
             }
@@ -357,8 +357,8 @@ inline void lmiget(
 
     //Step 1: Scale with `D`
         //Populating `diag`
-    generateRademacherVector_r123(diag, key[0], ctr[0], n);
-    applyDiagonalRademacher(true, layout, m, n, A, diag);
+    generate_rademacher_vector_r123(diag, key[0], ctr[0], n);
+    apply_diagonal_rademacher(true, layout, m, n, A, diag);
 
     //Step 2: Apply the Hadamard transform
     fht_dispatch(true, layout, A, std::log2(MAX(m, n)), m, n);
@@ -405,8 +405,8 @@ inline void rmiget(
 
     //Step 1: Scale with `D`
         //Populating `diag`
-    generateRademacherVector_r123(diag, key[0], ctr[0], n);
-    applyDiagonalRademacher(false, layout, m, n, A, diag);
+    generate_rademacher_vector_r123(diag, key[0], ctr[0], n);
+    apply_diagonal_rademacher(false, layout, m, n, A, diag);
 
     //Step 2: Apply the Hadamard transform
     fht_dispatch(false, layout, A, std::log2(MAX(m, n)), m, n);

From 348bf050ee5ca1c39bfd4bce09922781f9260bda Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Sun, 13 Oct 2024 10:24:24 -0700
Subject: [PATCH 15/26] Templating numeric precision in FHT implementation

---
 RandBLAS/trig_skops.hh | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 8b83cee..83fd565 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -176,7 +176,8 @@ namespace RandBLAS {
         }
     }
 
-    void fht_left_col_major(double *buf, int log_n, int num_rows, int num_cols) {
+    template <typename T>
+    void fht_left_col_major(T *buf, int log_n, int num_rows, int num_cols) {
         int n = 1 << log_n;
 
         // Apply FHT to each column independently
@@ -211,7 +212,8 @@ namespace RandBLAS {
         }
     }
 
-    void fht_left_row_major(double *buf, int log_n, int num_rows, int num_cols) {
+    template <typename T>
+    void fht_left_row_major(T *buf, int log_n, int num_rows, int num_cols) {
         int n = 1 << log_n;
 
         // Apply FHT to each column independently
@@ -243,7 +245,8 @@ namespace RandBLAS {
         }
     }
 
-    void fht_right_row_major(double *buf, int log_n, int num_rows, int num_cols) {
+    template <typename T>
+    void fht_right_row_major(T *buf, int log_n, int num_rows, int num_cols) {
         int n = 1 << log_n;
 
         // Apply FHT to each row independently
@@ -278,7 +281,8 @@ namespace RandBLAS {
         }
     }
 
-    void fht_right_col_major(double *buf, int log_n, int num_rows, int num_cols) {
+    template <typename T>
+    void fht_right_col_major(T *buf, int log_n, int num_rows, int num_cols) {
         int n = 1 << log_n;
 
         // Apply FHT to each row independently
@@ -310,10 +314,11 @@ namespace RandBLAS {
         }
     }
 
+    template <typename T>
     void fht_dispatch(
         bool left,
         blas::Layout layout,
-        double* buff,
+        T* buff,
         int64_t log_n,
         int64_t num_rows,
         int64_t num_cols

From f13086b26f4925a9aa761a98e37debb62721be5e Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Sun, 13 Oct 2024 13:48:02 -0700
Subject: [PATCH 16/26] Corrected ommited templating + `int->int64_t`

---
 RandBLAS/trig_skops.hh | 94 +++++++++++++++++++++---------------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 83fd565..4447205 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -35,7 +35,7 @@ namespace RandBLAS {
         typename RNG::key_type key = {{key_seed}};
 
         // Sequential loop for generating Rademacher entries
-        for (int i = 0; i < n; ++i) {
+        for (int64_t i = 0; i < n; ++i) {
             // Set the counter for each random number
             c[0] = ctr_seed + i;  // Ensure each counter is unique
 
@@ -83,28 +83,28 @@ namespace RandBLAS {
         //TODO: Investigate better schemes for performing the scaling
         //TODO: Move to `RandBLAS/util.hh`
         if(left && layout == blas::Layout::ColMajor) {
-            for(int col=0; col < cols; col++) {
+            for(int64_t col=0; col < cols; col++) {
                 if(diag[col] > 0)
                     continue;
                 blas::scal(rows, diag[col], &A[col * rows], 1);
             }
         }
         else if(left && layout == blas::Layout::RowMajor) {
-            for(int col=0; col < cols; col++) {
+            for(int64_t col=0; col < cols; col++) {
                 if(diag[col] > 0)
                     continue;
                 blas::scal(rows, diag[col], &A[col], cols);
             }
         }
         else if(!left && layout == blas::Layout::ColMajor) {
-            for(int row = 0; row < rows; row++) {
+            for(int64_t row = 0; row < rows; row++) {
                 if(diag[row] > 0)
                     continue;
                 blas::scal(cols, diag[row], &A[row], rows);
             }
         }
         else {
-            for(int row = 0; row < rows; row++) {
+            for(int64_t row = 0; row < rows; row++) {
                 if(diag[row] > 0)
                     continue;
                 blas::scal(cols, diag[row], &A[row * cols], 1);
@@ -177,26 +177,26 @@ namespace RandBLAS {
     }
 
     template <typename T>
-    void fht_left_col_major(T *buf, int log_n, int num_rows, int num_cols) {
-        int n = 1 << log_n;
+    void fht_left_col_major(T *buf, int64_t log_n, int64_t num_rows, int64_t num_cols) {
+        int64_t n = 1 << log_n;
 
         // Apply FHT to each column independently
-        for (int col = 0; col < num_cols; ++col) {
+        for (int64_t col = 0; col < num_cols; ++col) {
             // Pointer to the beginning of the current column in the Column-Major order
-            double* col_buf = buf + col * num_rows;
+            T* col_buf = buf + col * num_rows;
 
             // Apply the original FHT on this column
-            for (int i = 0; i < log_n; ++i) {
-                int s1 = 1 << i;
-                int s2 = s1 << 1;
-                for (int j = 0; j < n; j += s2) {
-                    for (int k = 0; k < s1; ++k) {
+            for (int64_t i = 0; i < log_n; ++i) {
+                int64_t s1 = 1 << i;
+                int64_t s2 = s1 << 1;
+                for (int64_t j = 0; j < n; j += s2) {
+                    for (int64_t k = 0; k < s1; ++k) {
                         // For implicitly padding the input we just have to make sure
                         // we replace all out-of-bounds accesses with zeros
                         bool b1 = j + k < num_rows;
                         bool b2 = j + k + s1 < num_rows;
-                        double u = b1 ? col_buf[j + k] : 0;
-                        double v = b2 ? col_buf[j + k + s1] : 0;
+                        T u = b1 ? col_buf[j + k] : 0;
+                        T v = b2 ? col_buf[j + k + s1] : 0;
                         if(b1 && b2) {
                             col_buf[j + k] = u + v;
                             col_buf[j + k + s1] = u - v;
@@ -213,23 +213,23 @@ namespace RandBLAS {
     }
 
     template <typename T>
-    void fht_left_row_major(T *buf, int log_n, int num_rows, int num_cols) {
-        int n = 1 << log_n;
+    void fht_left_row_major(T *buf, int64_t log_n, int64_t num_rows, int64_t num_cols) {
+        int64_t n = 1 << log_n;
 
         // Apply FHT to each column independently
-        for (int col = 0; col < num_cols; ++col) {
+        for (int64_t col = 0; col < num_cols; ++col) {
             // Apply the original FHT on this column
-            for (int i = 0; i < log_n; ++i) {
-                int s1 = 1 << i;
-                int s2 = s1 << 1;
-                for (int j = 0; j < n; j += s2) {
-                    for (int k = 0; k < s1; ++k) {
+            for (int64_t i = 0; i < log_n; ++i) {
+                int64_t s1 = 1 << i;
+                int64_t s2 = s1 << 1;
+                for (int64_t j = 0; j < n; j += s2) {
+                    for (int64_t k = 0; k < s1; ++k) {
                         // For implicitly padding the input we just have to make sure
                         // we replace all out-of-bounds accesses with zeros
                         bool b1 = j + k < num_rows;
                         bool b2 = j + k + s1 < num_rows;
-                        double u = b1 ? buf[(j + k) * num_cols + col] : 0;
-                        double v = b2 ? buf[(j + k + s1) * num_cols + col] : 0;
+                        T u = b1 ? buf[(j + k) * num_cols + col] : 0;
+                        T v = b2 ? buf[(j + k + s1) * num_cols + col] : 0;
                         if(b1 && b2) {
                             buf[(j + k) * num_cols + col] = u + v;
                             buf[(j + k + s1) * num_cols + col] = u - v;
@@ -246,26 +246,26 @@ namespace RandBLAS {
     }
 
     template <typename T>
-    void fht_right_row_major(T *buf, int log_n, int num_rows, int num_cols) {
-        int n = 1 << log_n;
+    void fht_right_row_major(T *buf, int64_t log_n, int64_t num_rows, int64_t num_cols) {
+        int64_t n = 1 << log_n;
 
         // Apply FHT to each row independently
-        for (int row = 0; row < num_rows; ++row) {
+        for (int64_t row = 0; row < num_rows; ++row) {
             // Pointer to the beginning of the current row in RowMajor order
-            double* row_buf = buf + row * num_cols;
+            T * row_buf = buf + row * num_cols;
 
             // Apply the original FHT on this row
-            for (int i = 0; i < log_n; ++i) {
-                int s1 = 1 << i;
-                int s2 = s1 << 1;
-                for (int j = 0; j < n; j += s2) {
-                    for (int k = 0; k < s1; ++k) {
+            for (int64_t i = 0; i < log_n; ++i) {
+                int64_t s1 = 1 << i;
+                int64_t s2 = s1 << 1;
+                for (int64_t j = 0; j < n; j += s2) {
+                    for (int64_t k = 0; k < s1; ++k) {
                         // For implicitly padding the input we just have to make sure
                         // we replace all out-of-bounds accesses with zeros
                         bool b1 = j + k < num_cols;
                         bool b2 = j + k + s1 < num_cols;
-                        double u = b1 ? row_buf[j + k] : 0;
-                        double v = b2 ? row_buf[j + k + s1] : 0;
+                        T u = b1 ? row_buf[j + k] : 0;
+                        T v = b2 ? row_buf[j + k + s1] : 0;
                         if(b1 && b2) {
                             row_buf[j + k] = u + v;
                             row_buf[j + k + s1] = u - v;
@@ -282,23 +282,23 @@ namespace RandBLAS {
     }
 
     template <typename T>
-    void fht_right_col_major(T *buf, int log_n, int num_rows, int num_cols) {
-        int n = 1 << log_n;
+    void fht_right_col_major(T *buf, int64_t log_n, int64_t num_rows, int64_t num_cols) {
+        int64_t n = 1 << log_n;
 
         // Apply FHT to each row independently
-        for (int row= 0; row < num_rows; ++row) {
+        for (int64_t row= 0; row < num_rows; ++row) {
             // Apply the original FHT on this column
-            for (int i = 0; i < log_n; ++i) {
-                int s1 = 1 << i;
-                int s2 = s1 << 1;
-                for (int j = 0; j < n; j += s2) {
-                    for (int k = 0; k < s1; ++k) {
+            for (int64_t i = 0; i < log_n; ++i) {
+                int64_t s1 = 1 << i;
+                int64_t s2 = s1 << 1;
+                for (int64_t j = 0; j < n; j += s2) {
+                    for (int64_t k = 0; k < s1; ++k) {
                         // For implicitly padding the input we just have to make sure
                         // we replace all out-of-bounds accesses with zeros
                         bool b1 = j + k < num_cols;
                         bool b2 = j + k + s1 < num_cols;
-                        double u = b1 ? buf[(j + k) * num_rows + row] : 0;
-                        double v = b2 ? buf[(j + k + s1) * num_rows + row] : 0;
+                        T u = b1 ? buf[(j + k) * num_rows + row] : 0;
+                        T v = b2 ? buf[(j + k + s1) * num_rows + row] : 0;
                         if(b1 && b2) {
                             buf[(j + k) * num_rows + row] = u + v;
                             buf[(j + k + s1) * num_rows + row] = u - v;

From 884f96e60c3bfb9984452dc499decb5130d9308b Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Wed, 16 Oct 2024 11:37:32 -0700
Subject: [PATCH 17/26] First version of tests, VERY UNSTABLE

---
 RandBLAS/trig_skops.hh              |  19 +-
 test/test_matmul_cores/test_trig.cc | 354 ++++++++++++++++++++++++++++
 2 files changed, 367 insertions(+), 6 deletions(-)
 create mode 100644 test/test_matmul_cores/test_trig.cc

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 4447205..6fba319 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -5,7 +5,6 @@
 
 #include <Random123/philox.h>
 #include <blas.hh>
-#include <lapack.hh>
 
 #include <iostream>
 #include <stdio.h>
@@ -125,21 +124,25 @@ namespace RandBLAS {
 
         if(layout == blas::Layout::ColMajor) {
             for (int64_t i=0; i < d; i++) {
+                randblas_error_if_msg(selected_rows[i] == top,
+                                    "The list of provided indices should be unique");
                 if (selected_rows[i] != top) {
                     // Use BLAS swap to swap the entire rows
                     // Swapping row 'selected' with row 'top'
-                    blas::swap(cols, &A[selected_rows[i]], rows, &A[top], rows);
+                    blas::swap(cols, &A[top], rows, &A[selected_rows[i]], rows);
+                    // top = selected_rows[i];
                 }
-                top++;
             }
         }
         else {
             // For `RowMajor` ordering
             for (int64_t i=0; i < d; i++) {
+                randblas_error_if_msg(selected_rows[i] == top,
+                                    "The list of provided indices should be unique");
                 if (selected_rows[i] != top) {
                     blas::swap(cols, &A[cols * selected_rows[i]], 1, &A[cols * top], 1);
+                    // top = selected_rows[i];
                 }
-                top++;
             }
         }
     }
@@ -157,21 +160,25 @@ namespace RandBLAS {
 
         if(layout == blas::Layout::ColMajor) {
             for (int64_t i=0; i < d; i++) {
+                randblas_error_if_msg(selected_cols[i] == left,
+                                    "The list of provided indices should be unique");
                 if (selected_cols[i] != left) {
                     // Use BLAS::swap to swap entire columns at once
                     // Swapping col 'selected' with col 'top'
                     blas::swap(rows, &A[rows * selected_cols[i]], 1, &A[rows * left], 1);
                 }
-                left++;
+                // left++;
             }
         }
         else {
             // For `RowMajor` ordering
             for (int64_t i=0; i < d; i++) {
+                randblas_error_if_msg(selected_cols[i] == left,
+                                    "The list of provided indices should be unique");
                 if (selected_cols[i] != left) {
                     blas::swap(rows, &A[selected_cols[i]], cols, &A[left], cols);
                 }
-                left++;
+                // left++;
             }
         }
     }
diff --git a/test/test_matmul_cores/test_trig.cc b/test/test_matmul_cores/test_trig.cc
new file mode 100644
index 0000000..841de18
--- /dev/null
+++ b/test/test_matmul_cores/test_trig.cc
@@ -0,0 +1,354 @@
+// Copyright, 2024. See LICENSE for copyright holder information.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// (1) Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// (2) Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// (3) Neither the name of the copyright holder nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+
+// #include "test/test_matmul_cores/linop_common.hh"
+#include "RandBLAS.hh"
+#include "RandBLAS/base.hh"
+#include "RandBLAS/trig_skops.hh"
+#include <blas.hh>
+
+#include <Eigen/Core>
+#include <Eigen/Dense>
+#include <cmath>
+#include <gtest/gtest.h>
+
+using RandBLAS::trig::lmiget;
+using RandBLAS::trig::rmiget;
+using RandBLAS::generate_rademacher_vector_r123;
+using RandBLAS::apply_diagonal_rademacher;
+using RandBLAS::permuteRowsToTop;
+using RandBLAS::permuteColsToLeft;
+using RandBLAS::fht_dispatch;
+using Eigen::MatrixXd;
+
+
+class TestLMIGET : public::testing::Test
+{
+    protected:
+    virtual void SetUp(){};
+
+    virtual void TearDown(){};
+
+    inline static std::vector<uint32_t> keys {0, 42};
+
+    // Helper function for explicitly generating a Hadamard matrix
+    // (Note that ColMajor and RowMajor storage is identical for `H`)
+    static std::vector<double> generate_hadamard(int64_t log_n) {
+        int64_t size = 1 << log_n;  // size = 2^n
+        std::vector<std::vector<double>> H(size, std::vector<double>(size, 1));  // Initialize H_1
+
+        // Sylvester's construction: recursively build the matrix
+        for (int n = 1; n <= log_n; ++n) {
+            double curr_size = 1 << n;  // Current size of the matrix is 2^n
+            for (int i = 0; i < curr_size / 2; ++i) {
+                for (int j = 0; j < curr_size / 2; ++j) {
+                    // Fill the bottom-left and bottom-right quadrants
+                    H[i + curr_size / 2][j] = H[i][j];       // Copy the top-left quadrant to bottom-left
+                    H[i][j + curr_size / 2] = H[i][j];       // Copy the top-left quadrant to top-right
+                    H[i + curr_size / 2][j + curr_size / 2] = -H[i][j]; // Fill bottom-right with negative values
+                }
+            }
+        }
+
+        // Flatten into a vector in ColMajor order
+        std::vector<double> H_flat(size * size);
+
+        for (int col = 0; col < size; ++col) {
+            for (int row = 0; row < size; ++row) {
+                H_flat[col * size + row] = H[row][col];
+            }
+        }
+
+        return H_flat;
+    }
+
+    enum class transforms {diag_scale, hadamard, permute};
+
+    // Tests to verify correctness of each of the transforms
+    template <typename T, RandBLAS::SignedInteger sint_t = int64_t>
+    static void correctness(
+        uint32_t seed,
+        transforms transform,
+        int64_t m, // Generated data matrix, `A` is of size `(m x n)`
+        int64_t n,
+        int64_t d, // The permutation matrix permutes `d` of the final rows/cols
+        bool left,
+        blas::Layout layout,
+        double epsilon=1e-5
+    ) {
+        // Grabbing a random matrix
+        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A(m, n);
+        A.setRandom();
+
+        // Deep copy
+        MatrixXd B = A;
+
+        switch (transform) {
+        case transforms::permute: {
+            // Simply compares against Eigen::PermutationMatrix
+            Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic> perm(5);
+
+            // Define the indices of the permutation manually
+            // Eigen::VectorXi indices = left ? Eigen::VectorXi(m) : Eigen::VectorXi(n);
+
+            std::vector<int> V = left ? std::vector<int>(m) : std::vector<int>(n);
+
+            int cnt = 0;
+            // int cnt = 0;
+            for(int i = 0; i < V.size(); i++) {
+                if(i == 0)
+                V[i] = V.size() - 1;
+                else if(i == V.size() - 1)
+                V[i] = 0;
+                else
+                V[i] = cnt;
+                cnt++;
+            }
+
+            Eigen::VectorXi indices = Eigen::Map<Eigen::VectorXi>(V.data(), V.size());
+
+            // Set the indices in the permutation matrix
+            perm.indices() = indices;
+
+            // std::reverse(V.begin(), V.end());
+            sint_t* v = new sint_t;
+            *v = V.size() - 1;
+            if(left)
+                RandBLAS::permuteRowsToTop(layout, m, n, v, 1, A.data());
+            else
+                RandBLAS::permuteColsToLeft(layout, m, n, v, 1, A.data());
+
+            std::cout << "orig" << std::endl << B << std::endl;
+            std::cout << "eigen_permute" << std::endl << perm * B << std::endl;
+            std::cout << "I permute" << std::endl << A << std::endl;
+            // Or just do A.isApprox(B)
+            double norm_permute = 0.0;
+            if(left)
+            norm_permute = (A - perm * B).norm();
+            else {
+            norm_permute = (A - B * perm).norm();
+            std::cout << norm_permute << std::endl;
+            }
+
+            // Or do A.isApprox(H * B)
+            randblas_require(norm_permute < epsilon);
+
+            break;
+        }
+        case transforms::hadamard: {
+            // Here, simply check against explicit application of the Hadamard matrix
+            RandBLAS::fht_dispatch(left, layout, A.data(), std::log2(m), m, n);
+            std::vector<double> H_vec = generate_hadamard(std::log2(m));
+            //TODO: Should have a check here to enforce that `m` is a power of 2 (since
+            // my `generate_hadamard` function does not take care to pad an input matrix)
+            MatrixXd H = Eigen::Map<MatrixXd>(H_vec.data(), int(std::pow(2, std::log2(m))), int(std::pow(2, std::log2(m))));
+
+            double norm_hadamard = (A - H * B).norm();
+            randblas_require(norm_hadamard < epsilon);
+
+            break;
+        }
+        case transforms::diag_scale: {
+            // Scales all rows/cols by -1 and checks if A == -A
+            std::vector<sint_t> buff = left ? std::vector<sint_t>(n, -1) : std::vector<sint_t>(m, -1);
+
+            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A.data(), buff.data());
+
+            double norm_diag = (A + B).norm();
+            randblas_require(norm_diag < epsilon);
+
+            break;
+        }
+        }
+    }
+};
+
+////////////////////////////////////////////////////////////////////////
+//
+//
+//      Checking correctness of each of the transforms
+//
+//
+////////////////////////////////////////////////////////////////////////
+
+TEST_F(TestLMIGET, test_diag_left_colmajor) {
+    for(uint32_t seed: keys)
+        correctness<double>(
+            seed,
+            transforms::diag_scale,
+            100,
+            100,
+            0,
+            true,
+            blas::Layout::ColMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_diag_right_colmajor) {
+    for(uint32_t seed: keys)
+        correctness<double>(
+            seed,
+            transforms::diag_scale,
+            100,
+            100,
+            0,
+            false,
+            blas::Layout::ColMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_diag_left_rowmajor) {
+    for(uint32_t seed: keys)
+        correctness<double>(
+            seed,
+            transforms::diag_scale,
+            100,
+            100,
+            0,
+            true,
+            blas::Layout::RowMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_diag_right_rowmajor) {
+    for(uint32_t seed: keys)
+        correctness<double>(
+            seed,
+            transforms::diag_scale,
+            100,
+            100,
+            0,
+            false,
+            blas::Layout::RowMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_permute_left_colmajor) {
+    for(uint32_t seed: keys)
+        correctness<double>(
+            seed,
+            transforms::permute,
+            100,
+            100,
+            0,
+            true,
+            blas::Layout::ColMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_permute_right_colmajor) {
+    for(uint32_t seed: keys)
+        correctness<double>(
+            seed,
+            transforms::permute,
+            100,
+            100,
+            0,
+            false,
+            blas::Layout::ColMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_permute_left_rowmajor) {
+    for(uint32_t seed: keys)
+        correctness<double>(
+            seed,
+            transforms::permute,
+            100,
+            100,
+            0,
+            true,
+            blas::Layout::RowMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_permute_right_rowmajor) {
+    for(uint32_t seed: keys)
+        correctness<double>(
+            seed,
+            transforms::permute,
+            100,
+            100,
+            0,
+            false,
+            blas::Layout::RowMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_hadamard_left_colmajor) {
+    for(uint32_t seed: keys)
+        correctness<double>(
+            seed,
+            transforms::hadamard,
+            128,
+            100,
+            0,
+            true,
+            blas::Layout::ColMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_hadamard_right_colmajor) {
+    for(uint32_t seed: keys)
+        correctness<double>(
+            seed,
+            transforms::hadamard,
+            100,
+            128,
+            0,
+            false,
+            blas::Layout::ColMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_hadamard_left_rowmajor) {
+    for(uint32_t seed: keys)
+        correctness<double>(
+            seed,
+            transforms::hadamard,
+            128,
+            100,
+            0,
+            true,
+            blas::Layout::RowMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_hadamard_right_rowmajor) {
+    for(uint32_t seed: keys)
+        correctness<double>(
+            seed,
+            transforms::hadamard,
+            100,
+            128,
+            0,
+            false,
+            blas::Layout::RowMajor
+        );
+}

From 018acd83df2b61a5a8a98050488bed403232500c Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Wed, 16 Oct 2024 17:05:31 -0700
Subject: [PATCH 18/26] Hadamard && Diag tests working

---
 RandBLAS/trig_skops.hh              |  10 ++-
 test/test_matmul_cores/test_trig.cc | 107 +++++++++++++++++++++-------
 2 files changed, 84 insertions(+), 33 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 6fba319..d0c7f31 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -122,10 +122,11 @@ namespace RandBLAS {
                           ) {
         int64_t top = 0;  // Keeps track of the topmost unselected row
 
+        //TODO: discuss precise semantics of `selected_rows` in this function
         if(layout == blas::Layout::ColMajor) {
             for (int64_t i=0; i < d; i++) {
                 randblas_error_if_msg(selected_rows[i] == top,
-                                    "The list of provided indices should be unique");
+                                      "The list of provided indices should be unique");
                 if (selected_rows[i] != top) {
                     // Use BLAS swap to swap the entire rows
                     // Swapping row 'selected' with row 'top'
@@ -138,7 +139,8 @@ namespace RandBLAS {
             // For `RowMajor` ordering
             for (int64_t i=0; i < d; i++) {
                 randblas_error_if_msg(selected_rows[i] == top,
-                                    "The list of provided indices should be unique");
+                                      "The list of provided indices should be unique");
+                std::cout << "see here" << selected_rows[i] << std::endl;
                 if (selected_rows[i] != top) {
                     blas::swap(cols, &A[cols * selected_rows[i]], 1, &A[cols * top], 1);
                     // top = selected_rows[i];
@@ -160,8 +162,6 @@ namespace RandBLAS {
 
         if(layout == blas::Layout::ColMajor) {
             for (int64_t i=0; i < d; i++) {
-                randblas_error_if_msg(selected_cols[i] == left,
-                                    "The list of provided indices should be unique");
                 if (selected_cols[i] != left) {
                     // Use BLAS::swap to swap entire columns at once
                     // Swapping col 'selected' with col 'top'
@@ -173,8 +173,6 @@ namespace RandBLAS {
         else {
             // For `RowMajor` ordering
             for (int64_t i=0; i < d; i++) {
-                randblas_error_if_msg(selected_cols[i] == left,
-                                    "The list of provided indices should be unique");
                 if (selected_cols[i] != left) {
                     blas::swap(rows, &A[selected_cols[i]], cols, &A[left], cols);
                 }
diff --git a/test/test_matmul_cores/test_trig.cc b/test/test_matmul_cores/test_trig.cc
index 841de18..321b33c 100644
--- a/test/test_matmul_cores/test_trig.cc
+++ b/test/test_matmul_cores/test_trig.cc
@@ -27,7 +27,6 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //
 
-// #include "test/test_matmul_cores/linop_common.hh"
 #include "RandBLAS.hh"
 #include "RandBLAS/base.hh"
 #include "RandBLAS/trig_skops.hh"
@@ -36,6 +35,7 @@
 #include <Eigen/Core>
 #include <Eigen/Dense>
 #include <cmath>
+#include <random>
 #include <gtest/gtest.h>
 
 using RandBLAS::trig::lmiget;
@@ -88,6 +88,25 @@ class TestLMIGET : public::testing::Test
         return H_flat;
     }
 
+    static std::vector<double> generate_random_vector(int size, double lower_bound, double upper_bound) {
+        // Create a random device and seed the random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+
+        // Define the distribution range for the random doubles
+        std::uniform_real_distribution<> dist(lower_bound, upper_bound);
+
+        // Create a vector of the specified size
+        std::vector<double> random_vector(size);
+
+        // Generate random doubles and fill the vector
+        for (int i = 0; i < size; ++i) {
+            random_vector[i] = dist(gen);
+        }
+
+        return random_vector;
+    }
+
     enum class transforms {diag_scale, hadamard, permute};
 
     // Tests to verify correctness of each of the transforms
@@ -103,24 +122,25 @@ class TestLMIGET : public::testing::Test
         double epsilon=1e-5
     ) {
         // Grabbing a random matrix
-        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A(m, n);
-        A.setRandom();
+        std::vector<double> A_vec = generate_random_vector(m * n, 0.0, 10.0);
+        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>(A_vec.data(), m, n);
+        Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> A_row(A_vec.data(), m, n);
 
         // Deep copy
-        MatrixXd B = A;
+        MatrixXd B;
+        if(layout == blas::Layout::RowMajor)
+        B = A_row;
+        else
+        B = A_col;
 
         switch (transform) {
         case transforms::permute: {
             // Simply compares against Eigen::PermutationMatrix
             Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic> perm(5);
 
-            // Define the indices of the permutation manually
-            // Eigen::VectorXi indices = left ? Eigen::VectorXi(m) : Eigen::VectorXi(n);
-
             std::vector<int> V = left ? std::vector<int>(m) : std::vector<int>(n);
 
             int cnt = 0;
-            // int cnt = 0;
             for(int i = 0; i < V.size(); i++) {
                 if(i == 0)
                 V[i] = V.size() - 1;
@@ -136,24 +156,34 @@ class TestLMIGET : public::testing::Test
             // Set the indices in the permutation matrix
             perm.indices() = indices;
 
-            // std::reverse(V.begin(), V.end());
             sint_t* v = new sint_t;
             *v = V.size() - 1;
-            if(left)
-                RandBLAS::permuteRowsToTop(layout, m, n, v, 1, A.data());
-            else
-                RandBLAS::permuteColsToLeft(layout, m, n, v, 1, A.data());
+            if(left) {
+                if(layout == blas::Layout::RowMajor)
+                RandBLAS::permuteRowsToTop(layout, m, n, v, 1, A_row.data());
+                else
+                RandBLAS::permuteRowsToTop(layout, m, n, v, 1, A_col.data());
+            }
+            else {
+                if(layout == blas::Layout::RowMajor)
+                RandBLAS::permuteColsToLeft(layout, m, n, v, 1, A_row.data());
+                else
+                RandBLAS::permuteColsToLeft(layout, m, n, v, 1, A_col.data());
+            }
 
-            std::cout << "orig" << std::endl << B << std::endl;
-            std::cout << "eigen_permute" << std::endl << perm * B << std::endl;
-            std::cout << "I permute" << std::endl << A << std::endl;
             // Or just do A.isApprox(B)
             double norm_permute = 0.0;
-            if(left)
-            norm_permute = (A - perm * B).norm();
+            if(left) {
+                if(layout == blas::Layout::RowMajor)
+                norm_permute = (A_row - perm * B).norm();
+                else
+                norm_permute = (A_col - perm * B).norm();
+            }
             else {
-            norm_permute = (A - B * perm).norm();
-            std::cout << norm_permute << std::endl;
+            if(layout == blas::Layout::RowMajor)
+                norm_permute = (A_row - B * perm).norm();
+            else
+                norm_permute = (A_row - B * perm).norm();
             }
 
             // Or do A.isApprox(H * B)
@@ -163,13 +193,29 @@ class TestLMIGET : public::testing::Test
         }
         case transforms::hadamard: {
             // Here, simply check against explicit application of the Hadamard matrix
-            RandBLAS::fht_dispatch(left, layout, A.data(), std::log2(m), m, n);
-            std::vector<double> H_vec = generate_hadamard(std::log2(m));
-            //TODO: Should have a check here to enforce that `m` is a power of 2 (since
+            int ld = (left) ? m : n;
+            if(layout == blas::Layout::ColMajor)
+            RandBLAS::fht_dispatch(left, layout, A_col.data(), std::log2(ld), m, n);
+            else
+            RandBLAS::fht_dispatch(left, layout, A_row.data(), std::log2(ld), m, n);
+
+            std::vector<double> H_vec = generate_hadamard(std::log2(ld));
+            //TODO: Should have a check here to enforce that `m` and `n` are powers of 2 (since
             // my `generate_hadamard` function does not take care to pad an input matrix)
-            MatrixXd H = Eigen::Map<MatrixXd>(H_vec.data(), int(std::pow(2, std::log2(m))), int(std::pow(2, std::log2(m))));
+            MatrixXd H = Eigen::Map<MatrixXd>(H_vec.data(), int(std::pow(2, std::log2(ld))), int(std::pow(2, std::log2(ld))));
+
+            double norm_hadamard = 0.0;
+            if(left)
+            if(layout == blas::Layout::RowMajor)
+                norm_hadamard = (A_row - H * B).norm();
+            else
+                norm_hadamard = (A_col - H * B).norm();
+            else
+            if(layout == blas::Layout::RowMajor)
+                norm_hadamard = (A_row - B * H).norm();
+            else
+                norm_hadamard = (A_col - B * H).norm();
 
-            double norm_hadamard = (A - H * B).norm();
             randblas_require(norm_hadamard < epsilon);
 
             break;
@@ -178,9 +224,16 @@ class TestLMIGET : public::testing::Test
             // Scales all rows/cols by -1 and checks if A == -A
             std::vector<sint_t> buff = left ? std::vector<sint_t>(n, -1) : std::vector<sint_t>(m, -1);
 
-            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A.data(), buff.data());
+            double norm_diag = 0.0;
+            if(layout == blas::Layout::RowMajor) {
+            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
+            double norm_diag = (A_row + B).norm();
+            }
+            else {
+            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
+            double norm_diag = (A_col + B).norm();
+            }
 
-            double norm_diag = (A + B).norm();
             randblas_require(norm_diag < epsilon);
 
             break;

From 0cfdcf1007732b658c67a97feefbd73d8e2216d3 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Thu, 17 Oct 2024 02:31:49 -0700
Subject: [PATCH 19/26] Final cleanups, all tests working

---
 RandBLAS/trig_skops.hh              |  5 ---
 test/test_matmul_cores/test_trig.cc | 50 ++++++++++++-----------------
 2 files changed, 21 insertions(+), 34 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index d0c7f31..804fe89 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -131,7 +131,6 @@ namespace RandBLAS {
                     // Use BLAS swap to swap the entire rows
                     // Swapping row 'selected' with row 'top'
                     blas::swap(cols, &A[top], rows, &A[selected_rows[i]], rows);
-                    // top = selected_rows[i];
                 }
             }
         }
@@ -140,10 +139,8 @@ namespace RandBLAS {
             for (int64_t i=0; i < d; i++) {
                 randblas_error_if_msg(selected_rows[i] == top,
                                       "The list of provided indices should be unique");
-                std::cout << "see here" << selected_rows[i] << std::endl;
                 if (selected_rows[i] != top) {
                     blas::swap(cols, &A[cols * selected_rows[i]], 1, &A[cols * top], 1);
-                    // top = selected_rows[i];
                 }
             }
         }
@@ -167,7 +164,6 @@ namespace RandBLAS {
                     // Swapping col 'selected' with col 'top'
                     blas::swap(rows, &A[rows * selected_cols[i]], 1, &A[rows * left], 1);
                 }
-                // left++;
             }
         }
         else {
@@ -176,7 +172,6 @@ namespace RandBLAS {
                 if (selected_cols[i] != left) {
                     blas::swap(rows, &A[selected_cols[i]], cols, &A[left], cols);
                 }
-                // left++;
             }
         }
     }
diff --git a/test/test_matmul_cores/test_trig.cc b/test/test_matmul_cores/test_trig.cc
index 321b33c..fde958c 100644
--- a/test/test_matmul_cores/test_trig.cc
+++ b/test/test_matmul_cores/test_trig.cc
@@ -116,7 +116,6 @@ class TestLMIGET : public::testing::Test
         transforms transform,
         int64_t m, // Generated data matrix, `A` is of size `(m x n)`
         int64_t n,
-        int64_t d, // The permutation matrix permutes `d` of the final rows/cols
         bool left,
         blas::Layout layout,
         double epsilon=1e-5
@@ -129,9 +128,9 @@ class TestLMIGET : public::testing::Test
         // Deep copy
         MatrixXd B;
         if(layout == blas::Layout::RowMajor)
-        B = A_row;
+            B = A_row;
         else
-        B = A_col;
+            B = A_col;
 
         switch (transform) {
         case transforms::permute: {
@@ -141,13 +140,14 @@ class TestLMIGET : public::testing::Test
             std::vector<int> V = left ? std::vector<int>(m) : std::vector<int>(n);
 
             int cnt = 0;
+            // int cnt = 0;
             for(int i = 0; i < V.size(); i++) {
                 if(i == 0)
-                V[i] = V.size() - 1;
+                    V[i] = V.size() - 1;
                 else if(i == V.size() - 1)
-                V[i] = 0;
+                    V[i] = 0;
                 else
-                V[i] = cnt;
+                    V[i] = cnt;
                 cnt++;
             }
 
@@ -158,32 +158,33 @@ class TestLMIGET : public::testing::Test
 
             sint_t* v = new sint_t;
             *v = V.size() - 1;
+
             if(left) {
                 if(layout == blas::Layout::RowMajor)
-                RandBLAS::permuteRowsToTop(layout, m, n, v, 1, A_row.data());
+                    RandBLAS::permuteRowsToTop(layout, m, n, v, 1, A_row.data());
                 else
-                RandBLAS::permuteRowsToTop(layout, m, n, v, 1, A_col.data());
+                    RandBLAS::permuteRowsToTop(layout, m, n, v, 1, A_col.data());
             }
             else {
                 if(layout == blas::Layout::RowMajor)
-                RandBLAS::permuteColsToLeft(layout, m, n, v, 1, A_row.data());
+                    RandBLAS::permuteColsToLeft(layout, m, n, v, 1, A_row.data());
                 else
-                RandBLAS::permuteColsToLeft(layout, m, n, v, 1, A_col.data());
+                    RandBLAS::permuteColsToLeft(layout, m, n, v, 1, A_col.data());
             }
 
             // Or just do A.isApprox(B)
             double norm_permute = 0.0;
             if(left) {
                 if(layout == blas::Layout::RowMajor)
-                norm_permute = (A_row - perm * B).norm();
+                    norm_permute = (A_row - perm * B).norm();
                 else
-                norm_permute = (A_col - perm * B).norm();
+                    norm_permute = (A_col - perm * B).norm();
             }
             else {
             if(layout == blas::Layout::RowMajor)
                 norm_permute = (A_row - B * perm).norm();
             else
-                norm_permute = (A_row - B * perm).norm();
+                norm_permute = (A_col - B * perm).norm();
             }
 
             // Or do A.isApprox(H * B)
@@ -193,11 +194,12 @@ class TestLMIGET : public::testing::Test
         }
         case transforms::hadamard: {
             // Here, simply check against explicit application of the Hadamard matrix
+
             int ld = (left) ? m : n;
             if(layout == blas::Layout::ColMajor)
-            RandBLAS::fht_dispatch(left, layout, A_col.data(), std::log2(ld), m, n);
+                RandBLAS::fht_dispatch(left, layout, A_col.data(), std::log2(ld), m, n);
             else
-            RandBLAS::fht_dispatch(left, layout, A_row.data(), std::log2(ld), m, n);
+                RandBLAS::fht_dispatch(left, layout, A_row.data(), std::log2(ld), m, n);
 
             std::vector<double> H_vec = generate_hadamard(std::log2(ld));
             //TODO: Should have a check here to enforce that `m` and `n` are powers of 2 (since
@@ -205,16 +207,18 @@ class TestLMIGET : public::testing::Test
             MatrixXd H = Eigen::Map<MatrixXd>(H_vec.data(), int(std::pow(2, std::log2(ld))), int(std::pow(2, std::log2(ld))));
 
             double norm_hadamard = 0.0;
-            if(left)
+            if(left) {
             if(layout == blas::Layout::RowMajor)
                 norm_hadamard = (A_row - H * B).norm();
             else
                 norm_hadamard = (A_col - H * B).norm();
-            else
+            }
+            else {
             if(layout == blas::Layout::RowMajor)
                 norm_hadamard = (A_row - B * H).norm();
             else
                 norm_hadamard = (A_col - B * H).norm();
+            }
 
             randblas_require(norm_hadamard < epsilon);
 
@@ -257,7 +261,6 @@ TEST_F(TestLMIGET, test_diag_left_colmajor) {
             transforms::diag_scale,
             100,
             100,
-            0,
             true,
             blas::Layout::ColMajor
         );
@@ -270,7 +273,6 @@ TEST_F(TestLMIGET, test_diag_right_colmajor) {
             transforms::diag_scale,
             100,
             100,
-            0,
             false,
             blas::Layout::ColMajor
         );
@@ -283,7 +285,6 @@ TEST_F(TestLMIGET, test_diag_left_rowmajor) {
             transforms::diag_scale,
             100,
             100,
-            0,
             true,
             blas::Layout::RowMajor
         );
@@ -296,7 +297,6 @@ TEST_F(TestLMIGET, test_diag_right_rowmajor) {
             transforms::diag_scale,
             100,
             100,
-            0,
             false,
             blas::Layout::RowMajor
         );
@@ -309,7 +309,6 @@ TEST_F(TestLMIGET, test_permute_left_colmajor) {
             transforms::permute,
             100,
             100,
-            0,
             true,
             blas::Layout::ColMajor
         );
@@ -322,7 +321,6 @@ TEST_F(TestLMIGET, test_permute_right_colmajor) {
             transforms::permute,
             100,
             100,
-            0,
             false,
             blas::Layout::ColMajor
         );
@@ -335,7 +333,6 @@ TEST_F(TestLMIGET, test_permute_left_rowmajor) {
             transforms::permute,
             100,
             100,
-            0,
             true,
             blas::Layout::RowMajor
         );
@@ -348,7 +345,6 @@ TEST_F(TestLMIGET, test_permute_right_rowmajor) {
             transforms::permute,
             100,
             100,
-            0,
             false,
             blas::Layout::RowMajor
         );
@@ -361,7 +357,6 @@ TEST_F(TestLMIGET, test_hadamard_left_colmajor) {
             transforms::hadamard,
             128,
             100,
-            0,
             true,
             blas::Layout::ColMajor
         );
@@ -374,7 +369,6 @@ TEST_F(TestLMIGET, test_hadamard_right_colmajor) {
             transforms::hadamard,
             100,
             128,
-            0,
             false,
             blas::Layout::ColMajor
         );
@@ -387,7 +381,6 @@ TEST_F(TestLMIGET, test_hadamard_left_rowmajor) {
             transforms::hadamard,
             128,
             100,
-            0,
             true,
             blas::Layout::RowMajor
         );
@@ -400,7 +393,6 @@ TEST_F(TestLMIGET, test_hadamard_right_rowmajor) {
             transforms::hadamard,
             100,
             128,
-            0,
             false,
             blas::Layout::RowMajor
         );

From 591522f0b3b6b34921bef27466b34841086331d5 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Thu, 17 Oct 2024 06:21:30 -0700
Subject: [PATCH 20/26] Added in invSRHT tests

---
 test/test_matmul_cores/test_trig.cc | 163 +++++++++++++++++++++++++++-
 1 file changed, 161 insertions(+), 2 deletions(-)

diff --git a/test/test_matmul_cores/test_trig.cc b/test/test_matmul_cores/test_trig.cc
index fde958c..01b8913 100644
--- a/test/test_matmul_cores/test_trig.cc
+++ b/test/test_matmul_cores/test_trig.cc
@@ -231,11 +231,11 @@ class TestLMIGET : public::testing::Test
             double norm_diag = 0.0;
             if(layout == blas::Layout::RowMajor) {
             RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
-            double norm_diag = (A_row + B).norm();
+            norm_diag = (A_row + B).norm();
             }
             else {
             RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
-            double norm_diag = (A_col + B).norm();
+            norm_diag = (A_col + B).norm();
             }
 
             randblas_require(norm_diag < epsilon);
@@ -244,6 +244,117 @@ class TestLMIGET : public::testing::Test
         }
         }
     }
+
+    template <typename T, RandBLAS::SignedInteger sint_t = int64_t>
+    static void inverse_transform(
+        uint32_t seed,
+        int64_t m, // Generated data matrix, `A` is of size `(m x n)`
+        int64_t n,
+        int64_t d,
+        bool left,
+        blas::Layout layout,
+        double epsilon=1e-5
+    ) {
+        // Grabbing a random matrix
+        std::vector<double> A_vec = generate_random_vector(m * n, 0.0, 10.0);
+        // Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col(m, n);
+        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>(A_vec.data(), m, n);
+        // A_col.setRandom();
+        Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> A_row(A_vec.data(), m, n);
+
+        // Deep copy
+        MatrixXd B;
+        if(layout == blas::Layout::RowMajor)
+        B = A_row;
+        else
+        B = A_col;
+
+        //// Performing \Pi H D
+        // Step 1: setup the diagonal scaling
+        std::vector<sint_t> buff = left ? std::vector<sint_t>(n, -1) : std::vector<sint_t>(m, -1);
+
+        if(layout == blas::Layout::RowMajor) {
+        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
+        }
+        else {
+        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
+        }
+
+        // Step 2: apply the hadamard transform
+        int ld = (left) ? m : n;
+        if(layout == blas::Layout::ColMajor){
+        RandBLAS::fht_dispatch(left, layout, A_col.data(), int(std::log2(ld)), m, n);
+        }
+        else {
+        RandBLAS::fht_dispatch(left, layout, A_row.data(), int(std::log2(ld)), m, n);
+        }
+
+        // Step 3: Permuting
+        std::vector<int64_t> indices(d);
+
+        std::iota(indices.begin(), indices.end(), 1);
+        if(left) {
+            if(layout == blas::Layout::RowMajor)
+            RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_row.data());
+            else
+            RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_col.data());
+        }
+        else {
+            if(layout == blas::Layout::RowMajor)
+            RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_row.data());
+            else
+            RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_col.data());
+        }
+
+        //// Performing D H \Pi
+
+        //Step 1: Un-permute
+        std::reverse(indices.begin(), indices.end());
+
+        if(left) {
+            if(layout == blas::Layout::RowMajor)
+            RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_row.data());
+            else
+            RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_col.data());
+        }
+        else {
+            if(layout == blas::Layout::RowMajor)
+            RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_row.data());
+            else
+            RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_col.data());
+        }
+
+        // Step-2: Apply H^{-1}
+        if(layout == blas::Layout::ColMajor) {
+        RandBLAS::fht_dispatch(left, layout, A_col.data(), int(std::log2(ld)), m, n);
+        A_col = A_col * 1/std::pow(2, int(std::log2(ld)));
+        }
+        else {
+        RandBLAS::fht_dispatch(left, layout, A_row.data(), int(std::log2(ld)), m, n);
+        A_row = A_row * 1/std::pow(2, int(std::log2(ld)));
+        }
+
+        //Step-3: Inverting `D`
+        if(layout == blas::Layout::RowMajor) {
+        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
+        }
+        else {
+        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
+        }
+
+        double norm_inverse = 0.0;
+
+        if(layout == blas::Layout::RowMajor) {
+        norm_inverse = (A_row - B).norm();
+        }
+        else {
+        norm_inverse = (A_col - B).norm();
+        }
+
+        randblas_require(norm_inverse < epsilon);
+
+
+    }
 };
 
 ////////////////////////////////////////////////////////////////////////
@@ -397,3 +508,51 @@ TEST_F(TestLMIGET, test_hadamard_right_rowmajor) {
             blas::Layout::RowMajor
         );
 }
+
+TEST_F(TestLMIGET, test_inverse_left_colmajor) {
+    for(uint32_t seed: keys)
+        inverse_transform<double>(
+            seed,
+            128,
+            128,
+            25,
+            true,
+            blas::Layout::ColMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_inverse_right_colmajor) {
+    for(uint32_t seed: keys)
+        inverse_transform<double>(
+            seed,
+            100,
+            128,
+            25,
+            false,
+            blas::Layout::ColMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_inverse_left_rowmajor) {
+    for(uint32_t seed: keys)
+        inverse_transform<double>(
+            seed,
+            128,
+            128,
+            25,
+            true,
+            blas::Layout::RowMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_inverse_right_rowmajor) {
+    for(uint32_t seed: keys)
+        inverse_transform<double>(
+            seed,
+            100,
+            128,
+            25,
+            false,
+            blas::Layout::RowMajor
+        );
+}

From 8b04df1e9aa00fd576c115e1dc72f3c3aefe70b8 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Thu, 17 Oct 2024 06:28:51 -0700
Subject: [PATCH 21/26] Cleaning up comments and stylistic changes

---
 test/test_matmul_cores/test_trig.cc | 48 ++++++++++++++---------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/test/test_matmul_cores/test_trig.cc b/test/test_matmul_cores/test_trig.cc
index 01b8913..3283764 100644
--- a/test/test_matmul_cores/test_trig.cc
+++ b/test/test_matmul_cores/test_trig.cc
@@ -250,43 +250,41 @@ class TestLMIGET : public::testing::Test
         uint32_t seed,
         int64_t m, // Generated data matrix, `A` is of size `(m x n)`
         int64_t n,
-        int64_t d,
+        int64_t d, // #rows/cols that will be permuted
         bool left,
         blas::Layout layout,
         double epsilon=1e-5
     ) {
         // Grabbing a random matrix
         std::vector<double> A_vec = generate_random_vector(m * n, 0.0, 10.0);
-        // Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col(m, n);
         Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>(A_vec.data(), m, n);
-        // A_col.setRandom();
         Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> A_row(A_vec.data(), m, n);
 
         // Deep copy
         MatrixXd B;
         if(layout == blas::Layout::RowMajor)
-        B = A_row;
+            B = A_row;
         else
-        B = A_col;
+            B = A_col;
 
         //// Performing \Pi H D
         // Step 1: setup the diagonal scaling
         std::vector<sint_t> buff = left ? std::vector<sint_t>(n, -1) : std::vector<sint_t>(m, -1);
 
         if(layout == blas::Layout::RowMajor) {
-        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
+            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
         }
         else {
-        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
+            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
         }
 
         // Step 2: apply the hadamard transform
         int ld = (left) ? m : n;
         if(layout == blas::Layout::ColMajor){
-        RandBLAS::fht_dispatch(left, layout, A_col.data(), int(std::log2(ld)), m, n);
+            RandBLAS::fht_dispatch(left, layout, A_col.data(), int(std::log2(ld)), m, n);
         }
         else {
-        RandBLAS::fht_dispatch(left, layout, A_row.data(), int(std::log2(ld)), m, n);
+            RandBLAS::fht_dispatch(left, layout, A_row.data(), int(std::log2(ld)), m, n);
         }
 
         // Step 3: Permuting
@@ -295,15 +293,15 @@ class TestLMIGET : public::testing::Test
         std::iota(indices.begin(), indices.end(), 1);
         if(left) {
             if(layout == blas::Layout::RowMajor)
-            RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_row.data());
+                RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_row.data());
             else
-            RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_col.data());
+                RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_col.data());
         }
         else {
             if(layout == blas::Layout::RowMajor)
-            RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_row.data());
+                RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_row.data());
             else
-            RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_col.data());
+                RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_col.data());
         }
 
         //// Performing D H \Pi
@@ -313,42 +311,42 @@ class TestLMIGET : public::testing::Test
 
         if(left) {
             if(layout == blas::Layout::RowMajor)
-            RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_row.data());
+                RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_row.data());
             else
-            RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_col.data());
+                RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_col.data());
         }
         else {
             if(layout == blas::Layout::RowMajor)
-            RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_row.data());
+                RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_row.data());
             else
-            RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_col.data());
+                RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_col.data());
         }
 
         // Step-2: Apply H^{-1}
         if(layout == blas::Layout::ColMajor) {
-        RandBLAS::fht_dispatch(left, layout, A_col.data(), int(std::log2(ld)), m, n);
-        A_col = A_col * 1/std::pow(2, int(std::log2(ld)));
+            RandBLAS::fht_dispatch(left, layout, A_col.data(), int(std::log2(ld)), m, n);
+            A_col = A_col * 1/std::pow(2, int(std::log2(ld)));
         }
         else {
-        RandBLAS::fht_dispatch(left, layout, A_row.data(), int(std::log2(ld)), m, n);
-        A_row = A_row * 1/std::pow(2, int(std::log2(ld)));
+            RandBLAS::fht_dispatch(left, layout, A_row.data(), int(std::log2(ld)), m, n);
+            A_row = A_row * 1/std::pow(2, int(std::log2(ld)));
         }
 
         //Step-3: Inverting `D`
         if(layout == blas::Layout::RowMajor) {
-        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
+            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
         }
         else {
-        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
+            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
         }
 
         double norm_inverse = 0.0;
 
         if(layout == blas::Layout::RowMajor) {
-        norm_inverse = (A_row - B).norm();
+            norm_inverse = (A_row - B).norm();
         }
         else {
-        norm_inverse = (A_col - B).norm();
+            norm_inverse = (A_col - B).norm();
         }
 
         randblas_require(norm_inverse < epsilon);

From 6b7101c9b644fad11627ad73392c5360b4abe0d1 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Wed, 23 Oct 2024 10:32:00 -0700
Subject: [PATCH 22/26] Review

---
 RandBLAS/trig_skops.hh              |  97 ++++++++++---------------
 test/test_matmul_cores/test_trig.cc | 106 +++++++++++++++-------------
 2 files changed, 97 insertions(+), 106 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 804fe89..22f8311 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -27,30 +27,7 @@ namespace RandBLAS {
 
     // Generates a vector of Rademacher entries using the Random123 library
     template<SignedInteger sint_t, typename RNG = r123::Philox4x32>
-    void generate_rademacher_vector_r123(sint_t* buff, uint32_t key_seed, uint32_t ctr_seed, int64_t n) {
-        RNG rng;
-
-        typename RNG::ctr_type c;
-        typename RNG::key_type key = {{key_seed}};
-
-        // Sequential loop for generating Rademacher entries
-        for (int64_t i = 0; i < n; ++i) {
-            // Set the counter for each random number
-            c[0] = ctr_seed + i;  // Ensure each counter is unique
-
-            // Generate a 2x32-bit random number using the Philox generator
-            typename RNG::ctr_type r = rng(c, key);
-
-            // Convert the random number into a float in [0, 1) using u01fixedpt
-            float rand_value = r123::u01fixedpt<float>(r.v[0]);
-
-            // Convert the float into a Rademacher entry (-1 or 1)
-            buff[i] = rand_value < 0.5 ? -1 : 1;
-        }
-    }
-
-    template<SignedInteger sint_t, typename RNG = r123::Philox4x32>
-    RandBLAS::RNGState<RNG> generate_rademacher_vector_r123(sint_t* buff, int64_t n, RandBLAS::RNGState<RNG> seed_state) {
+    RNGState<RNG> generate_rademacher_vector_r123(sint_t* buff, int64_t n, RNGState<RNG> seed_state) {
         RNG rng;
         auto [ctr, key] = seed_state;
 
@@ -65,14 +42,14 @@ namespace RandBLAS {
         }
 
         // Return the updated RNGState (with the incremented counter)
-        return RandBLAS::RNGState<RNG> {ctr, key};
+        return RNGState<RNG> {ctr, key};
     }
 
     // Catch-all method for applying the diagonal Rademacher
     // entries in-place to an input matrix, `A`
     template<typename T, SignedInteger sint_t>
     void apply_diagonal_rademacher(
-                                bool left,
+                                bool left, // Pre-multiplying?
                                 blas::Layout layout,
                                 int64_t rows,
                                 int64_t cols,
@@ -82,37 +59,37 @@ namespace RandBLAS {
         //TODO: Investigate better schemes for performing the scaling
         //TODO: Move to `RandBLAS/util.hh`
         if(left && layout == blas::Layout::ColMajor) {
-            for(int64_t col=0; col < cols; col++) {
-                if(diag[col] > 0)
+            for(int64_t row = 0; row < rows; row++) {
+                if(diag[row] > 0)
                     continue;
-                blas::scal(rows, diag[col], &A[col * rows], 1);
+                blas::scal(cols, diag[row], &A[row], rows);
             }
         }
         else if(left && layout == blas::Layout::RowMajor) {
-            for(int64_t col=0; col < cols; col++) {
-                if(diag[col] > 0)
+            for(int64_t row = 0; row < rows; row++) {
+                if(diag[row] > 0)
                     continue;
-                blas::scal(rows, diag[col], &A[col], cols);
+                blas::scal(cols, diag[row], &A[row * cols], 1);
             }
         }
         else if(!left && layout == blas::Layout::ColMajor) {
-            for(int64_t row = 0; row < rows; row++) {
-                if(diag[row] > 0)
+            for(int64_t col=0; col < cols; col++) {
+                if(diag[col] > 0)
                     continue;
-                blas::scal(cols, diag[row], &A[row], rows);
+                blas::scal(rows, diag[col], &A[col * rows], 1);
             }
         }
         else {
-            for(int64_t row = 0; row < rows; row++) {
-                if(diag[row] > 0)
+            for(int64_t col=0; col < cols; col++) {
+                if(diag[col] > 0)
                     continue;
-                blas::scal(cols, diag[row], &A[row * cols], 1);
+                blas::scal(rows, diag[col], &A[col], cols);
             }
         }
     }
 
     template<typename T, SignedInteger sint_t>
-    void permuteRowsToTop(
+    void permute_rows_to_top(
                           blas::Layout layout,
                           int64_t rows,
                           int64_t cols,
@@ -147,7 +124,7 @@ namespace RandBLAS {
     }
 
     template<typename T, SignedInteger sint_t>
-    void permuteColsToLeft(
+    void permute_cols_to_left(
                           blas::Layout layout,
                           int64_t rows,
                           int64_t cols,
@@ -316,22 +293,22 @@ namespace RandBLAS {
 
     template <typename T>
     void fht_dispatch(
-        bool left,
+        bool left, // Pre-multiplying?
         blas::Layout layout,
-        T* buff,
-        int64_t log_n,
         int64_t num_rows,
-        int64_t num_cols
+        int64_t num_cols,
+        int64_t log_n,
+        T* A
         )
     {
         if(left && layout == blas::Layout::ColMajor)
-            fht_left_col_major(buff, log_n, num_rows, num_cols);
+            fht_left_col_major(A, log_n, num_rows, num_cols);
         else if(left && layout == blas::Layout::RowMajor)
-            fht_left_row_major(buff, log_n, num_rows, num_cols);
+            fht_left_row_major(A, log_n, num_rows, num_cols);
         else if(!left && layout == blas::Layout::ColMajor)
-            fht_right_col_major(buff, log_n, num_rows, num_cols);
+            fht_right_col_major(A, log_n, num_rows, num_cols);
         else
-            fht_right_row_major(buff, log_n, num_rows, num_cols);
+            fht_right_row_major(A, log_n, num_rows, num_cols);
     }
 }
 
@@ -345,9 +322,9 @@ namespace RandBLAS::trig {
  * d: The number of rows/columns that will be permuted by the action of $\Pi$
  */
 template <typename T, typename RNG = r123::Philox4x32, SignedInteger sint_t = int64_t>
-inline void lmiget(
+inline RandBLAS::RNGState<RNG> lmiget(
     blas::Layout layout,
-    RandBLAS::RNGState<RNG> random_state,
+    const RandBLAS::RNGState<RNG> &random_state,
     int64_t m, // `A` is `(m x n)`
     int64_t n,
     int64_t d, // `d` is the number of rows that have to be permuted by `\Pi`
@@ -362,11 +339,11 @@ inline void lmiget(
 
     //Step 1: Scale with `D`
         //Populating `diag`
-    generate_rademacher_vector_r123(diag, key[0], ctr[0], n);
+    generate_rademacher_vector_r123(diag, n, random_state);
     apply_diagonal_rademacher(true, layout, m, n, A, diag);
 
     //Step 2: Apply the Hadamard transform
-    fht_dispatch(true, layout, A, std::log2(MAX(m, n)), m, n);
+    fht_dispatch(true, layout, m, n, std::log2(MAX(m, n)), A);
 
     //Step 3: Permute the rows
     std::vector<sint_t> idxs_minor(d); // Placeholder
@@ -384,17 +361,19 @@ inline void lmiget(
         vals.data()         // Placeholder
     );
 
-    permuteRowsToTop(layout, m, n, selected_rows, d, A);
+    permute_rows_to_top(layout, m, n, selected_rows, d, A);
 
     free(diag);
     free(selected_rows);
+
+    return next_state;
 }
 
 
 template <typename T, typename RNG = r123::Philox4x32, SignedInteger sint_t = int64_t>
-inline void rmiget(
+inline RandBLAS::RNGState<RNG> rmiget(
     blas::Layout layout,
-    RandBLAS::RNGState<RNG> random_state,
+    const RandBLAS::RNGState<RNG> &random_state,
     int64_t m, // `A` is `(m x n)`
     int64_t n,
     int64_t d, // `d` is the number of cols that have to be permuted by `\Pi`
@@ -410,11 +389,11 @@ inline void rmiget(
 
     //Step 1: Scale with `D`
         //Populating `diag`
-    generate_rademacher_vector_r123(diag, key[0], ctr[0], n);
+    generate_rademacher_vector_r123(diag, n, random_state);
     apply_diagonal_rademacher(false, layout, m, n, A, diag);
 
     //Step 2: Apply the Hadamard transform
-    fht_dispatch(false, layout, A, std::log2(MAX(m, n)), m, n);
+    fht_dispatch(false, layout, m, n, std::log2(MAX(m, n)), A);
 
     //Step 3: Permute the rows
     std::vector<sint_t> idxs_minor(d); // Placeholder
@@ -432,9 +411,11 @@ inline void rmiget(
         vals.data()         // Placeholder
     );
 
-    permuteColsToLeft(layout, m, n, selected_cols, d, A);
+    permute_cols_to_left(layout, m, n, selected_cols, d, A);
 
     free(diag);
     free(selected_cols);
+
+    return next_state;
 }
 }
diff --git a/test/test_matmul_cores/test_trig.cc b/test/test_matmul_cores/test_trig.cc
index 3283764..d5dd9f1 100644
--- a/test/test_matmul_cores/test_trig.cc
+++ b/test/test_matmul_cores/test_trig.cc
@@ -42,8 +42,6 @@ using RandBLAS::trig::lmiget;
 using RandBLAS::trig::rmiget;
 using RandBLAS::generate_rademacher_vector_r123;
 using RandBLAS::apply_diagonal_rademacher;
-using RandBLAS::permuteRowsToTop;
-using RandBLAS::permuteColsToLeft;
 using RandBLAS::fht_dispatch;
 using Eigen::MatrixXd;
 
@@ -59,13 +57,14 @@ class TestLMIGET : public::testing::Test
 
     // Helper function for explicitly generating a Hadamard matrix
     // (Note that ColMajor and RowMajor storage is identical for `H`)
-    static std::vector<double> generate_hadamard(int64_t log_n) {
+    template <typename T>
+    static std::vector<T> generate_hadamard(int64_t log_n) {
         int64_t size = 1 << log_n;  // size = 2^n
-        std::vector<std::vector<double>> H(size, std::vector<double>(size, 1));  // Initialize H_1
+        std::vector<std::vector<T>> H(size, std::vector<T>(size, 1));  // Initialize H_1
 
         // Sylvester's construction: recursively build the matrix
         for (int n = 1; n <= log_n; ++n) {
-            double curr_size = 1 << n;  // Current size of the matrix is 2^n
+            T curr_size = 1 << n;  // Current size of the matrix is 2^n
             for (int i = 0; i < curr_size / 2; ++i) {
                 for (int j = 0; j < curr_size / 2; ++j) {
                     // Fill the bottom-left and bottom-right quadrants
@@ -77,7 +76,7 @@ class TestLMIGET : public::testing::Test
         }
 
         // Flatten into a vector in ColMajor order
-        std::vector<double> H_flat(size * size);
+        std::vector<T> H_flat(size * size);
 
         for (int col = 0; col < size; ++col) {
             for (int row = 0; row < size; ++row) {
@@ -88,16 +87,17 @@ class TestLMIGET : public::testing::Test
         return H_flat;
     }
 
-    static std::vector<double> generate_random_vector(int size, double lower_bound, double upper_bound) {
+    template <typename T>
+    static std::vector<T> generate_random_vector(int size, T lower_bound, T upper_bound) {
         // Create a random device and seed the random number generator
         std::random_device rd;
         std::mt19937 gen(rd());
 
-        // Define the distribution range for the random doubles
+        // Define the distribution range for the random Ts
         std::uniform_real_distribution<> dist(lower_bound, upper_bound);
 
         // Create a vector of the specified size
-        std::vector<double> random_vector(size);
+        std::vector<T> random_vector(size);
 
         // Generate random doubles and fill the vector
         for (int i = 0; i < size; ++i) {
@@ -118,10 +118,10 @@ class TestLMIGET : public::testing::Test
         int64_t n,
         bool left,
         blas::Layout layout,
-        double epsilon=1e-5
+        T epsilon=1e-5
     ) {
         // Grabbing a random matrix
-        std::vector<double> A_vec = generate_random_vector(m * n, 0.0, 10.0);
+        std::vector<T> A_vec = generate_random_vector(m * n, 0.0, 10.0);
         Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>(A_vec.data(), m, n);
         Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> A_row(A_vec.data(), m, n);
 
@@ -161,19 +161,19 @@ class TestLMIGET : public::testing::Test
 
             if(left) {
                 if(layout == blas::Layout::RowMajor)
-                    RandBLAS::permuteRowsToTop(layout, m, n, v, 1, A_row.data());
+                    RandBLAS::permute_rows_to_top(layout, m, n, v, 1, A_row.data());
                 else
-                    RandBLAS::permuteRowsToTop(layout, m, n, v, 1, A_col.data());
+                    RandBLAS::permute_rows_to_top(layout, m, n, v, 1, A_col.data());
             }
             else {
                 if(layout == blas::Layout::RowMajor)
-                    RandBLAS::permuteColsToLeft(layout, m, n, v, 1, A_row.data());
+                    RandBLAS::permute_cols_to_left(layout, m, n, v, 1, A_row.data());
                 else
-                    RandBLAS::permuteColsToLeft(layout, m, n, v, 1, A_col.data());
+                    RandBLAS::permute_cols_to_left(layout, m, n, v, 1, A_col.data());
             }
 
             // Or just do A.isApprox(B)
-            double norm_permute = 0.0;
+            T norm_permute = 0.0;
             if(left) {
                 if(layout == blas::Layout::RowMajor)
                     norm_permute = (A_row - perm * B).norm();
@@ -197,16 +197,16 @@ class TestLMIGET : public::testing::Test
 
             int ld = (left) ? m : n;
             if(layout == blas::Layout::ColMajor)
-                RandBLAS::fht_dispatch(left, layout, A_col.data(), std::log2(ld), m, n);
+                RandBLAS::fht_dispatch(left, layout, m, n, std::log2(ld), A_col.data());
             else
-                RandBLAS::fht_dispatch(left, layout, A_row.data(), std::log2(ld), m, n);
+                RandBLAS::fht_dispatch(left, layout, m, n, std::log2(ld), A_row.data());
 
-            std::vector<double> H_vec = generate_hadamard(std::log2(ld));
+            std::vector<T> H_vec = generate_hadamard<double>(std::log2(ld));
             //TODO: Should have a check here to enforce that `m` and `n` are powers of 2 (since
             // my `generate_hadamard` function does not take care to pad an input matrix)
             MatrixXd H = Eigen::Map<MatrixXd>(H_vec.data(), int(std::pow(2, std::log2(ld))), int(std::pow(2, std::log2(ld))));
 
-            double norm_hadamard = 0.0;
+            T norm_hadamard = 0.0;
             if(left) {
             if(layout == blas::Layout::RowMajor)
                 norm_hadamard = (A_row - H * B).norm();
@@ -228,7 +228,7 @@ class TestLMIGET : public::testing::Test
             // Scales all rows/cols by -1 and checks if A == -A
             std::vector<sint_t> buff = left ? std::vector<sint_t>(n, -1) : std::vector<sint_t>(m, -1);
 
-            double norm_diag = 0.0;
+            T norm_diag = 0.0;
             if(layout == blas::Layout::RowMajor) {
             RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
             norm_diag = (A_row + B).norm();
@@ -250,41 +250,43 @@ class TestLMIGET : public::testing::Test
         uint32_t seed,
         int64_t m, // Generated data matrix, `A` is of size `(m x n)`
         int64_t n,
-        int64_t d, // #rows/cols that will be permuted
+        int64_t d,
         bool left,
         blas::Layout layout,
-        double epsilon=1e-5
+        T epsilon=1e-5
     ) {
         // Grabbing a random matrix
-        std::vector<double> A_vec = generate_random_vector(m * n, 0.0, 10.0);
+        std::vector<T> A_vec = generate_random_vector(m * n, 0.0, 10.0);
+        // Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col(m, n);
         Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>(A_vec.data(), m, n);
+        // A_col.setRandom();
         Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> A_row(A_vec.data(), m, n);
 
         // Deep copy
         MatrixXd B;
         if(layout == blas::Layout::RowMajor)
-            B = A_row;
+        B = A_row;
         else
-            B = A_col;
+        B = A_col;
 
         //// Performing \Pi H D
         // Step 1: setup the diagonal scaling
-        std::vector<sint_t> buff = left ? std::vector<sint_t>(n, -1) : std::vector<sint_t>(m, -1);
+        std::vector<sint_t> buff = left ? std::vector<sint_t>(m, -1) : std::vector<sint_t>(n, -1);
 
         if(layout == blas::Layout::RowMajor) {
-            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
+        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
         }
         else {
-            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
+        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
         }
 
         // Step 2: apply the hadamard transform
         int ld = (left) ? m : n;
         if(layout == blas::Layout::ColMajor){
-            RandBLAS::fht_dispatch(left, layout, A_col.data(), int(std::log2(ld)), m, n);
+            RandBLAS::fht_dispatch(left, layout, m, n, int(std::log2(ld)), A_col.data());
         }
         else {
-            RandBLAS::fht_dispatch(left, layout, A_row.data(), int(std::log2(ld)), m, n);
+            RandBLAS::fht_dispatch(left, layout, m, n, int(std::log2(ld)), A_row.data());
         }
 
         // Step 3: Permuting
@@ -293,15 +295,15 @@ class TestLMIGET : public::testing::Test
         std::iota(indices.begin(), indices.end(), 1);
         if(left) {
             if(layout == blas::Layout::RowMajor)
-                RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_row.data());
+            RandBLAS::permute_rows_to_top(layout, m, n, indices.data(), d, A_row.data());
             else
-                RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_col.data());
+            RandBLAS::permute_rows_to_top(layout, m, n, indices.data(), d, A_col.data());
         }
         else {
             if(layout == blas::Layout::RowMajor)
-                RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_row.data());
+            RandBLAS::permute_cols_to_left(layout, m, n, indices.data(), d, A_row.data());
             else
-                RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_col.data());
+            RandBLAS::permute_cols_to_left(layout, m, n, indices.data(), d, A_col.data());
         }
 
         //// Performing D H \Pi
@@ -311,42 +313,42 @@ class TestLMIGET : public::testing::Test
 
         if(left) {
             if(layout == blas::Layout::RowMajor)
-                RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_row.data());
+            RandBLAS::permute_rows_to_top(layout, m, n, indices.data(), d, A_row.data());
             else
-                RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, A_col.data());
+            RandBLAS::permute_rows_to_top(layout, m, n, indices.data(), d, A_col.data());
         }
         else {
             if(layout == blas::Layout::RowMajor)
-                RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_row.data());
+            RandBLAS::permute_cols_to_left(layout, m, n, indices.data(), d, A_row.data());
             else
-                RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, A_col.data());
+            RandBLAS::permute_cols_to_left(layout, m, n, indices.data(), d, A_col.data());
         }
 
         // Step-2: Apply H^{-1}
         if(layout == blas::Layout::ColMajor) {
-            RandBLAS::fht_dispatch(left, layout, A_col.data(), int(std::log2(ld)), m, n);
+            RandBLAS::fht_dispatch(left, layout, m, n, int(std::log2(ld)), A_col.data());
             A_col = A_col * 1/std::pow(2, int(std::log2(ld)));
         }
         else {
-            RandBLAS::fht_dispatch(left, layout, A_row.data(), int(std::log2(ld)), m, n);
+            RandBLAS::fht_dispatch(left, layout, m, n, int(std::log2(ld)), A_row.data());
             A_row = A_row * 1/std::pow(2, int(std::log2(ld)));
         }
 
         //Step-3: Inverting `D`
         if(layout == blas::Layout::RowMajor) {
-            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
+        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
         }
         else {
-            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
+        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
         }
 
-        double norm_inverse = 0.0;
+        T norm_inverse = 0.0;
 
         if(layout == blas::Layout::RowMajor) {
-            norm_inverse = (A_row - B).norm();
+        norm_inverse = (A_row - B).norm();
         }
         else {
-            norm_inverse = (A_col - B).norm();
+        norm_inverse = (A_col - B).norm();
         }
 
         randblas_require(norm_inverse < epsilon);
@@ -507,12 +509,20 @@ TEST_F(TestLMIGET, test_hadamard_right_rowmajor) {
         );
 }
 
+////////////////////////////////////////////////////////////////////////
+//
+//
+//      Verifying invertibility of the transform
+//
+//
+////////////////////////////////////////////////////////////////////////
+
 TEST_F(TestLMIGET, test_inverse_left_colmajor) {
     for(uint32_t seed: keys)
         inverse_transform<double>(
             seed,
             128,
-            128,
+            100,
             25,
             true,
             blas::Layout::ColMajor
@@ -536,7 +546,7 @@ TEST_F(TestLMIGET, test_inverse_left_rowmajor) {
         inverse_transform<double>(
             seed,
             128,
-            128,
+            100,
             25,
             true,
             blas::Layout::RowMajor

From 0bcd76e93675ba3e6ec9e3ea39eaeab9d97ff836 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Fri, 25 Oct 2024 13:09:27 -0700
Subject: [PATCH 23/26] `HadamardMixingOp`, `lmiget|rmiget` -> `miget`

---
 RandBLAS/trig_skops.hh              | 213 ++++++++++++++++------------
 test/test_matmul_cores/test_trig.cc | 127 ++++++++++++++++-
 2 files changed, 246 insertions(+), 94 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index 22f8311..f2a5245 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -10,23 +10,24 @@
 #include <stdio.h>
 #include <stdexcept>
 #include <string>
+#include <sys/_types/_int64_t.h>
 #include <tuple>
 
 #include <math.h>
 #include <typeinfo>
+#include <vector>
 
 #define MAX(a, b) (((a) < (b)) ? (b) : (a))
 #define MIN(a, b) (((a) < (b)) ? (a) : (b))
 
 namespace RandBLAS {
-
     // =============================================================================
     /// WARNING: None of the following functions or overloads thereof are part of the
     /// public API
     ///
 
     // Generates a vector of Rademacher entries using the Random123 library
-    template<SignedInteger sint_t, typename RNG = r123::Philox4x32>
+    template<SignedInteger sint_t = int64_t, typename RNG = DefaultRNG>
     RNGState<RNG> generate_rademacher_vector_r123(sint_t* buff, int64_t n, RNGState<RNG> seed_state) {
         RNG rng;
         auto [ctr, key] = seed_state;
@@ -47,7 +48,7 @@ namespace RandBLAS {
 
     // Catch-all method for applying the diagonal Rademacher
     // entries in-place to an input matrix, `A`
-    template<typename T, SignedInteger sint_t>
+    template<typename T, SignedInteger sint_t = int64_t>
     void apply_diagonal_rademacher(
                                 bool left, // Pre-multiplying?
                                 blas::Layout layout,
@@ -88,7 +89,7 @@ namespace RandBLAS {
         }
     }
 
-    template<typename T, SignedInteger sint_t>
+    template<typename T, SignedInteger sint_t = int64_t>
     void permute_rows_to_top(
                           blas::Layout layout,
                           int64_t rows,
@@ -102,28 +103,28 @@ namespace RandBLAS {
         //TODO: discuss precise semantics of `selected_rows` in this function
         if(layout == blas::Layout::ColMajor) {
             for (int64_t i=0; i < d; i++) {
-                randblas_error_if_msg(selected_rows[i] == top,
-                                      "The list of provided indices should be unique");
                 if (selected_rows[i] != top) {
                     // Use BLAS swap to swap the entire rows
                     // Swapping row 'selected' with row 'top'
                     blas::swap(cols, &A[top], rows, &A[selected_rows[i]], rows);
                 }
+                else
+                    continue;
             }
         }
         else {
             // For `RowMajor` ordering
             for (int64_t i=0; i < d; i++) {
-                randblas_error_if_msg(selected_rows[i] == top,
-                                      "The list of provided indices should be unique");
                 if (selected_rows[i] != top) {
                     blas::swap(cols, &A[cols * selected_rows[i]], 1, &A[cols * top], 1);
                 }
+                else
+                    continue;
             }
         }
     }
 
-    template<typename T, SignedInteger sint_t>
+    template<typename T, SignedInteger sint_t = int64_t>
     void permute_cols_to_left(
                           blas::Layout layout,
                           int64_t rows,
@@ -141,6 +142,8 @@ namespace RandBLAS {
                     // Swapping col 'selected' with col 'top'
                     blas::swap(rows, &A[rows * selected_cols[i]], 1, &A[rows * left], 1);
                 }
+                else
+                    continue;
             }
         }
         else {
@@ -149,6 +152,8 @@ namespace RandBLAS {
                 if (selected_cols[i] != left) {
                     blas::swap(rows, &A[selected_cols[i]], cols, &A[left], cols);
                 }
+                else
+                    continue;
             }
         }
     }
@@ -314,6 +319,86 @@ namespace RandBLAS {
 
 
 namespace RandBLAS::trig {
+template <SignedInteger sint_t = int64_t>
+struct HadamardMixingOp{
+    sint_t* diag_scale;
+    sint_t* selected_idxs;
+    blas::Layout layout;
+    int64_t m;
+    int64_t n;
+    int64_t d;
+    bool left;
+    bool filled = false; // will be updated by `miget`
+
+    // Constructor
+    HadamardMixingOp(bool left,
+                     blas::Layout layout,
+                     int64_t m,
+                     int64_t n,
+                     int64_t d
+                    ) : left(left), layout(layout), m(m), n(n), d(d) {
+        if(left)
+            diag_scale = new sint_t[m];
+        else
+            diag_scale = new sint_t[n];
+        selected_idxs = new sint_t[d];
+    }
+
+    // Destructor
+    ~HadamardMixingOp() {
+        free(this->diag_scale);
+        free(this->selected_idxs);
+    }
+
+    private:
+};
+
+/*
+ * A free-function that performs an inversion of a matrix transformed by `lmiget | rmiget`
+ * the inversion is also performed in-place
+*/
+template <typename T, SignedInteger sint_t = int64_t>
+void invert(
+    HadamardMixingOp<sint_t> &hmo, // details about the transform
+    T* SA // sketched matrix
+) {
+    // We have to make sure we apply the operation in the inverted order
+    // with the operations appropriately inverted
+    randblas_error_if_msg(!hmo.filled, "You are trying to call `invert` on an untransformed matrix,\
+                                       please call `miget` on your matrix before calling `invert`");
+
+    // Creating a vector out of `selected_idxs` to be able to conveniently reverse
+    std::vector<sint_t> selected_idxs(hmo.selected_idxs, hmo.selected_idxs + hmo.d);
+    // Reversing the indices for performing the inverse
+    std::reverse(selected_idxs.begin(), selected_idxs.end());
+
+    //Step 1: Permute the rows/cols
+        // Perform the permutation (for the way we define permutations
+        // invSelected_idxs = reverse(Selected_idxs))
+    if(hmo.left)
+        permute_rows_to_top(hmo.layout, hmo.m, hmo.n, selected_idxs.data(), hmo.d, SA);
+    else
+        permute_cols_to_left(hmo.layout, hmo.m, hmo.n, selected_idxs.data(), hmo.d, SA);
+
+    //Step 2: Apply the Hadamard transform (invH = H.T = H)
+    int ld = (hmo.left) ? hmo.m : hmo.n;
+
+    T log_sz = std::log2(ld);
+    T log_int_sz, log_final_sz;
+    T log_frac_sz = std::modf(log_sz, &log_int_sz);
+
+    if(log_frac_sz < 1e-3)
+        log_final_sz = log_int_sz;
+    else
+        log_final_sz = log_int_sz + 1;
+
+    fht_dispatch(hmo.left, hmo.layout, hmo.m, hmo.n, log_final_sz, SA);
+    blas::scal(hmo.m * hmo.n, 1/std::pow(2, int(std::log2(ld))), SA, 1);
+
+    //Step 3: Scale with `D` (invD = D for rademacher entries)
+    apply_diagonal_rademacher(hmo.left, hmo.layout, hmo.m, hmo.n, SA, hmo.diag_scale);
+}
+
 /*
  * These functions apply an in-place, SRHT-like transform to the input matrix
  * i.e. A <- (\Pi H D)A OR A <- A(D H \Pi) (which is equivalent to A <- A(\Pi H D)^{-1})
@@ -321,100 +406,48 @@ namespace RandBLAS::trig {
  * A: (m x n), input dimensions of `A`
  * d: The number of rows/columns that will be permuted by the action of $\Pi$
  */
-template <typename T, typename RNG = r123::Philox4x32, SignedInteger sint_t = int64_t>
-inline RandBLAS::RNGState<RNG> lmiget(
-    blas::Layout layout,
-    const RandBLAS::RNGState<RNG> &random_state,
-    int64_t m, // `A` is `(m x n)`
-    int64_t n,
-    int64_t d, // `d` is the number of rows that have to be permuted by `\Pi`
-    T* A // data-matrix
+template <typename T, typename RNG = DefaultRNG, SignedInteger sint_t = int64_t>
+inline RNGState<RNG> miget(
+    HadamardMixingOp<sint_t> &hmo, // All information about `A` && the $\mathbb{\Pi\text{RHT}}$
+    const RNGState<RNG> &random_state,
+    T* A // The data-matrix
 ) {
-    // Size of the Rademacher entries = |A_cols|
-    //TODO: Change `diag` to float/doubles (same data type as the matrix)
-    sint_t* diag = new sint_t[n];
-    sint_t* selected_rows = new sint_t[d];
-
     auto [ctr, key] = random_state;
 
     //Step 1: Scale with `D`
         //Populating `diag`
-    generate_rademacher_vector_r123(diag, n, random_state);
-    apply_diagonal_rademacher(true, layout, m, n, A, diag);
+    RNGState<RNG> state_idxs = generate_rademacher_vector_r123(hmo.diag_scale, hmo.n, random_state);
+    apply_diagonal_rademacher(hmo.left, hmo.layout, hmo.m, hmo.n, A, hmo.diag_scale);
 
     //Step 2: Apply the Hadamard transform
-    fht_dispatch(true, layout, m, n, std::log2(MAX(m, n)), A);
+    int ld = (hmo.left) ? hmo.m : hmo.n;
+    T log_sz = std::log2(ld);
+    T log_int_sz, log_final_sz;
+    T log_frac_sz = std::modf(log_sz, &log_int_sz);
 
-    //Step 3: Permute the rows
-    std::vector<sint_t> idxs_minor(d); // Placeholder
-    std::vector<T> vals(d); // Placeholder
-
-    // Populating `selected_rows`
-        //TODO: Do I return this at some point?
-    RandBLAS::RNGState<RNG> next_state = RandBLAS::repeated_fisher_yates<T, RNG, sint_t>(
-        random_state,
-        d,         // Number of samples (vec_nnz)
-        m,         // Total number of elements (dim_major)
-        1,         // Single sample round (dim_minor)
-        selected_rows,  // Holds the required output
-        idxs_minor.data(),  // Placeholder
-        vals.data()         // Placeholder
-    );
-
-    permute_rows_to_top(layout, m, n, selected_rows, d, A);
-
-    free(diag);
-    free(selected_rows);
-
-    return next_state;
-}
-
-
-template <typename T, typename RNG = r123::Philox4x32, SignedInteger sint_t = int64_t>
-inline RandBLAS::RNGState<RNG> rmiget(
-    blas::Layout layout,
-    const RandBLAS::RNGState<RNG> &random_state,
-    int64_t m, // `A` is `(m x n)`
-    int64_t n,
-    int64_t d, // `d` is the number of cols that have to be permuted by `\Pi`
-    T* A // data-matrix
-)
-{
-    // Size of the Rademacher entries = |A_cols|
-    //TODO: Change `diag` to float/doubles (same data type as the matrix)
-    sint_t* diag = new sint_t[m];
-    sint_t* selected_cols = new sint_t[d];
-
-    auto [ctr, key] = random_state;
-
-    //Step 1: Scale with `D`
-        //Populating `diag`
-    generate_rademacher_vector_r123(diag, n, random_state);
-    apply_diagonal_rademacher(false, layout, m, n, A, diag);
-
-    //Step 2: Apply the Hadamard transform
-    fht_dispatch(false, layout, m, n, std::log2(MAX(m, n)), A);
+    if(log_frac_sz < 1e-3)
+        log_final_sz = log_int_sz;
+    else
+        log_final_sz = log_int_sz + 1;
+    fht_dispatch(hmo.left, hmo.layout, hmo.m, hmo.n, log_final_sz, A);
 
     //Step 3: Permute the rows
-    std::vector<sint_t> idxs_minor(d); // Placeholder
-    std::vector<T> vals(d); // Placeholder
-
-    // Populating `selected_rows`
-        //TODO: Do I return this at some point?
-    RandBLAS::RNGState<RNG> next_state = RandBLAS::repeated_fisher_yates<T, RNG, sint_t>(
-        random_state,
-        d,         // Number of samples (vec_nnz)
-        m,         // Total number of elements (dim_major)
-        1,         // Single sample round (dim_minor)
-        selected_cols,  // Holds the required output
-        idxs_minor.data(),  // Placeholder
-        vals.data()         // Placeholder
+        // Uniformly samples `d` entries from the index set [0, ..., m - 1]
+    RNGState<RNG> next_state = repeated_fisher_yates<sint_t>(
+        hmo.d,
+        hmo.m,
+        1,
+        hmo.selected_idxs,
+        state_idxs
     );
 
-    permute_cols_to_left(layout, m, n, selected_cols, d, A);
+    if(hmo.left)
+        permute_rows_to_top(hmo.layout, hmo.m, hmo.n, hmo.selected_idxs, hmo.d, A);
+    else
+        permute_cols_to_left(hmo.layout, hmo.m, hmo.n, hmo.selected_idxs, hmo.d, A);
 
-    free(diag);
-    free(selected_cols);
+    // `invert` can now be called with this instance of `HadamardMixingOp`
+    hmo.filled = true;
 
     return next_state;
 }
diff --git a/test/test_matmul_cores/test_trig.cc b/test/test_matmul_cores/test_trig.cc
index d5dd9f1..ca84258 100644
--- a/test/test_matmul_cores/test_trig.cc
+++ b/test/test_matmul_cores/test_trig.cc
@@ -27,7 +27,7 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //
 
-#include "RandBLAS.hh"
+// #include "RandBLAS.hh"
 #include "RandBLAS/base.hh"
 #include "RandBLAS/trig_skops.hh"
 #include <blas.hh>
@@ -53,7 +53,7 @@ class TestLMIGET : public::testing::Test
 
     virtual void TearDown(){};
 
-    inline static std::vector<uint32_t> keys {0, 42};
+    inline static std::vector<uint32_t> keys {42};
 
     // Helper function for explicitly generating a Hadamard matrix
     // (Note that ColMajor and RowMajor storage is identical for `H`)
@@ -118,7 +118,7 @@ class TestLMIGET : public::testing::Test
         int64_t n,
         bool left,
         blas::Layout layout,
-        T epsilon=1e-5
+        T epsilon=RandBLAS::sqrt_epsilon<T>()
     ) {
         // Grabbing a random matrix
         std::vector<T> A_vec = generate_random_vector(m * n, 0.0, 10.0);
@@ -253,7 +253,7 @@ class TestLMIGET : public::testing::Test
         int64_t d,
         bool left,
         blas::Layout layout,
-        T epsilon=1e-5
+        T epsilon=RandBLAS::sqrt_epsilon<T>()
     ) {
         // Grabbing a random matrix
         std::vector<T> A_vec = generate_random_vector(m * n, 0.0, 10.0);
@@ -355,6 +355,68 @@ class TestLMIGET : public::testing::Test
 
 
     }
+
+    template <typename T, typename RNG = RandBLAS::DefaultRNG, RandBLAS::SignedInteger sint_t = int64_t>
+    static void drivers_inverse(
+        uint32_t seed,
+        int64_t m,
+        int64_t n,
+        int64_t d,
+        int64_t left,
+        blas::Layout layout,
+        T epsilon=RandBLAS::sqrt_epsilon<T>()
+    ) {
+        // Grabbing a random matrix
+        std::vector<T> A_vec = generate_random_vector(m * n, 0.0, 10.0);
+        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>(A_vec.data(), m, n);
+        Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> A_row(A_vec.data(), m, n);
+        RandBLAS::RNGState<RNG> seed_state(seed);
+
+        // Deep copy
+        MatrixXd B;
+        if(layout == blas::Layout::RowMajor)
+            B = A_row;
+        else
+            B = A_col;
+
+        // Aggregating information about the matrix
+        RandBLAS::trig::HadamardMixingOp<> hmo(
+            left,
+            layout,
+            m,
+            n,
+            d
+        );
+
+        // Sketching the matrix
+        if(layout == blas::Layout::RowMajor) {
+            RandBLAS::trig::miget(hmo, seed_state, A_row.data());
+            RandBLAS::trig::invert(hmo, A_row.data());
+        }
+        else {
+            // std::cout << A_col << std::endl;
+            // std::cout << "<-------x------->" << std::endl;
+            RandBLAS::trig::miget(hmo, seed_state, A_col.data());
+            // for(int i = 0; i < d; i ++)
+            //     std::cout << hmo.selected_idxs[i] << std::endl;
+            // std::cout << A_col << std::endl;
+            // std::cout << "<-------x------->" << std::endl;
+            RandBLAS::trig::invert(hmo, A_col.data());
+            // std::cout << A_col << std::endl;
+            // std::cout << "<-------x------->" << std::endl;
+        }
+
+        T norm_inverse = 0.0;
+
+        if(layout == blas::Layout::RowMajor) {
+            norm_inverse = (A_row - B).norm();
+        }
+        else {
+            norm_inverse = (A_col - B).norm();
+        }
+
+        randblas_require(norm_inverse < epsilon);
+    }
 };
 
 ////////////////////////////////////////////////////////////////////////
@@ -564,3 +626,60 @@ TEST_F(TestLMIGET, test_inverse_right_rowmajor) {
             blas::Layout::RowMajor
         );
 }
+
+////////////////////////////////////////////////////////////////////////
+//
+//
+//      Verifying correctness (in invertibility) of user-facing functions
+//
+//
+////////////////////////////////////////////////////////////////////////
+
+
+TEST_F(TestLMIGET, test_user_inverse_left_colmajor) {
+    for(uint32_t seed: keys)
+        drivers_inverse<double>(
+            seed,
+            128,
+            128,
+            25,
+            true,
+            blas::Layout::ColMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_user_inverse_right_colmajor) {
+    for(uint32_t seed: keys)
+        drivers_inverse<double>(
+            seed,
+            128,
+            128,
+            25,
+            false,
+            blas::Layout::ColMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_user_inverse_left_rowmajor) {
+    for(uint32_t seed: keys)
+        drivers_inverse<double>(
+            seed,
+            128,
+            128,
+            25,
+            true,
+            blas::Layout::RowMajor
+        );
+}
+
+TEST_F(TestLMIGET, test_user_inverse_right_rowmajor) {
+    for(uint32_t seed: keys)
+        drivers_inverse<double>(
+            seed,
+            128,
+            128,
+            25,
+            false,
+            blas::Layout::RowMajor
+        );
+}

From 6da14634d608c7f2ee2f5e886397b4abdba53c46 Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Fri, 25 Oct 2024 13:11:36 -0700
Subject: [PATCH 24/26] Removed comments

---
 test/test_matmul_cores/test_trig.cc | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/test/test_matmul_cores/test_trig.cc b/test/test_matmul_cores/test_trig.cc
index ca84258..67555be 100644
--- a/test/test_matmul_cores/test_trig.cc
+++ b/test/test_matmul_cores/test_trig.cc
@@ -394,16 +394,8 @@ class TestLMIGET : public::testing::Test
             RandBLAS::trig::invert(hmo, A_row.data());
         }
         else {
-            // std::cout << A_col << std::endl;
-            // std::cout << "<-------x------->" << std::endl;
             RandBLAS::trig::miget(hmo, seed_state, A_col.data());
-            // for(int i = 0; i < d; i ++)
-            //     std::cout << hmo.selected_idxs[i] << std::endl;
-            // std::cout << A_col << std::endl;
-            // std::cout << "<-------x------->" << std::endl;
             RandBLAS::trig::invert(hmo, A_col.data());
-            // std::cout << A_col << std::endl;
-            // std::cout << "<-------x------->" << std::endl;
         }
 
         T norm_inverse = 0.0;

From 6ade6168f442b70e48dff62d6df78f040dc5351c Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Mon, 4 Nov 2024 10:13:59 -0800
Subject: [PATCH 25/26] BIG, removes Eigen-test dep. && review

---
 RandBLAS/trig_skops.hh              |  50 ++-
 test/test_matmul_cores/test_trig.cc | 474 +++++++++++++---------------
 2 files changed, 232 insertions(+), 292 deletions(-)

diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
index f2a5245..7548191 100644
--- a/RandBLAS/trig_skops.hh
+++ b/RandBLAS/trig_skops.hh
@@ -1,7 +1,7 @@
 #include "RandBLAS/base.hh"
 #include "RandBLAS/exceptions.hh"
 #include "RandBLAS/random_gen.hh"
-#include <RandBLAS/sparse_skops.hh>
+#include "RandBLAS/sparse_skops.hh"
 
 #include <Random123/philox.h>
 #include <blas.hh>
@@ -27,13 +27,13 @@ namespace RandBLAS {
     ///
 
     // Generates a vector of Rademacher entries using the Random123 library
-    template<SignedInteger sint_t = int64_t, typename RNG = DefaultRNG>
-    RNGState<RNG> generate_rademacher_vector_r123(sint_t* buff, int64_t n, RNGState<RNG> seed_state) {
-        RNG rng;
+    template<SignedInteger sint_t = int64_t, typename state_t = RNGState<DefaultRNG>>
+    state_t generate_rademacher_vector_r123(sint_t* buff, int64_t n, state_t &seed_state) {
+        DefaultRNG rng;
         auto [ctr, key] = seed_state;
 
         for (int64_t i = 0; i < n; ++i) {
-            typename RNG::ctr_type r = rng(ctr, key);
+            typename DefaultRNG::ctr_type r = rng(ctr, key);
 
             float rand_value = r123::u01fixedpt<float>(r.v[0]);
 
@@ -43,7 +43,7 @@ namespace RandBLAS {
         }
 
         // Return the updated RNGState (with the incremented counter)
-        return RNGState<RNG> {ctr, key};
+        return state_t {ctr, key};
     }
 
     // Catch-all method for applying the diagonal Rademacher
@@ -107,9 +107,7 @@ namespace RandBLAS {
                     // Use BLAS swap to swap the entire rows
                     // Swapping row 'selected' with row 'top'
                     blas::swap(cols, &A[top], rows, &A[selected_rows[i]], rows);
-                }
-                else
-                    continue;
+                } // else, continue;
             }
         }
         else {
@@ -117,9 +115,7 @@ namespace RandBLAS {
             for (int64_t i=0; i < d; i++) {
                 if (selected_rows[i] != top) {
                     blas::swap(cols, &A[cols * selected_rows[i]], 1, &A[cols * top], 1);
-                }
-                else
-                    continue;
+                } // else, continue;
             }
         }
     }
@@ -142,8 +138,6 @@ namespace RandBLAS {
                     // Swapping col 'selected' with col 'top'
                     blas::swap(rows, &A[rows * selected_cols[i]], 1, &A[rows * left], 1);
                 }
-                else
-                    continue;
             }
         }
         else {
@@ -152,8 +146,6 @@ namespace RandBLAS {
                 if (selected_cols[i] != left) {
                     blas::swap(rows, &A[selected_cols[i]], cols, &A[left], cols);
                 }
-                else
-                    continue;
             }
         }
     }
@@ -208,8 +200,8 @@ namespace RandBLAS {
                     for (int64_t k = 0; k < s1; ++k) {
                         // For implicitly padding the input we just have to make sure
                         // we replace all out-of-bounds accesses with zeros
-                        bool b1 = j + k < num_rows;
-                        bool b2 = j + k + s1 < num_rows;
+                        bool b1 = (j + k) * num_cols + col < num_rows * num_cols;
+                        bool b2 = (j + k + s1) * num_cols + col < num_rows * num_cols;
                         T u = b1 ? buf[(j + k) * num_cols + col] : 0;
                         T v = b2 ? buf[(j + k + s1) * num_cols + col] : 0;
                         if(b1 && b2) {
@@ -277,8 +269,8 @@ namespace RandBLAS {
                     for (int64_t k = 0; k < s1; ++k) {
                         // For implicitly padding the input we just have to make sure
                         // we replace all out-of-bounds accesses with zeros
-                        bool b1 = j + k < num_cols;
-                        bool b2 = j + k + s1 < num_cols;
+                        bool b1 = (j + k) * num_rows + row < num_cols * num_rows;
+                        bool b2 = (j + k + s1) * num_rows + row < num_cols * num_rows;
                         T u = b1 ? buf[(j + k) * num_rows + row] : 0;
                         T v = b2 ? buf[(j + k + s1) * num_rows + row] : 0;
                         if(b1 && b2) {
@@ -346,8 +338,8 @@ struct HadamardMixingOp{
 
     // Destructor
     ~HadamardMixingOp() {
-        free(this->diag_scale);
-        free(this->selected_idxs);
+        delete [] this->diag_scale;
+        delete [] this->selected_idxs;
     }
 
     private:
@@ -358,7 +350,7 @@ struct HadamardMixingOp{
  * the inversion is also performed in-place
 */
 template <typename T, SignedInteger sint_t = int64_t>
-void invert(
+void invert_hadamard(
     HadamardMixingOp<sint_t> &hmo, // details about the transform
     T* SA // sketched matrix
 ) {
@@ -406,17 +398,17 @@ void invert(
  * A: (m x n), input dimensions of `A`
  * d: The number of rows/columns that will be permuted by the action of $\Pi$
  */
-template <typename T, typename RNG = DefaultRNG, SignedInteger sint_t = int64_t>
-inline RNGState<RNG> miget(
+template <typename T, typename state_t = RNGState<DefaultRNG>, SignedInteger sint_t = int64_t>
+inline state_t miget(
     HadamardMixingOp<sint_t> &hmo, // All information about `A` && the $\mathbb{\Pi\text{RHT}}$
-    const RNGState<RNG> &random_state,
+    const state_t &random_state,
     T* A // The data-matrix
 ) {
     auto [ctr, key] = random_state;
 
     //Step 1: Scale with `D`
         //Populating `diag`
-    RNGState<RNG> state_idxs = generate_rademacher_vector_r123(hmo.diag_scale, hmo.n, random_state);
+    auto next_state = generate_rademacher_vector_r123(hmo.diag_scale, hmo.n, random_state);
     apply_diagonal_rademacher(hmo.left, hmo.layout, hmo.m, hmo.n, A, hmo.diag_scale);
 
     //Step 2: Apply the Hadamard transform
@@ -433,12 +425,12 @@ inline RNGState<RNG> miget(
 
     //Step 3: Permute the rows
         // Uniformly samples `d` entries from the index set [0, ..., m - 1]
-    RNGState<RNG> next_state = repeated_fisher_yates<sint_t>(
+    next_state = repeated_fisher_yates<sint_t>(
         hmo.d,
         hmo.m,
         1,
         hmo.selected_idxs,
-        state_idxs
+        next_state
     );
 
     if(hmo.left)
diff --git a/test/test_matmul_cores/test_trig.cc b/test/test_matmul_cores/test_trig.cc
index 67555be..8cf04c3 100644
--- a/test/test_matmul_cores/test_trig.cc
+++ b/test/test_matmul_cores/test_trig.cc
@@ -27,25 +27,16 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //
 
-// #include "RandBLAS.hh"
+#include "RandBLAS.hh"
 #include "RandBLAS/base.hh"
 #include "RandBLAS/trig_skops.hh"
+#include "RandBLAS/util.hh"
 #include <blas.hh>
 
-#include <Eigen/Core>
-#include <Eigen/Dense>
 #include <cmath>
 #include <random>
 #include <gtest/gtest.h>
 
-using RandBLAS::trig::lmiget;
-using RandBLAS::trig::rmiget;
-using RandBLAS::generate_rademacher_vector_r123;
-using RandBLAS::apply_diagonal_rademacher;
-using RandBLAS::fht_dispatch;
-using Eigen::MatrixXd;
-
-
 class TestLMIGET : public::testing::Test
 {
     protected:
@@ -55,6 +46,23 @@ class TestLMIGET : public::testing::Test
 
     inline static std::vector<uint32_t> keys {42};
 
+    // Helper function for generating a row-permutation matrix
+    // in ColMajor format
+    //NOTE: Need to deallocate memory explicitly
+    template <typename T>
+    static T* generate_row_permutation_row_major(int size, const std::vector<int>& final_row_order) {
+        // Allocate memory for a size x size matrix and initialize with zeros
+        T* perm_matrix = new T[size * size]();
+
+        // Fill in the matrix with '1's at positions corresponding to the final row order
+        for (int i = 0; i < size; ++i) {
+            int final_row = final_row_order[i];
+            perm_matrix[i * size + final_row] = 1.0;  // Set the '1' in the permuted position (RowMajor)
+        }
+
+        return perm_matrix;
+    }
+
     // Helper function for explicitly generating a Hadamard matrix
     // (Note that ColMajor and RowMajor storage is identical for `H`)
     template <typename T>
@@ -118,26 +126,17 @@ class TestLMIGET : public::testing::Test
         int64_t n,
         bool left,
         blas::Layout layout,
-        T epsilon=RandBLAS::sqrt_epsilon<T>()
+        T tol=RandBLAS::sqrt_epsilon<T>()
     ) {
         // Grabbing a random matrix
         std::vector<T> A_vec = generate_random_vector(m * n, 0.0, 10.0);
-        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>(A_vec.data(), m, n);
-        Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> A_row(A_vec.data(), m, n);
-
-        // Deep copy
-        MatrixXd B;
-        if(layout == blas::Layout::RowMajor)
-            B = A_row;
-        else
-            B = A_col;
+        std::vector<T> B_vec(A_vec);
 
         switch (transform) {
         case transforms::permute: {
-            // Simply compares against Eigen::PermutationMatrix
-            Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic> perm(5);
+            int ld = (left) ? m : n;
 
-            std::vector<int> V = left ? std::vector<int>(m) : std::vector<int>(n);
+            std::vector<int> V = std::vector<int>(ld);
 
             int cnt = 0;
             // int cnt = 0;
@@ -151,44 +150,101 @@ class TestLMIGET : public::testing::Test
                 cnt++;
             }
 
-            Eigen::VectorXi indices = Eigen::Map<Eigen::VectorXi>(V.data(), V.size());
-
-            // Set the indices in the permutation matrix
-            perm.indices() = indices;
+            T* perm_matrix = generate_row_permutation_row_major<T>(ld, V);
+            T* true_perm_matrix = new T[m * n];
 
             sint_t* v = new sint_t;
             *v = V.size() - 1;
 
             if(left) {
-                if(layout == blas::Layout::RowMajor)
-                    RandBLAS::permute_rows_to_top(layout, m, n, v, 1, A_row.data());
-                else
-                    RandBLAS::permute_rows_to_top(layout, m, n, v, 1, A_col.data());
+                if(layout == blas::Layout::RowMajor) {
+                    blas::gemm(
+                        blas::Layout::RowMajor,
+                        blas::Op::Trans,
+                        blas::Op::NoTrans,
+                        m,
+                        n,
+                        m,
+                        1.0,
+                        perm_matrix,
+                        m,
+                        A_vec.data(),
+                        n,
+                        0.0,
+                        true_perm_matrix,
+                        n
+                    );
+                    RandBLAS::permute_rows_to_top(layout, m, n, v, 1, A_vec.data());
+                }
+                else {
+                    blas::gemm(
+                        blas::Layout::ColMajor,
+                        blas::Op::Trans,
+                        blas::Op::NoTrans,
+                        m,
+                        n,
+                        m,
+                        1.0,
+                        perm_matrix,
+                        m,
+                        A_vec.data(),
+                        m,
+                        0.0,
+                        true_perm_matrix,
+                        m
+                    );
+                    RandBLAS::permute_rows_to_top(layout, m, n, v, 1, A_vec.data());
+                }
             }
             else {
-                if(layout == blas::Layout::RowMajor)
-                    RandBLAS::permute_cols_to_left(layout, m, n, v, 1, A_row.data());
-                else
-                    RandBLAS::permute_cols_to_left(layout, m, n, v, 1, A_col.data());
+                if(layout == blas::Layout::RowMajor) {
+                    blas::gemm(
+                        blas::Layout::RowMajor,
+                        blas::Op::NoTrans,
+                        blas::Op::NoTrans,
+                        m,
+                        n,
+                        n,
+                        1.0,
+                        A_vec.data(),
+                        n,
+                        perm_matrix,
+                        n,
+                        0.0,
+                        true_perm_matrix,
+                        n
+                    );
+                    RandBLAS::permute_cols_to_left(layout, m, n, v, 1, A_vec.data());
+                }
+                else {
+                    blas::gemm(
+                        blas::Layout::ColMajor,
+                        blas::Op::NoTrans,
+                        blas::Op::NoTrans,
+                        m,
+                        n,
+                        n,
+                        1.0,
+                        A_vec.data(),
+                        m,
+                        perm_matrix,
+                        n,
+                        0.0,
+                        true_perm_matrix,
+                        m
+                    );
+                    RandBLAS::permute_cols_to_left(layout, m, n, v, 1, A_vec.data());
+                }
             }
 
-            // Or just do A.isApprox(B)
             T norm_permute = 0.0;
-            if(left) {
-                if(layout == blas::Layout::RowMajor)
-                    norm_permute = (A_row - perm * B).norm();
-                else
-                    norm_permute = (A_col - perm * B).norm();
-            }
-            else {
-            if(layout == blas::Layout::RowMajor)
-                norm_permute = (A_row - B * perm).norm();
-            else
-                norm_permute = (A_col - B * perm).norm();
-            }
+            blas::axpy(m * n, -1.0, A_vec.data(), 1, true_perm_matrix, 1);
+            norm_permute = blas::nrm2(m * n, true_perm_matrix, 1);
+
+            randblas_require(norm_permute < tol);
 
-            // Or do A.isApprox(H * B)
-            randblas_require(norm_permute < epsilon);
+            delete [] perm_matrix;
+            delete [] true_perm_matrix;
 
             break;
         }
@@ -196,31 +252,104 @@ class TestLMIGET : public::testing::Test
             // Here, simply check against explicit application of the Hadamard matrix
 
             int ld = (left) ? m : n;
-            if(layout == blas::Layout::ColMajor)
-                RandBLAS::fht_dispatch(left, layout, m, n, std::log2(ld), A_col.data());
-            else
-                RandBLAS::fht_dispatch(left, layout, m, n, std::log2(ld), A_row.data());
+            RandBLAS::fht_dispatch(left, layout, m, n, std::log2(ld), A_vec.data());
 
             std::vector<T> H_vec = generate_hadamard<double>(std::log2(ld));
             //TODO: Should have a check here to enforce that `m` and `n` are powers of 2 (since
             // my `generate_hadamard` function does not take care to pad an input matrix)
-            MatrixXd H = Eigen::Map<MatrixXd>(H_vec.data(), int(std::pow(2, std::log2(ld))), int(std::pow(2, std::log2(ld))));
+
+            T* true_H_mult = new T[m * n];
 
             T norm_hadamard = 0.0;
+
             if(left) {
-            if(layout == blas::Layout::RowMajor)
-                norm_hadamard = (A_row - H * B).norm();
-            else
-                norm_hadamard = (A_col - H * B).norm();
+                if(layout == blas::Layout::RowMajor) {
+                    blas::gemm(
+                        blas::Layout::RowMajor,
+                        blas::Op::NoTrans,
+                        blas::Op::NoTrans,
+                        m,
+                        n,
+                        m,
+                        1.0,
+                        H_vec.data(),
+                        m,
+                        B_vec.data(),
+                        n,
+                        0.0,
+                        true_H_mult,
+                        n
+                    );
+                    blas::axpy(m * n, -1.0, A_vec.data(), 1, true_H_mult, 1);
+                    norm_hadamard = blas::nrm2(m * n, true_H_mult, 1);
+                }
+                else {
+                    blas::gemm(
+                        blas::Layout::ColMajor,
+                        blas::Op::NoTrans,
+                        blas::Op::NoTrans,
+                        m,
+                        n,
+                        m,
+                        1.0,
+                        H_vec.data(),
+                        m,
+                        B_vec.data(),
+                        m,
+                        0.0,
+                        true_H_mult,
+                        m
+                    );
+                    blas::axpy(m * n, -1.0, A_vec.data(), 1, true_H_mult, 1);
+                    norm_hadamard = blas::nrm2(m * n, true_H_mult, 1);
+                }
             }
             else {
-            if(layout == blas::Layout::RowMajor)
-                norm_hadamard = (A_row - B * H).norm();
-            else
-                norm_hadamard = (A_col - B * H).norm();
+                if(layout == blas::Layout::RowMajor) {
+                    blas::gemm(
+                        blas::Layout::RowMajor,
+                        blas::Op::NoTrans,
+                        blas::Op::NoTrans,
+                        m,
+                        n,
+                        n,
+                        1.0,
+                        B_vec.data(),
+                        n,
+                        H_vec.data(),
+                        n,
+                        0.0,
+                        true_H_mult,
+                        n
+                    );
+                    blas::axpy(m * n, -1.0, A_vec.data(), 1, true_H_mult, 1);
+                    norm_hadamard = blas::nrm2(m * n, true_H_mult, 1);
+                }
+                else {
+                    blas::gemm(
+                        blas::Layout::ColMajor,
+                        blas::Op::NoTrans,
+                        blas::Op::NoTrans,
+                        m,
+                        n,
+                        n,
+                        1.0,
+                        B_vec.data(),
+                        m,
+                        H_vec.data(),
+                        n,
+                        0.0,
+                        true_H_mult,
+                        m
+                    );
+                    blas::axpy(m * n, -1.0, A_vec.data(), 1, true_H_mult, 1);
+                    norm_hadamard = blas::nrm2(m * n, true_H_mult, 1);
+                }
             }
 
-            randblas_require(norm_hadamard < epsilon);
+            randblas_require(norm_hadamard < tol);
+
+            delete [] true_H_mult;
 
             break;
         }
@@ -228,134 +357,25 @@ class TestLMIGET : public::testing::Test
             // Scales all rows/cols by -1 and checks if A == -A
             std::vector<sint_t> buff = left ? std::vector<sint_t>(n, -1) : std::vector<sint_t>(m, -1);
 
+            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_vec.data(), buff.data());
+
             T norm_diag = 0.0;
             if(layout == blas::Layout::RowMajor) {
-            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
-            norm_diag = (A_row + B).norm();
+                blas::axpy(m * n, 1.0, A_vec.data(), 1, B_vec.data(), 1);
+                norm_diag = blas::nrm2(m * n, B_vec.data(), 1);
             }
             else {
-            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
-            norm_diag = (A_col + B).norm();
+                blas::axpy(m * n, 1.0, A_vec.data(), 1, B_vec.data(), 1);
+                norm_diag = blas::nrm2(m * n, B_vec.data(), 1);
             }
 
-            randblas_require(norm_diag < epsilon);
+            randblas_require(norm_diag < tol);
 
             break;
         }
         }
     }
 
-    template <typename T, RandBLAS::SignedInteger sint_t = int64_t>
-    static void inverse_transform(
-        uint32_t seed,
-        int64_t m, // Generated data matrix, `A` is of size `(m x n)`
-        int64_t n,
-        int64_t d,
-        bool left,
-        blas::Layout layout,
-        T epsilon=RandBLAS::sqrt_epsilon<T>()
-    ) {
-        // Grabbing a random matrix
-        std::vector<T> A_vec = generate_random_vector(m * n, 0.0, 10.0);
-        // Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col(m, n);
-        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>(A_vec.data(), m, n);
-        // A_col.setRandom();
-        Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> A_row(A_vec.data(), m, n);
-
-        // Deep copy
-        MatrixXd B;
-        if(layout == blas::Layout::RowMajor)
-        B = A_row;
-        else
-        B = A_col;
-
-        //// Performing \Pi H D
-        // Step 1: setup the diagonal scaling
-        std::vector<sint_t> buff = left ? std::vector<sint_t>(m, -1) : std::vector<sint_t>(n, -1);
-
-        if(layout == blas::Layout::RowMajor) {
-        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
-        }
-        else {
-        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
-        }
-
-        // Step 2: apply the hadamard transform
-        int ld = (left) ? m : n;
-        if(layout == blas::Layout::ColMajor){
-            RandBLAS::fht_dispatch(left, layout, m, n, int(std::log2(ld)), A_col.data());
-        }
-        else {
-            RandBLAS::fht_dispatch(left, layout, m, n, int(std::log2(ld)), A_row.data());
-        }
-
-        // Step 3: Permuting
-        std::vector<int64_t> indices(d);
-
-        std::iota(indices.begin(), indices.end(), 1);
-        if(left) {
-            if(layout == blas::Layout::RowMajor)
-            RandBLAS::permute_rows_to_top(layout, m, n, indices.data(), d, A_row.data());
-            else
-            RandBLAS::permute_rows_to_top(layout, m, n, indices.data(), d, A_col.data());
-        }
-        else {
-            if(layout == blas::Layout::RowMajor)
-            RandBLAS::permute_cols_to_left(layout, m, n, indices.data(), d, A_row.data());
-            else
-            RandBLAS::permute_cols_to_left(layout, m, n, indices.data(), d, A_col.data());
-        }
-
-        //// Performing D H \Pi
-
-        //Step 1: Un-permute
-        std::reverse(indices.begin(), indices.end());
-
-        if(left) {
-            if(layout == blas::Layout::RowMajor)
-            RandBLAS::permute_rows_to_top(layout, m, n, indices.data(), d, A_row.data());
-            else
-            RandBLAS::permute_rows_to_top(layout, m, n, indices.data(), d, A_col.data());
-        }
-        else {
-            if(layout == blas::Layout::RowMajor)
-            RandBLAS::permute_cols_to_left(layout, m, n, indices.data(), d, A_row.data());
-            else
-            RandBLAS::permute_cols_to_left(layout, m, n, indices.data(), d, A_col.data());
-        }
-
-        // Step-2: Apply H^{-1}
-        if(layout == blas::Layout::ColMajor) {
-            RandBLAS::fht_dispatch(left, layout, m, n, int(std::log2(ld)), A_col.data());
-            A_col = A_col * 1/std::pow(2, int(std::log2(ld)));
-        }
-        else {
-            RandBLAS::fht_dispatch(left, layout, m, n, int(std::log2(ld)), A_row.data());
-            A_row = A_row * 1/std::pow(2, int(std::log2(ld)));
-        }
-
-        //Step-3: Inverting `D`
-        if(layout == blas::Layout::RowMajor) {
-        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_row.data(), buff.data());
-        }
-        else {
-        RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_col.data(), buff.data());
-        }
-
-        T norm_inverse = 0.0;
-
-        if(layout == blas::Layout::RowMajor) {
-        norm_inverse = (A_row - B).norm();
-        }
-        else {
-        norm_inverse = (A_col - B).norm();
-        }
-
-        randblas_require(norm_inverse < epsilon);
-
-
-    }
-
     template <typename T, typename RNG = RandBLAS::DefaultRNG, RandBLAS::SignedInteger sint_t = int64_t>
     static void drivers_inverse(
         uint32_t seed,
@@ -364,20 +384,13 @@ class TestLMIGET : public::testing::Test
         int64_t d,
         int64_t left,
         blas::Layout layout,
-        T epsilon=RandBLAS::sqrt_epsilon<T>()
+        T tol=RandBLAS::sqrt_epsilon<T>()
     ) {
         // Grabbing a random matrix
         std::vector<T> A_vec = generate_random_vector(m * n, 0.0, 10.0);
-        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>(A_vec.data(), m, n);
-        Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> A_row(A_vec.data(), m, n);
-        RandBLAS::RNGState<RNG> seed_state(seed);
+        std::vector<T> B_vec(A_vec);
 
-        // Deep copy
-        MatrixXd B;
-        if(layout == blas::Layout::RowMajor)
-            B = A_row;
-        else
-            B = A_col;
+        RandBLAS::RNGState<RNG> seed_state(seed);
 
         // Aggregating information about the matrix
         RandBLAS::trig::HadamardMixingOp<> hmo(
@@ -388,26 +401,17 @@ class TestLMIGET : public::testing::Test
             d
         );
 
-        // Sketching the matrix
-        if(layout == blas::Layout::RowMajor) {
-            RandBLAS::trig::miget(hmo, seed_state, A_row.data());
-            RandBLAS::trig::invert(hmo, A_row.data());
-        }
-        else {
-            RandBLAS::trig::miget(hmo, seed_state, A_col.data());
-            RandBLAS::trig::invert(hmo, A_col.data());
-        }
+        // Performing the transform and...
+        RandBLAS::trig::miget(hmo, seed_state, A_vec.data());
+        RandBLAS::trig::invert_hadamard(hmo, A_vec.data());
+        // ... inverting
 
         T norm_inverse = 0.0;
 
-        if(layout == blas::Layout::RowMajor) {
-            norm_inverse = (A_row - B).norm();
-        }
-        else {
-            norm_inverse = (A_col - B).norm();
-        }
+        blas::axpy(m * n, -1.0, A_vec.data(), 1, B_vec.data(), 1);
+        norm_inverse = blas::nrm2(m * n, B_vec.data(), 1);
 
-        randblas_require(norm_inverse < epsilon);
+        randblas_require(norm_inverse < tol);
     }
 };
 
@@ -563,62 +567,6 @@ TEST_F(TestLMIGET, test_hadamard_right_rowmajor) {
         );
 }
 
-////////////////////////////////////////////////////////////////////////
-//
-//
-//      Verifying invertibility of the transform
-//
-//
-////////////////////////////////////////////////////////////////////////
-
-TEST_F(TestLMIGET, test_inverse_left_colmajor) {
-    for(uint32_t seed: keys)
-        inverse_transform<double>(
-            seed,
-            128,
-            100,
-            25,
-            true,
-            blas::Layout::ColMajor
-        );
-}
-
-TEST_F(TestLMIGET, test_inverse_right_colmajor) {
-    for(uint32_t seed: keys)
-        inverse_transform<double>(
-            seed,
-            100,
-            128,
-            25,
-            false,
-            blas::Layout::ColMajor
-        );
-}
-
-TEST_F(TestLMIGET, test_inverse_left_rowmajor) {
-    for(uint32_t seed: keys)
-        inverse_transform<double>(
-            seed,
-            128,
-            100,
-            25,
-            true,
-            blas::Layout::RowMajor
-        );
-}
-
-TEST_F(TestLMIGET, test_inverse_right_rowmajor) {
-    for(uint32_t seed: keys)
-        inverse_transform<double>(
-            seed,
-            100,
-            128,
-            25,
-            false,
-            blas::Layout::RowMajor
-        );
-}
-
 ////////////////////////////////////////////////////////////////////////
 //
 //

From 09ea44d3bd8196c1fc71509b309917b6acf0094a Mon Sep 17 00:00:00 2001
From: Aryaman Jeendgar <jeendgararyaman@gmail.com>
Date: Mon, 4 Nov 2024 18:10:26 -0800
Subject: [PATCH 26/26] Simplifying correctness test for `diag`

---
 test/test_matmul_cores/test_trig.cc | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/test/test_matmul_cores/test_trig.cc b/test/test_matmul_cores/test_trig.cc
index 8cf04c3..e00b141 100644
--- a/test/test_matmul_cores/test_trig.cc
+++ b/test/test_matmul_cores/test_trig.cc
@@ -360,14 +360,8 @@ class TestLMIGET : public::testing::Test
             RandBLAS::apply_diagonal_rademacher(left, layout, m, n, A_vec.data(), buff.data());
 
             T norm_diag = 0.0;
-            if(layout == blas::Layout::RowMajor) {
-                blas::axpy(m * n, 1.0, A_vec.data(), 1, B_vec.data(), 1);
-                norm_diag = blas::nrm2(m * n, B_vec.data(), 1);
-            }
-            else {
-                blas::axpy(m * n, 1.0, A_vec.data(), 1, B_vec.data(), 1);
-                norm_diag = blas::nrm2(m * n, B_vec.data(), 1);
-            }
+            blas::axpy(m * n, 1.0, A_vec.data(), 1, B_vec.data(), 1);
+            norm_diag = blas::nrm2(m * n, B_vec.data(), 1);
 
             randblas_require(norm_diag < tol);