From 8d992a498e22f2e6a17f8382f146377ca5cc6517 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Sat, 3 Aug 2024 17:05:17 -0400 Subject: [PATCH] add Kolmogorov-Smirnov tests for uniform distribution over [-1, 1]. Fix small but substantial bug in sample_size_rep. Fix bounds error in KS test for sampling indices. --- RandBLAS/dense_skops.hh | 2 +- test/test_basic_rng/rng_common.hh | 19 +++- test/test_basic_rng/test_continuous.cc | 115 +++++++++++++++++++------ test/test_basic_rng/test_discrete.cc | 9 +- 4 files changed, 112 insertions(+), 33 deletions(-) diff --git a/RandBLAS/dense_skops.hh b/RandBLAS/dense_skops.hh index 26028570..4bec8153 100644 --- a/RandBLAS/dense_skops.hh +++ b/RandBLAS/dense_skops.hh @@ -579,7 +579,7 @@ RNGState fill_dense( T *buff, const RNGState &seed ) { - return fill_dense(dist_to_layout(D), D, D.n_rows, D.n_cols, 0, 0, buff, seed); + return fill_dense(dist_to_layout(D), D, D.n_rows, D.n_cols, 0, 0, buff, seed); } // ============================================================================= diff --git a/test/test_basic_rng/rng_common.hh b/test/test_basic_rng/rng_common.hh index e344d93c..ea1974f1 100644 --- a/test/test_basic_rng/rng_common.hh +++ b/test/test_basic_rng/rng_common.hh @@ -140,7 +140,7 @@ inline int sample_size_rep(int n) { randblas_require(SMALLEST_SAMPLE <= n && n <= LARGEST_SAMPLE); int num_sample_sizes = (int) sample_sizes.size(); for (int i = 0; i < num_sample_sizes; ++i) { - if (sample_sizes[i] <= n) + if (sample_sizes[i] >= n) return i; } // This code shouldn't be reachable! @@ -223,4 +223,21 @@ inline double hypergeometric_variance(int64_t N, int64_t K, int64_t D) { return dD * t1 * t2 * t3; } +// MARK: continuous distributions + +template +inline T standard_normal_cdf(T x) { + double dx = (double) x; + return (T) std::erfc(-dx / std::sqrt(2)) / 2; +} + +template +inline T uniform_neg11_cdf(T x) { + if (x <= -1) + return 0; + if (x >= 1) + return 1; + return (x + 1) / 2; +} + } // end namespace RandBLAS_StatTests diff --git a/test/test_basic_rng/test_continuous.cc b/test/test_basic_rng/test_continuous.cc index d1a32839..cf2ee41e 100644 --- a/test/test_basic_rng/test_continuous.cc +++ b/test/test_basic_rng/test_continuous.cc @@ -30,7 +30,9 @@ #include "RandBLAS/config.h" #include "RandBLAS/base.hh" #include "RandBLAS/util.hh" +#include "RandBLAS/dense_skops.hh" using RandBLAS::RNGState; +using RandBLAS::DenseDistName; #include "rng_common.hh" #include @@ -40,43 +42,102 @@ using RandBLAS::RNGState; #include #include #include +#include -class TestContinuous : public ::testing::Test -{ - protected: - - virtual void SetUp(){}; - virtual void TearDown(){}; +class TestContinuous : public ::testing::Test { + protected: + // This is really for distributions whose CDFs "F" are continuous and strictly increasing + // on the interval [a, b] where F(a) = 0 and F(b) = 1. + + template static void kolmogorov_smirnov_tester( - // std::vector &samples, std::vector &true_cdf, double critical_value - ) { - // TODO: WRITE ME - - // auto num_samples = (int) samples.size(); - // auto N = (int64_t) true_cdf.size(); - // std::vector sample_cdf(N, 0.0); - // for (int64_t s : samples) - // sample_cdf[s] += 1; - // RandBLAS::util::weights_to_cdf(N, sample_cdf.data()); + std::vector &samples, double critical_value, DenseDistName dn + ) { + auto F_true = [dn](T x) { + if (dn == DenseDistName::Gaussian) { + return RandBLAS_StatTests::standard_normal_cdf(x); + } else if (dn == DenseDistName::Uniform) { + return RandBLAS_StatTests::uniform_neg11_cdf(x); + } else { + std::string msg = "Unrecognized distributions name"; + throw std::runtime_error(msg); + } + }; + auto N = (int) samples.size(); + /** + * Let L(x) = |F_empirical(x) - F_true(x)|. The KS test testatistic is + * + * ts = sup_{all x} L(x). + * + * Now set s = sorted(samples), and partition the real line into + * + * I_0 = (-infty, s[0 ]), ... + * I_1 = [s[0 ], s[1 ]), ... + * I_2 = [s[1 ], s[2 ]), ... + * I_{N-1} = [s[N-2], s[N-1]), ... + * I_N = [s[N-1], +infty). + * + * Then, provided F_true is continuous, we have + * + * sup{ L(x) : x in I_j } = max{ + * |F_true(inf(I_j)) - j/N|, |F_true(sup(I_j)) - j/N| + * } + * + * for j = 0, ..., N. + */ + samples.push_back( std::numeric_limits::infinity()); + samples.push_back(-std::numeric_limits::infinity()); + std::sort(samples.begin(), samples.end(), [](T a, T b) {return (a < b);}); + for (int64_t j = 0; j <= N; ++j) { + T temp1 = F_true(samples[j ]); + T temp2 = F_true(samples[j + 1]); + T empirical = ((T)j)/((T)N); + T val1 = std::abs(temp1 - empirical); + T val2 = std::abs(temp2 - empirical); + T supLx_on_Ij = std::max(val1, val2); + ASSERT_LE(supLx_on_Ij, critical_value) + << "\nj = " << j << " of N = " << N + << "\nF_true(inf(I_j)) = " << temp1 + << "\nF_true(sup(I_j)) = " << temp2 + << "\nI_j = [" << samples[j] << ", " << samples[j+1] << ")"; + } + return; + } - // for (int i = 0; i < num_samples; ++i) { - // auto diff = (double) std::abs(sample_cdf[i] - true_cdf[i]); - // ASSERT_LT(diff, critical_value); - // } + template + static void run(double significance, int64_t num_samples, DenseDistName dn, uint32_t seed) { + using RandBLAS_StatTests::KolmogorovSmirnovConstants::critical_value_rep_mutator; + auto critical_value = critical_value_rep_mutator(num_samples, significance); + RNGState state(seed); + std::vector samples(num_samples, -1); + RandBLAS::fill_dense({num_samples, 1, dn, RandBLAS::MajorAxis::Long}, samples.data(), state); + kolmogorov_smirnov_tester(samples, critical_value, dn); return; } +}; - template - static void run_uneg11() { +TEST_F(TestContinuous, uneg11_ks_generous) { + double s = 1e-6; + run(s, 100000, DenseDistName::Uniform, 0); + run(s, 10000, DenseDistName::Uniform, 0); + run(s, 1000, DenseDistName::Uniform, 0); +} - } - template - static void run_gaussian() { +TEST_F(TestContinuous, uneg11_ks_moderate) { + double s = 1e-4; + run(s, 100000, DenseDistName::Uniform, 0); + run(s, 10000, DenseDistName::Uniform, 0); + run(s, 1000, DenseDistName::Uniform, 0); // might fail? +} - } -}; +TEST_F(TestContinuous, uneg11_ks_skeptical) { + double s = 1e-2; + run(s, 100000, DenseDistName::Uniform, 0); + run(s, 10000, DenseDistName::Uniform, 0); + run(s, 1000, DenseDistName::Uniform, 0); +} diff --git a/test/test_basic_rng/test_discrete.cc b/test/test_basic_rng/test_discrete.cc index 199ef25e..b9d7ae70 100644 --- a/test/test_basic_rng/test_discrete.cc +++ b/test/test_basic_rng/test_discrete.cc @@ -65,15 +65,16 @@ class TestSampleIndices : public ::testing::Test static void index_set_kolmogorov_smirnov_tester( std::vector &samples, std::vector &true_cdf, double critical_value ) { - auto num_samples = (int) samples.size(); auto N = (int64_t) true_cdf.size(); std::vector sample_cdf(N, 0.0); for (int64_t s : samples) sample_cdf[s] += 1; RandBLAS::util::weights_to_cdf(N, sample_cdf.data()); - for (int i = 0; i < num_samples; ++i) { - auto diff = (double) std::abs(sample_cdf[i] - true_cdf[i]); + for (int i = 0; i < N; ++i) { + float F_empirical = sample_cdf[i]; + float F_true = true_cdf[i]; + auto diff = (double) std::abs(F_empirical - F_true); ASSERT_LT(diff, critical_value); } return; @@ -94,7 +95,7 @@ class TestSampleIndices : public ::testing::Test return; } - static void test_iid_kolmogorov_smirnov(int64_t N, int exponent, double significance, int64_t num_samples, uint32_t seed) { + static void test_iid_kolmogorov_smirnov(int64_t N, float exponent, double significance, int64_t num_samples, uint32_t seed) { using RandBLAS_StatTests::KolmogorovSmirnovConstants::critical_value_rep_mutator; auto critical_value = critical_value_rep_mutator(num_samples, significance);