Skip to content

Commit

Permalink
add Kolmogorov-Smirnov tests for uniform distribution over [-1, 1]. F…
Browse files Browse the repository at this point in the history
…ix small but substantial bug in sample_size_rep. Fix bounds error in KS test for sampling indices.
  • Loading branch information
rileyjmurray committed Aug 3, 2024
1 parent 69cdf6f commit 8d992a4
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 33 deletions.
2 changes: 1 addition & 1 deletion RandBLAS/dense_skops.hh
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,7 @@ RNGState<RNG> fill_dense(
T *buff,
const RNGState<RNG> &seed
) {
return fill_dense(dist_to_layout(D), D, D.n_rows, D.n_cols, 0, 0, buff, seed);
return fill_dense(dist_to_layout(D), D, D.n_rows, D.n_cols, 0, 0, buff, seed);
}

// =============================================================================
Expand Down
19 changes: 18 additions & 1 deletion test/test_basic_rng/rng_common.hh
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ inline int sample_size_rep(int n) {
randblas_require(SMALLEST_SAMPLE <= n && n <= LARGEST_SAMPLE);
int num_sample_sizes = (int) sample_sizes.size();
for (int i = 0; i < num_sample_sizes; ++i) {
if (sample_sizes[i] <= n)
if (sample_sizes[i] >= n)
return i;
}
// This code shouldn't be reachable!
Expand Down Expand Up @@ -223,4 +223,21 @@ inline double hypergeometric_variance(int64_t N, int64_t K, int64_t D) {
return dD * t1 * t2 * t3;
}

// MARK: continuous distributions

template <typename T>
inline T standard_normal_cdf(T x) {
double dx = (double) x;
return (T) std::erfc(-dx / std::sqrt(2)) / 2;
}

template <typename T>
inline T uniform_neg11_cdf(T x) {
if (x <= -1)
return 0;
if (x >= 1)
return 1;
return (x + 1) / 2;
}

} // end namespace RandBLAS_StatTests
115 changes: 88 additions & 27 deletions test/test_basic_rng/test_continuous.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@
#include "RandBLAS/config.h"
#include "RandBLAS/base.hh"
#include "RandBLAS/util.hh"
#include "RandBLAS/dense_skops.hh"
using RandBLAS::RNGState;
using RandBLAS::DenseDistName;
#include "rng_common.hh"

#include <algorithm>
Expand All @@ -40,43 +42,102 @@ using RandBLAS::RNGState;
#include <set>
#include <vector>
#include <gtest/gtest.h>
#include <stdexcept>


class TestContinuous : public ::testing::Test
{
protected:

virtual void SetUp(){};

virtual void TearDown(){};
class TestContinuous : public ::testing::Test {
protected:

// This is really for distributions whose CDFs "F" are continuous and strictly increasing
// on the interval [a, b] where F(a) = 0 and F(b) = 1.

template <typename T>
static void kolmogorov_smirnov_tester(
// std::vector<int64_t> &samples, std::vector<float> &true_cdf, double critical_value
) {
// TODO: WRITE ME

// auto num_samples = (int) samples.size();
// auto N = (int64_t) true_cdf.size();
// std::vector<float> sample_cdf(N, 0.0);
// for (int64_t s : samples)
// sample_cdf[s] += 1;
// RandBLAS::util::weights_to_cdf(N, sample_cdf.data());
std::vector<T> &samples, double critical_value, DenseDistName dn
) {
auto F_true = [dn](T x) {
if (dn == DenseDistName::Gaussian) {
return RandBLAS_StatTests::standard_normal_cdf(x);
} else if (dn == DenseDistName::Uniform) {
return RandBLAS_StatTests::uniform_neg11_cdf(x);
} else {
std::string msg = "Unrecognized distributions name";
throw std::runtime_error(msg);
}
};
auto N = (int) samples.size();
/**
* Let L(x) = |F_empirical(x) - F_true(x)|. The KS test testatistic is
*
* ts = sup_{all x} L(x).
*
* Now set s = sorted(samples), and partition the real line into
*
* I_0 = (-infty, s[0 ]), ...
* I_1 = [s[0 ], s[1 ]), ...
* I_2 = [s[1 ], s[2 ]), ...
* I_{N-1} = [s[N-2], s[N-1]), ...
* I_N = [s[N-1], +infty).
*
* Then, provided F_true is continuous, we have
*
* sup{ L(x) : x in I_j } = max{
* |F_true(inf(I_j)) - j/N|, |F_true(sup(I_j)) - j/N|
* }
*
* for j = 0, ..., N.
*/
samples.push_back( std::numeric_limits<T>::infinity());
samples.push_back(-std::numeric_limits<T>::infinity());
std::sort(samples.begin(), samples.end(), [](T a, T b) {return (a < b);});
for (int64_t j = 0; j <= N; ++j) {
T temp1 = F_true(samples[j ]);
T temp2 = F_true(samples[j + 1]);
T empirical = ((T)j)/((T)N);
T val1 = std::abs(temp1 - empirical);
T val2 = std::abs(temp2 - empirical);
T supLx_on_Ij = std::max(val1, val2);
ASSERT_LE(supLx_on_Ij, critical_value)
<< "\nj = " << j << " of N = " << N
<< "\nF_true(inf(I_j)) = " << temp1
<< "\nF_true(sup(I_j)) = " << temp2
<< "\nI_j = [" << samples[j] << ", " << samples[j+1] << ")";
}
return;
}

// for (int i = 0; i < num_samples; ++i) {
// auto diff = (double) std::abs(sample_cdf[i] - true_cdf[i]);
// ASSERT_LT(diff, critical_value);
// }
template <typename T>
static void run(double significance, int64_t num_samples, DenseDistName dn, uint32_t seed) {
using RandBLAS_StatTests::KolmogorovSmirnovConstants::critical_value_rep_mutator;
auto critical_value = critical_value_rep_mutator(num_samples, significance);
RNGState state(seed);
std::vector<T> samples(num_samples, -1);
RandBLAS::fill_dense({num_samples, 1, dn, RandBLAS::MajorAxis::Long}, samples.data(), state);
kolmogorov_smirnov_tester(samples, critical_value, dn);
return;
}
};

template <typename t>
static void run_uneg11() {
TEST_F(TestContinuous, uneg11_ks_generous) {
double s = 1e-6;
run<double>(s, 100000, DenseDistName::Uniform, 0);
run<double>(s, 10000, DenseDistName::Uniform, 0);
run<double>(s, 1000, DenseDistName::Uniform, 0);
}

}

template <typename T>
static void run_gaussian() {
TEST_F(TestContinuous, uneg11_ks_moderate) {
double s = 1e-4;
run<float>(s, 100000, DenseDistName::Uniform, 0);
run<float>(s, 10000, DenseDistName::Uniform, 0);
run<float>(s, 1000, DenseDistName::Uniform, 0); // might fail?
}

}
};

TEST_F(TestContinuous, uneg11_ks_skeptical) {
double s = 1e-2;
run<float>(s, 100000, DenseDistName::Uniform, 0);
run<float>(s, 10000, DenseDistName::Uniform, 0);
run<float>(s, 1000, DenseDistName::Uniform, 0);
}
9 changes: 5 additions & 4 deletions test/test_basic_rng/test_discrete.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,16 @@ class TestSampleIndices : public ::testing::Test
static void index_set_kolmogorov_smirnov_tester(
std::vector<int64_t> &samples, std::vector<float> &true_cdf, double critical_value
) {
auto num_samples = (int) samples.size();
auto N = (int64_t) true_cdf.size();
std::vector<float> sample_cdf(N, 0.0);
for (int64_t s : samples)
sample_cdf[s] += 1;
RandBLAS::util::weights_to_cdf(N, sample_cdf.data());

for (int i = 0; i < num_samples; ++i) {
auto diff = (double) std::abs(sample_cdf[i] - true_cdf[i]);
for (int i = 0; i < N; ++i) {
float F_empirical = sample_cdf[i];
float F_true = true_cdf[i];
auto diff = (double) std::abs(F_empirical - F_true);
ASSERT_LT(diff, critical_value);
}
return;
Expand All @@ -94,7 +95,7 @@ class TestSampleIndices : public ::testing::Test
return;
}

static void test_iid_kolmogorov_smirnov(int64_t N, int exponent, double significance, int64_t num_samples, uint32_t seed) {
static void test_iid_kolmogorov_smirnov(int64_t N, float exponent, double significance, int64_t num_samples, uint32_t seed) {
using RandBLAS_StatTests::KolmogorovSmirnovConstants::critical_value_rep_mutator;
auto critical_value = critical_value_rep_mutator(num_samples, significance);

Expand Down

0 comments on commit 8d992a4

Please sign in to comment.