From 9195930c74c61138427cd48a0c4f9a1f8c01b3ba Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Thu, 3 Jun 2021 17:16:34 +0200
Subject: [PATCH 01/19] adding a unit test for metrics

---
 tests/metrics_test.py | 189 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 189 insertions(+)
 create mode 100644 tests/metrics_test.py

diff --git a/tests/metrics_test.py b/tests/metrics_test.py
new file mode 100644
index 000000000..4d7b5ab4f
--- /dev/null
+++ b/tests/metrics_test.py
@@ -0,0 +1,189 @@
+# This file is part of sbi, a toolkit for simulation-based inference. sbi is licensed
+# under the Affero General Public License v3, see <https://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+import numpy as np
+
+import torch
+from torch.distributions import MultivariateNormal as tmvn
+from sbi.utils.metrics import c2st
+
+from sklearn.model_selection import KFold, cross_val_score
+# from sklearn.neural_network import MLPClassifier
+from sklearn.ensemble import RandomForestClassifier
+from torch import Tensor
+
+
+def rv_c2st(
+    X: Tensor,
+    Y: Tensor,
+    seed: int = 1,
+    n_folds: int = 5,
+    scoring: str = "accuracy",
+    z_score: bool = True,
+    noise_scale: Optional[float] = None,
+        verbosity=0
+) -> Tensor:
+    """Return accuracy of classifier trained to distinguish samples from two distributions.
+
+    Trains classifiers with N-fold cross-validation [1]. Scikit learn MLPClassifier are
+    used, with 2 hidden layers of 10x dim each, where dim is the dimensionality of the
+    samples X and Y.
+    Args:
+        X: Samples from one distribution.
+        Y: Samples from another distribution.
+        seed: Seed for sklearn
+        n_folds: Number of folds
+        z_score: Z-scoring using X
+        noise_scale: If passed, will add Gaussian noise with std noise_scale to samples of X and of Y
+
+    References:
+        [1]: https://scikit-learn.org/stable/modules/cross_validation.html
+    """
+    if z_score:
+        X_mean = torch.mean(X, axis=0)
+        X_std = torch.std(X, axis=0)
+        X = (X - X_mean) / X_std
+        Y = (Y - X_mean) / X_std
+
+    if noise_scale is not None:
+        X += noise_scale * torch.randn(X.shape)
+        Y += noise_scale * torch.randn(Y.shape)
+
+    X = X.cpu().numpy()
+    Y = Y.cpu().numpy()
+
+    ndim = X.shape[1]
+
+    clf = RandomForestClassifier(
+        # n_estimators=100,
+     #   max_depth=None,
+        random_state=seed,
+    )
+
+    # clf = MLPClassifier(
+    #     activation="relu",
+    #     hidden_layer_sizes=(10 * ndim, 10 * ndim),
+    #     max_iter=1000,
+    #     solver="adam",
+    #     random_state=seed,
+    # )
+
+    #prepare data
+    data = np.concatenate((X, Y))
+    #labels
+    target = np.concatenate(
+        (
+            np.zeros((X.shape[0],)),
+            np.ones((Y.shape[0],)),
+        )
+    )
+
+    shuffle = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
+    scores = cross_val_score(clf, data, target, cv=shuffle, scoring=scoring,
+                             verbose=verbosity)
+
+    scores = np.asarray(np.mean(scores)).astype(np.float32)
+    return torch.from_numpy(np.atleast_1d(scores))
+
+
+def test_same_distributions_rv():
+
+    ndim = 5
+    nsamples = 4048
+
+    xnormal = tmvn(loc=torch.zeros(5),covariance_matrix=torch.eye(5))
+
+    X = xnormal.sample((nsamples,))
+    Y = xnormal.sample((nsamples,))
+
+    obs_c2st = rv_c2st(X, Y, verbosity=2)
+
+    assert obs_c2st != None
+    print(obs_c2st)
+
+
+def test_diff_distributions_rv():
+
+    ndim = 5
+    nsamples = 4048
+
+    xnormal = tmvn(loc=torch.zeros(5),covariance_matrix=torch.eye(5))
+    ynormal = tmvn(loc=20.*torch.ones(5),covariance_matrix=torch.eye(5))
+
+    X = xnormal.sample((nsamples,))
+    Y = ynormal.sample((nsamples,))
+
+    obs_c2st = rv_c2st(X, Y)
+
+    assert obs_c2st != None
+    print(obs_c2st)
+
+def test_distributions_overlap_by_two_sigma_rv():
+
+    ndim = 5
+    nsamples = 4048
+
+    xnormal = tmvn(loc=torch.zeros(5),covariance_matrix=torch.eye(5))
+    ynormal = tmvn(loc=1.*torch.ones(5),covariance_matrix=torch.eye(5))
+
+    X = xnormal.sample((nsamples,))
+    Y = ynormal.sample((nsamples,))
+
+    obs_c2st = rv_c2st(X, Y)
+
+    assert obs_c2st != None
+    print(obs_c2st)
+    assert .8 < obs_c2st[0] #distributions do not overlap, classifiers label with high accuracy
+
+def test_same_distributions_default():
+
+    ndim = 5
+    nsamples = 4048
+
+    xnormal = tmvn(loc=torch.zeros(5),covariance_matrix=torch.eye(5))
+
+    X = xnormal.sample((nsamples,))
+    Y = xnormal.sample((nsamples,))
+
+    obs_c2st = c2st(X, Y)
+
+    assert obs_c2st != None
+    assert .49 < obs_c2st[0] < .51 #only by chance we differentiate the 2 samples
+
+
+def test_diff_distributions_default():
+
+    ndim = 5
+    nsamples = 4048
+
+    xnormal = tmvn(loc=torch.zeros(5),covariance_matrix=torch.eye(5))
+    ynormal = tmvn(loc=20.*torch.ones(5),covariance_matrix=torch.eye(5))
+
+    X = xnormal.sample((nsamples,))
+    Y = ynormal.sample((nsamples,))
+
+    obs_c2st = c2st(X, Y)
+
+    assert obs_c2st != None
+    print(obs_c2st)
+    assert .98 < obs_c2st[0] #distributions do not overlap, classifiers label with high accuracy
+
+
+def test_distributions_overlap_by_two_sigma_default():
+
+    ndim = 5
+    nsamples = 4048
+
+    xnormal = tmvn(loc=torch.zeros(5),covariance_matrix=torch.eye(5))
+    ynormal = tmvn(loc=1.*torch.ones(5),covariance_matrix=torch.eye(5))
+
+    X = xnormal.sample((nsamples,))
+    Y = ynormal.sample((nsamples,))
+
+    obs_c2st = c2st(X, Y)
+
+    assert obs_c2st != None
+    print(obs_c2st)
+    assert .8 < obs_c2st[0] #distributions do not overlap, classifiers label with high accuracy

From a3c0a7d23cc16631bd749154b773dd7f0e458652 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Thu, 3 Jun 2021 17:17:37 +0200
Subject: [PATCH 02/19] more explicit docstring

---
 sbi/utils/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sbi/utils/metrics.py b/sbi/utils/metrics.py
index 82115b8a6..eb1a501fd 100644
--- a/sbi/utils/metrics.py
+++ b/sbi/utils/metrics.py
@@ -30,7 +30,7 @@ def c2st(
         seed: Seed for sklearn
         n_folds: Number of folds
         z_score: Z-scoring using X
-        noise_scale: If passed, will add Gaussian noise with std noise_scale to samples
+        noise_scale: If passed, will add Gaussian noise with std noise_scale to samples of X and of Y
 
     References:
         [1]: https://scikit-learn.org/stable/modules/cross_validation.html

From a0e3435b6ba1eb72bf460dbf46a05a751aba2c99 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Thu, 3 Jun 2021 17:27:47 +0200
Subject: [PATCH 03/19] document clf_kwargs that can be provided by user

---
 tests/metrics_test.py | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/tests/metrics_test.py b/tests/metrics_test.py
index 4d7b5ab4f..a9d614a42 100644
--- a/tests/metrics_test.py
+++ b/tests/metrics_test.py
@@ -15,7 +15,7 @@
 from torch import Tensor
 
 
-def rv_c2st(
+def alt_c2st(
     X: Tensor,
     Y: Tensor,
     seed: int = 1,
@@ -23,9 +23,13 @@ def rv_c2st(
     scoring: str = "accuracy",
     z_score: bool = True,
     noise_scale: Optional[float] = None,
-        verbosity=0
+        verbosity: int = 0,
+    **clf_kwargs
+
 ) -> Tensor:
-    """Return accuracy of classifier trained to distinguish samples from two distributions.
+    """
+    [Alternative implementation of c2st]
+    Return accuracy of classifier trained to distinguish samples from two distributions.
 
     Trains classifiers with N-fold cross-validation [1]. Scikit learn MLPClassifier are
     used, with 2 hidden layers of 10x dim each, where dim is the dimensionality of the
@@ -37,6 +41,9 @@ def rv_c2st(
         n_folds: Number of folds
         z_score: Z-scoring using X
         noise_scale: If passed, will add Gaussian noise with std noise_scale to samples of X and of Y
+        verbosity: control the verbosity of sklearn.model_selection.cross_val_score
+        clf_kwargs: key-word arguments to sklearn.ensemble.RandomForestClassifier
+
 
     References:
         [1]: https://scikit-learn.org/stable/modules/cross_validation.html
@@ -57,18 +64,10 @@ def rv_c2st(
     ndim = X.shape[1]
 
     clf = RandomForestClassifier(
-        # n_estimators=100,
-     #   max_depth=None,
         random_state=seed,
+        *clf_kwargs
     )
 
-    # clf = MLPClassifier(
-    #     activation="relu",
-    #     hidden_layer_sizes=(10 * ndim, 10 * ndim),
-    #     max_iter=1000,
-    #     solver="adam",
-    #     random_state=seed,
-    # )
 
     #prepare data
     data = np.concatenate((X, Y))
@@ -88,7 +87,7 @@ def rv_c2st(
     return torch.from_numpy(np.atleast_1d(scores))
 
 
-def test_same_distributions_rv():
+def test_same_distributions_alt():
 
     ndim = 5
     nsamples = 4048
@@ -98,13 +97,13 @@ def test_same_distributions_rv():
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
 
-    obs_c2st = rv_c2st(X, Y, verbosity=2)
+    obs_c2st = alt_c2st(X, Y)
 
     assert obs_c2st != None
     print(obs_c2st)
 
 
-def test_diff_distributions_rv():
+def test_diff_distributions_alt():
 
     ndim = 5
     nsamples = 4048
@@ -115,12 +114,12 @@ def test_diff_distributions_rv():
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = rv_c2st(X, Y)
+    obs_c2st = alt_c2st(X, Y)
 
     assert obs_c2st != None
     print(obs_c2st)
 
-def test_distributions_overlap_by_two_sigma_rv():
+def test_distributions_overlap_by_two_sigma_alt():
 
     ndim = 5
     nsamples = 4048
@@ -131,7 +130,7 @@ def test_distributions_overlap_by_two_sigma_rv():
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = rv_c2st(X, Y)
+    obs_c2st = alt_c2st(X, Y)
 
     assert obs_c2st != None
     print(obs_c2st)

From a3992228fa7feba1f12036bd04a89379e25cb89d Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Thu, 15 Jul 2021 17:06:15 +0200
Subject: [PATCH 04/19] demonstrated flexible API change for c2st

---
 tests/metrics_test.py | 72 +++++++++++++++++++++++++++++++------------
 1 file changed, 53 insertions(+), 19 deletions(-)

diff --git a/tests/metrics_test.py b/tests/metrics_test.py
index a9d614a42..c64c258ca 100644
--- a/tests/metrics_test.py
+++ b/tests/metrics_test.py
@@ -10,8 +10,9 @@
 from sbi.utils.metrics import c2st
 
 from sklearn.model_selection import KFold, cross_val_score
-# from sklearn.neural_network import MLPClassifier
+from sklearn.neural_network import MLPClassifier
 from sklearn.ensemble import RandomForestClassifier
+
 from torch import Tensor
 
 
@@ -23,8 +24,9 @@ def alt_c2st(
     scoring: str = "accuracy",
     z_score: bool = True,
     noise_scale: Optional[float] = None,
-        verbosity: int = 0,
-    **clf_kwargs
+    verbosity: int = 0,
+    clf_class = RandomForestClassifier,
+    clf_kwargs = {}
 
 ) -> Tensor:
     """
@@ -34,6 +36,7 @@ def alt_c2st(
     Trains classifiers with N-fold cross-validation [1]. Scikit learn MLPClassifier are
     used, with 2 hidden layers of 10x dim each, where dim is the dimensionality of the
     samples X and Y.
+
     Args:
         X: Samples from one distribution.
         Y: Samples from another distribution.
@@ -42,8 +45,10 @@ def alt_c2st(
         z_score: Z-scoring using X
         noise_scale: If passed, will add Gaussian noise with std noise_scale to samples of X and of Y
         verbosity: control the verbosity of sklearn.model_selection.cross_val_score
-        clf_kwargs: key-word arguments to sklearn.ensemble.RandomForestClassifier
+        clf_class: a scikit-learn classifier class
+        clf_kwargs: key-value arguments dictuinary to the class specified by clf_class, e.g. sklearn.ensemble.RandomForestClassifier
 
+    Example:
 
     References:
         [1]: https://scikit-learn.org/stable/modules/cross_validation.html
@@ -61,14 +66,11 @@ def alt_c2st(
     X = X.cpu().numpy()
     Y = Y.cpu().numpy()
 
-    ndim = X.shape[1]
-
-    clf = RandomForestClassifier(
+    clf = clf_class(
         random_state=seed,
-        *clf_kwargs
+        **clf_kwargs
     )
 
-
     #prepare data
     data = np.concatenate((X, Y))
     #labels
@@ -92,7 +94,7 @@ def test_same_distributions_alt():
     ndim = 5
     nsamples = 4048
 
-    xnormal = tmvn(loc=torch.zeros(5),covariance_matrix=torch.eye(5))
+    xnormal = tmvn(loc=torch.zeros(ndim),covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
@@ -100,6 +102,7 @@ def test_same_distributions_alt():
     obs_c2st = alt_c2st(X, Y)
 
     assert obs_c2st != None
+    assert .49 < obs_c2st[0] < .51 #only by chance we differentiate the 2 samples
     print(obs_c2st)
 
 
@@ -108,8 +111,8 @@ def test_diff_distributions_alt():
     ndim = 5
     nsamples = 4048
 
-    xnormal = tmvn(loc=torch.zeros(5),covariance_matrix=torch.eye(5))
-    ynormal = tmvn(loc=20.*torch.ones(5),covariance_matrix=torch.eye(5))
+    xnormal = tmvn(loc=torch.zeros(ndim),covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(loc=20.*torch.ones(ndim),covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
@@ -117,15 +120,17 @@ def test_diff_distributions_alt():
     obs_c2st = alt_c2st(X, Y)
 
     assert obs_c2st != None
+    assert .98 < obs_c2st[0] #distributions do not overlap, classifiers label with high accuracy
     print(obs_c2st)
 
+
 def test_distributions_overlap_by_two_sigma_alt():
 
     ndim = 5
     nsamples = 4048
 
-    xnormal = tmvn(loc=torch.zeros(5),covariance_matrix=torch.eye(5))
-    ynormal = tmvn(loc=1.*torch.ones(5),covariance_matrix=torch.eye(5))
+    xnormal = tmvn(loc=torch.zeros(ndim),covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(loc=1.*torch.ones(ndim),covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
@@ -136,12 +141,13 @@ def test_distributions_overlap_by_two_sigma_alt():
     print(obs_c2st)
     assert .8 < obs_c2st[0] #distributions do not overlap, classifiers label with high accuracy
 
+
 def test_same_distributions_default():
 
     ndim = 5
     nsamples = 4048
 
-    xnormal = tmvn(loc=torch.zeros(5),covariance_matrix=torch.eye(5))
+    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
@@ -152,13 +158,41 @@ def test_same_distributions_default():
     assert .49 < obs_c2st[0] < .51 #only by chance we differentiate the 2 samples
 
 
+def test_same_distributions_default_flexible_alt():
+
+    ndim = 5
+    nsamples = 4048
+
+    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
+
+    X = xnormal.sample((nsamples,))
+    Y = xnormal.sample((nsamples,))
+
+    obs_c2st = c2st(X, Y)
+
+    assert obs_c2st != None
+    assert .49 < obs_c2st[0] < .51 #only by chance we differentiate the 2 samples
+
+    clf_class = MLPClassifier
+    clf_kwargs = {"activation": "relu",
+                  "hidden_layer_sizes": (10 * X.shape[1], 10 * X.shape[1]),
+                  "max_iter": 1000,
+                  "solver": "adam"}
+
+    obs2_c2st = alt_c2st(X, Y, clf_class=clf_class, clf_kwargs=clf_kwargs)
+
+    assert obs2_c2st != None
+    assert .49 < obs2_c2st[0] < .51 #only by chance we differentiate the 2 samples
+    assert np.allclose(obs2_c2st, obs_c2st)
+
+
 def test_diff_distributions_default():
 
     ndim = 5
     nsamples = 4048
 
-    xnormal = tmvn(loc=torch.zeros(5),covariance_matrix=torch.eye(5))
-    ynormal = tmvn(loc=20.*torch.ones(5),covariance_matrix=torch.eye(5))
+    xnormal = tmvn(loc=torch.zeros(ndim),covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(loc=20.*torch.ones(ndim),covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
@@ -175,8 +209,8 @@ def test_distributions_overlap_by_two_sigma_default():
     ndim = 5
     nsamples = 4048
 
-    xnormal = tmvn(loc=torch.zeros(5),covariance_matrix=torch.eye(5))
-    ynormal = tmvn(loc=1.*torch.ones(5),covariance_matrix=torch.eye(5))
+    xnormal = tmvn(loc=torch.zeros(ndim),covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(loc=1.*torch.ones(ndim),covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))

From 4f6d1e738efe77c4e739e4d76471c2be1d7c3e89 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Thu, 15 Jul 2021 17:14:39 +0200
Subject: [PATCH 05/19] fixing the seed for comparing both implementations

---
 tests/metrics_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/metrics_test.py b/tests/metrics_test.py
index c64c258ca..ca038cdc7 100644
--- a/tests/metrics_test.py
+++ b/tests/metrics_test.py
@@ -168,7 +168,7 @@ def test_same_distributions_default_flexible_alt():
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
 
-    obs_c2st = c2st(X, Y)
+    obs_c2st = c2st(X, Y, seed=42)
 
     assert obs_c2st != None
     assert .49 < obs_c2st[0] < .51 #only by chance we differentiate the 2 samples
@@ -179,7 +179,7 @@ def test_same_distributions_default_flexible_alt():
                   "max_iter": 1000,
                   "solver": "adam"}
 
-    obs2_c2st = alt_c2st(X, Y, clf_class=clf_class, clf_kwargs=clf_kwargs)
+    obs2_c2st = alt_c2st(X, Y, seed=42, clf_class=clf_class, clf_kwargs=clf_kwargs)
 
     assert obs2_c2st != None
     assert .49 < obs2_c2st[0] < .51 #only by chance we differentiate the 2 samples

From 3550241459b962941c77a5472f30682f6a9a61b6 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Mon, 24 Jan 2022 10:59:06 +0100
Subject: [PATCH 06/19] swapped in RF based c2st into main package

---
 sbi/utils/metrics.py  |  59 ++++++++++++++-------
 tests/metrics_test.py | 119 ++++++++++++++++++++----------------------
 2 files changed, 96 insertions(+), 82 deletions(-)

diff --git a/sbi/utils/metrics.py b/sbi/utils/metrics.py
index 3eeffa66c..a8910fda7 100644
--- a/sbi/utils/metrics.py
+++ b/sbi/utils/metrics.py
@@ -5,8 +5,8 @@
 
 import numpy as np
 import torch
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import KFold, cross_val_score
-from sklearn.neural_network import MLPClassifier
 from torch import Tensor
 
 from sbi.utils import tensor2numpy
@@ -20,12 +20,26 @@ def c2st(
     scoring: str = "accuracy",
     z_score: bool = True,
     noise_scale: Optional[float] = None,
+    verbosity: int = 0,
+    clf_class=RandomForestClassifier,
+    clf_kwargs={},
 ) -> Tensor:
-    """Return accuracy of classifier trained to distinguish samples from two distributions.
+    """
+    Return accuracy of classifier trained to distinguish samples from supposedly
+    two distributions <X> and <Y>. For details on the method, see [1].
+    If the returned accuracy is 0.5, <X> and <Y> are considered to be from the
+    same generating PDF, i.e. they can not be differentiated.
+    If the returned accuracy is around 1., <X> and <Y> are considered to be from
+    two different generating PDFs.
+
+    Trains classifiers with N-fold cross-validation [2]. By default, a `RandomForestClassifier`
+    by scikit-learn is used. This can be adopted using <clf_class> and
+    <clf_kwargs> as in:
+
+    ```
+    clf = clf_class(random_state=seed, **clf_kwargs)
+    ```
 
-    Trains classifiers with N-fold cross-validation [1]. Scikit learn MLPClassifier are
-    used, with 2 hidden layers of 10x dim each, where dim is the dimensionality of the
-    samples X and Y.
     Args:
         X: Samples from one distribution.
         Y: Samples from another distribution.
@@ -33,9 +47,15 @@ def c2st(
         n_folds: Number of folds
         z_score: Z-scoring using X
         noise_scale: If passed, will add Gaussian noise with std noise_scale to samples of X and of Y
+        verbosity: control the verbosity of sklearn.model_selection.cross_val_score
+        clf_class: a scikit-learn classifier class
+        clf_kwargs: key-value arguments dictuinary to the class specified by clf_class, e.g. sklearn.ensemble.RandomForestClassifier
+
+    Example:
 
     References:
-        [1]: https://scikit-learn.org/stable/modules/cross_validation.html
+        [1]: http://arxiv.org/abs/1610.06545
+        [2]: https://scikit-learn.org/stable/modules/cross_validation.html
     """
     if z_score:
         X_mean = torch.mean(X, axis=0)
@@ -47,24 +67,25 @@ def c2st(
         X += noise_scale * torch.randn(X.shape)
         Y += noise_scale * torch.randn(Y.shape)
 
-    X = tensor2numpy(X)
-    Y = tensor2numpy(Y)
+    X = X.cpu().numpy()
+    Y = Y.cpu().numpy()
 
-    ndim = X.shape[1]
-
-    clf = MLPClassifier(
-        activation="relu",
-        hidden_layer_sizes=(10 * ndim, 10 * ndim),
-        max_iter=1000,
-        solver="adam",
-        random_state=seed,
-    )
+    clf = clf_class(random_state=seed, **clf_kwargs)
 
+    # prepare data
     data = np.concatenate((X, Y))
-    target = np.concatenate((np.zeros((X.shape[0],)), np.ones((Y.shape[0],))))
+    # labels
+    target = np.concatenate(
+        (
+            np.zeros((X.shape[0],)),
+            np.ones((Y.shape[0],)),
+        )
+    )
 
     shuffle = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
-    scores = cross_val_score(clf, data, target, cv=shuffle, scoring=scoring)
+    scores = cross_val_score(
+        clf, data, target, cv=shuffle, scoring=scoring, verbose=verbosity
+    )
 
     scores = np.asarray(np.mean(scores)).astype(np.float32)
     return torch.from_numpy(np.atleast_1d(scores))
diff --git a/tests/metrics_test.py b/tests/metrics_test.py
index ca038cdc7..aa218c778 100644
--- a/tests/metrics_test.py
+++ b/tests/metrics_test.py
@@ -4,19 +4,17 @@
 from __future__ import annotations
 
 import numpy as np
-
 import torch
-from torch.distributions import MultivariateNormal as tmvn
-from sbi.utils.metrics import c2st
-
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import KFold, cross_val_score
 from sklearn.neural_network import MLPClassifier
-from sklearn.ensemble import RandomForestClassifier
-
 from torch import Tensor
+from torch.distributions import MultivariateNormal as tmvn
 
+from sbi.utils.metrics import c2st
 
-def alt_c2st(
+
+def old_c2st(
     X: Tensor,
     Y: Tensor,
     seed: int = 1,
@@ -24,19 +22,12 @@ def alt_c2st(
     scoring: str = "accuracy",
     z_score: bool = True,
     noise_scale: Optional[float] = None,
-    verbosity: int = 0,
-    clf_class = RandomForestClassifier,
-    clf_kwargs = {}
-
 ) -> Tensor:
-    """
-    [Alternative implementation of c2st]
-    Return accuracy of classifier trained to distinguish samples from two distributions.
+    """Return accuracy of classifier trained to distinguish samples from two distributions.
 
     Trains classifiers with N-fold cross-validation [1]. Scikit learn MLPClassifier are
     used, with 2 hidden layers of 10x dim each, where dim is the dimensionality of the
     samples X and Y.
-
     Args:
         X: Samples from one distribution.
         Y: Samples from another distribution.
@@ -44,11 +35,6 @@ def alt_c2st(
         n_folds: Number of folds
         z_score: Z-scoring using X
         noise_scale: If passed, will add Gaussian noise with std noise_scale to samples of X and of Y
-        verbosity: control the verbosity of sklearn.model_selection.cross_val_score
-        clf_class: a scikit-learn classifier class
-        clf_kwargs: key-value arguments dictuinary to the class specified by clf_class, e.g. sklearn.ensemble.RandomForestClassifier
-
-    Example:
 
     References:
         [1]: https://scikit-learn.org/stable/modules/cross_validation.html
@@ -63,27 +49,24 @@ def alt_c2st(
         X += noise_scale * torch.randn(X.shape)
         Y += noise_scale * torch.randn(Y.shape)
 
-    X = X.cpu().numpy()
-    Y = Y.cpu().numpy()
+    X = tensor2numpy(X)
+    Y = tensor2numpy(Y)
 
-    clf = clf_class(
+    ndim = X.shape[1]
+
+    clf = MLPClassifier(
+        activation="relu",
+        hidden_layer_sizes=(10 * ndim, 10 * ndim),
+        max_iter=1000,
+        solver="adam",
         random_state=seed,
-        **clf_kwargs
     )
 
-    #prepare data
     data = np.concatenate((X, Y))
-    #labels
-    target = np.concatenate(
-        (
-            np.zeros((X.shape[0],)),
-            np.ones((Y.shape[0],)),
-        )
-    )
+    target = np.concatenate((np.zeros((X.shape[0],)), np.ones((Y.shape[0],))))
 
     shuffle = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
-    scores = cross_val_score(clf, data, target, cv=shuffle, scoring=scoring,
-                             verbose=verbosity)
+    scores = cross_val_score(clf, data, target, cv=shuffle, scoring=scoring)
 
     scores = np.asarray(np.mean(scores)).astype(np.float32)
     return torch.from_numpy(np.atleast_1d(scores))
@@ -94,15 +77,15 @@ def test_same_distributions_alt():
     ndim = 5
     nsamples = 4048
 
-    xnormal = tmvn(loc=torch.zeros(ndim),covariance_matrix=torch.eye(ndim))
+    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
 
-    obs_c2st = alt_c2st(X, Y)
+    obs_c2st = new_c2st(X, Y)
 
     assert obs_c2st != None
-    assert .49 < obs_c2st[0] < .51 #only by chance we differentiate the 2 samples
+    assert 0.49 < obs_c2st[0] < 0.51  # only by chance we differentiate the 2 samples
     print(obs_c2st)
 
 
@@ -111,16 +94,18 @@ def test_diff_distributions_alt():
     ndim = 5
     nsamples = 4048
 
-    xnormal = tmvn(loc=torch.zeros(ndim),covariance_matrix=torch.eye(ndim))
-    ynormal = tmvn(loc=20.*torch.ones(ndim),covariance_matrix=torch.eye(ndim))
+    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(loc=20.0 * torch.ones(ndim), covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = alt_c2st(X, Y)
+    obs_c2st = new_c2st(X, Y)
 
     assert obs_c2st != None
-    assert .98 < obs_c2st[0] #distributions do not overlap, classifiers label with high accuracy
+    assert (
+        0.98 < obs_c2st[0]
+    )  # distributions do not overlap, classifiers label with high accuracy
     print(obs_c2st)
 
 
@@ -129,17 +114,19 @@ def test_distributions_overlap_by_two_sigma_alt():
     ndim = 5
     nsamples = 4048
 
-    xnormal = tmvn(loc=torch.zeros(ndim),covariance_matrix=torch.eye(ndim))
-    ynormal = tmvn(loc=1.*torch.ones(ndim),covariance_matrix=torch.eye(ndim))
+    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(loc=1.0 * torch.ones(ndim), covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = alt_c2st(X, Y)
+    obs_c2st = new_c2st(X, Y)
 
     assert obs_c2st != None
     print(obs_c2st)
-    assert .8 < obs_c2st[0] #distributions do not overlap, classifiers label with high accuracy
+    assert (
+        0.8 < obs_c2st[0]
+    )  # distributions do not overlap, classifiers label with high accuracy
 
 
 def test_same_distributions_default():
@@ -152,10 +139,10 @@ def test_same_distributions_default():
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
 
-    obs_c2st = c2st(X, Y)
+    obs_c2st = old_c2st(X, Y)
 
     assert obs_c2st != None
-    assert .49 < obs_c2st[0] < .51 #only by chance we differentiate the 2 samples
+    assert 0.49 < obs_c2st[0] < 0.51  # only by chance we differentiate the 2 samples
 
 
 def test_same_distributions_default_flexible_alt():
@@ -168,21 +155,23 @@ def test_same_distributions_default_flexible_alt():
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
 
-    obs_c2st = c2st(X, Y, seed=42)
+    obs_c2st = old_c2st(X, Y, seed=42)
 
     assert obs_c2st != None
-    assert .49 < obs_c2st[0] < .51 #only by chance we differentiate the 2 samples
+    assert 0.49 < obs_c2st[0] < 0.51  # only by chance we differentiate the 2 samples
 
     clf_class = MLPClassifier
-    clf_kwargs = {"activation": "relu",
-                  "hidden_layer_sizes": (10 * X.shape[1], 10 * X.shape[1]),
-                  "max_iter": 1000,
-                  "solver": "adam"}
+    clf_kwargs = {
+        "activation": "relu",
+        "hidden_layer_sizes": (10 * X.shape[1], 10 * X.shape[1]),
+        "max_iter": 1000,
+        "solver": "adam",
+    }
 
-    obs2_c2st = alt_c2st(X, Y, seed=42, clf_class=clf_class, clf_kwargs=clf_kwargs)
+    obs2_c2st = new_c2st(X, Y, seed=42, clf_class=clf_class, clf_kwargs=clf_kwargs)
 
     assert obs2_c2st != None
-    assert .49 < obs2_c2st[0] < .51 #only by chance we differentiate the 2 samples
+    assert 0.49 < obs2_c2st[0] < 0.51  # only by chance we differentiate the 2 samples
     assert np.allclose(obs2_c2st, obs_c2st)
 
 
@@ -191,17 +180,19 @@ def test_diff_distributions_default():
     ndim = 5
     nsamples = 4048
 
-    xnormal = tmvn(loc=torch.zeros(ndim),covariance_matrix=torch.eye(ndim))
-    ynormal = tmvn(loc=20.*torch.ones(ndim),covariance_matrix=torch.eye(ndim))
+    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(loc=20.0 * torch.ones(ndim), covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = c2st(X, Y)
+    obs_c2st = old_c2st(X, Y)
 
     assert obs_c2st != None
     print(obs_c2st)
-    assert .98 < obs_c2st[0] #distributions do not overlap, classifiers label with high accuracy
+    assert (
+        0.98 < obs_c2st[0]
+    )  # distributions do not overlap, classifiers label with high accuracy
 
 
 def test_distributions_overlap_by_two_sigma_default():
@@ -209,14 +200,16 @@ def test_distributions_overlap_by_two_sigma_default():
     ndim = 5
     nsamples = 4048
 
-    xnormal = tmvn(loc=torch.zeros(ndim),covariance_matrix=torch.eye(ndim))
-    ynormal = tmvn(loc=1.*torch.ones(ndim),covariance_matrix=torch.eye(ndim))
+    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(loc=1.0 * torch.ones(ndim), covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = c2st(X, Y)
+    obs_c2st = old_c2st(X, Y)
 
     assert obs_c2st != None
     print(obs_c2st)
-    assert .8 < obs_c2st[0] #distributions do not overlap, classifiers label with high accuracy
+    assert (
+        0.8 < obs_c2st[0]
+    )  # distributions do not overlap, classifiers label with high accuracy

From 527a0dd423d02a73a6977085543f24d746ecce36 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Wed, 26 Jan 2022 12:08:24 +0100
Subject: [PATCH 07/19] revised setup of unit tests

---
 tests/metrics_test.py | 139 +++++++++++++++++++++++-------------------
 1 file changed, 76 insertions(+), 63 deletions(-)

diff --git a/tests/metrics_test.py b/tests/metrics_test.py
index aa218c778..b9868e63d 100644
--- a/tests/metrics_test.py
+++ b/tests/metrics_test.py
@@ -13,68 +13,47 @@
 
 from sbi.utils.metrics import c2st
 
+## c2st related
+## for a small study about c2st see https://github.com/psteinb/c2st/
 
-def old_c2st(
-    X: Tensor,
-    Y: Tensor,
+
+def nn_c2st(
+    X: np.ndarray,
+    Y: np.ndarray,
     seed: int = 1,
     n_folds: int = 5,
     scoring: str = "accuracy",
     z_score: bool = True,
     noise_scale: Optional[float] = None,
-) -> Tensor:
-    """Return accuracy of classifier trained to distinguish samples from two distributions.
-
-    Trains classifiers with N-fold cross-validation [1]. Scikit learn MLPClassifier are
-    used, with 2 hidden layers of 10x dim each, where dim is the dimensionality of the
-    samples X and Y.
-    Args:
-        X: Samples from one distribution.
-        Y: Samples from another distribution.
-        seed: Seed for sklearn
-        n_folds: Number of folds
-        z_score: Z-scoring using X
-        noise_scale: If passed, will add Gaussian noise with std noise_scale to samples of X and of Y
-
-    References:
-        [1]: https://scikit-learn.org/stable/modules/cross_validation.html
-    """
-    if z_score:
-        X_mean = torch.mean(X, axis=0)
-        X_std = torch.std(X, axis=0)
-        X = (X - X_mean) / X_std
-        Y = (Y - X_mean) / X_std
-
-    if noise_scale is not None:
-        X += noise_scale * torch.randn(X.shape)
-        Y += noise_scale * torch.randn(Y.shape)
-
-    X = tensor2numpy(X)
-    Y = tensor2numpy(Y)
+    verbosity: int = 0,
+) -> np.ndarray:
 
     ndim = X.shape[1]
+    clf_class = MLPClassifier
+    clf_kwargs = {
+        "activation": "relu",
+        "hidden_layer_sizes": (10 * ndim, 10 * ndim),
+        "max_iter": 1000,
+        "solver": "adam",
+    }
 
-    clf = MLPClassifier(
-        activation="relu",
-        hidden_layer_sizes=(10 * ndim, 10 * ndim),
-        max_iter=1000,
-        solver="adam",
-        random_state=seed,
+    return c2st(
+        X,
+        Y,
+        seed,
+        n_folds,
+        scoring,
+        z_score,
+        noise_scale,
+        verbosity,
+        clf_class,
+        clf_kwargs,
     )
 
-    data = np.concatenate((X, Y))
-    target = np.concatenate((np.zeros((X.shape[0],)), np.ones((Y.shape[0],))))
-
-    shuffle = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
-    scores = cross_val_score(clf, data, target, cv=shuffle, scoring=scoring)
-
-    scores = np.asarray(np.mean(scores)).astype(np.float32)
-    return torch.from_numpy(np.atleast_1d(scores))
-
 
 def test_same_distributions_alt():
 
-    ndim = 5
+    ndim = 10
     nsamples = 4048
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
@@ -82,7 +61,7 @@ def test_same_distributions_alt():
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
 
-    obs_c2st = new_c2st(X, Y)
+    obs_c2st = c2st(X, Y)
 
     assert obs_c2st != None
     assert 0.49 < obs_c2st[0] < 0.51  # only by chance we differentiate the 2 samples
@@ -91,7 +70,7 @@ def test_same_distributions_alt():
 
 def test_diff_distributions_alt():
 
-    ndim = 5
+    ndim = 10
     nsamples = 4048
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
@@ -100,7 +79,7 @@ def test_diff_distributions_alt():
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = new_c2st(X, Y)
+    obs_c2st = c2st(X, Y)
 
     assert obs_c2st != None
     assert (
@@ -109,9 +88,9 @@ def test_diff_distributions_alt():
     print(obs_c2st)
 
 
-def test_distributions_overlap_by_two_sigma_alt():
+def test_distributions_overlap_by_one_sigma_alt():
 
-    ndim = 5
+    ndim = 10
     nsamples = 4048
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
@@ -120,7 +99,7 @@ def test_distributions_overlap_by_two_sigma_alt():
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = new_c2st(X, Y)
+    obs_c2st = c2st(X, Y)
 
     assert obs_c2st != None
     print(obs_c2st)
@@ -129,9 +108,10 @@ def test_distributions_overlap_by_two_sigma_alt():
     )  # distributions do not overlap, classifiers label with high accuracy
 
 
+@pytest.mark.slow
 def test_same_distributions_default():
 
-    ndim = 5
+    ndim = 10
     nsamples = 4048
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
@@ -139,15 +119,16 @@ def test_same_distributions_default():
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
 
-    obs_c2st = old_c2st(X, Y)
+    obs_c2st = nn_c2st(X, Y)
 
     assert obs_c2st != None
     assert 0.49 < obs_c2st[0] < 0.51  # only by chance we differentiate the 2 samples
 
 
+@pytest.mark.slow
 def test_same_distributions_default_flexible_alt():
 
-    ndim = 5
+    ndim = 10
     nsamples = 4048
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
@@ -155,7 +136,7 @@ def test_same_distributions_default_flexible_alt():
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
 
-    obs_c2st = old_c2st(X, Y, seed=42)
+    obs_c2st = nn_c2st(X, Y, seed=42)
 
     assert obs_c2st != None
     assert 0.49 < obs_c2st[0] < 0.51  # only by chance we differentiate the 2 samples
@@ -168,16 +149,17 @@ def test_same_distributions_default_flexible_alt():
         "solver": "adam",
     }
 
-    obs2_c2st = new_c2st(X, Y, seed=42, clf_class=clf_class, clf_kwargs=clf_kwargs)
+    obs2_c2st = c2st(X, Y, seed=42, clf_class=clf_class, clf_kwargs=clf_kwargs)
 
     assert obs2_c2st != None
     assert 0.49 < obs2_c2st[0] < 0.51  # only by chance we differentiate the 2 samples
     assert np.allclose(obs2_c2st, obs_c2st)
 
 
+@pytest.mark.slow
 def test_diff_distributions_default():
 
-    ndim = 5
+    ndim = 10
     nsamples = 4048
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
@@ -186,7 +168,7 @@ def test_diff_distributions_default():
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = old_c2st(X, Y)
+    obs_c2st = nn_c2st(X, Y)
 
     assert obs_c2st != None
     print(obs_c2st)
@@ -195,9 +177,10 @@ def test_diff_distributions_default():
     )  # distributions do not overlap, classifiers label with high accuracy
 
 
+@pytest.mark.slow
 def test_distributions_overlap_by_two_sigma_default():
 
-    ndim = 5
+    ndim = 10
     nsamples = 4048
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
@@ -206,10 +189,40 @@ def test_distributions_overlap_by_two_sigma_default():
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = old_c2st(X, Y)
+    obs_c2st = nn_c2st(X, Y)
 
     assert obs_c2st != None
     print(obs_c2st)
     assert (
         0.8 < obs_c2st[0]
     )  # distributions do not overlap, classifiers label with high accuracy
+
+
+def test_with_different_classifyer():
+
+    ndim = 10
+    nsamples = 256
+
+    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(loc=10 + torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
+
+    X = xnormal.sample((nsamples,))
+    Y = ynormal.sample((nsamples,))
+
+    exp_c2st = c2st(X, Y)
+    assert 0.9 < exp_c2st[0]
+
+    clf_class_ = MLPClassifier
+    clf_kwargs_ = {
+        "activation": "relu",
+        "hidden_layer_sizes": (10 * ndim, 5 * ndim),
+        "max_iter": 1000,
+        "solver": "adam",
+    }
+
+    obs_c2st = c2st(X, Y, clf_class=clf_class_, clf_kwargs=clf_kwargs_)
+
+    assert obs_c2st != None
+    assert obs_c2st[0] < 1
+    assert 0.9 < obs_c2st[0]
+    assert torch.allclose(exp_c2st, obs_c2st, rtol=0.1)

From eef25d83be46129d2cd5dc3b647e308e10fbc447 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Wed, 26 Jan 2022 12:08:33 +0100
Subject: [PATCH 08/19] more details in the docstring

---
 sbi/utils/metrics.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/sbi/utils/metrics.py b/sbi/utils/metrics.py
index a8910fda7..4873977bf 100644
--- a/sbi/utils/metrics.py
+++ b/sbi/utils/metrics.py
@@ -26,36 +26,47 @@ def c2st(
 ) -> Tensor:
     """
     Return accuracy of classifier trained to distinguish samples from supposedly
-    two distributions <X> and <Y>. For details on the method, see [1].
+    two distributions <X> and <Y>. For details on the method, see [1,2].
     If the returned accuracy is 0.5, <X> and <Y> are considered to be from the
     same generating PDF, i.e. they can not be differentiated.
     If the returned accuracy is around 1., <X> and <Y> are considered to be from
     two different generating PDFs.
 
-    Trains classifiers with N-fold cross-validation [2]. By default, a `RandomForestClassifier`
-    by scikit-learn is used. This can be adopted using <clf_class> and
-    <clf_kwargs> as in:
+    Training of the classifier with N-fold cross-validation [3] using sklearn.
+    By default, a `RandomForestClassifier` by from `sklearn.ensemble` is used.
+    This can be changed using <clf_class>.
 
-    ```
+    ``` py
     clf = clf_class(random_state=seed, **clf_kwargs)
+    #...
+    scores = cross_val_score(
+        clf, data, target, cv=shuffle, scoring=scoring, verbose=verbosity
+    )
     ```
+    Further configuration of the classifier can be performed using <clf_kwargs>.
 
     Args:
         X: Samples from one distribution.
         Y: Samples from another distribution.
-        seed: Seed for sklearn
-        n_folds: Number of folds
-        z_score: Z-scoring using X
+        seed: Seed for the sklearn classifier and the KFold cross-validation
+        n_folds: Number of folds to use
+        z_score: Z-scoring using X, i.e. mean and std deviation of X is used to normalize Y using (Y - mean)/std
         noise_scale: If passed, will add Gaussian noise with std noise_scale to samples of X and of Y
         verbosity: control the verbosity of sklearn.model_selection.cross_val_score
         clf_class: a scikit-learn classifier class
-        clf_kwargs: key-value arguments dictuinary to the class specified by clf_class, e.g. sklearn.ensemble.RandomForestClassifier
+        clf_kwargs: key-value arguments dictionary to the class specified by clf_class, e.g. sklearn.ensemble.RandomForestClassifier
 
     Example:
+    ``` py
+    > c2st(X,Y)
+    [0.51904464]
+    #X and Y likely come from the same PDF or ensemble
+    ```
 
     References:
         [1]: http://arxiv.org/abs/1610.06545
-        [2]: https://scikit-learn.org/stable/modules/cross_validation.html
+        [2]: https://www.osti.gov/biblio/826696/
+        [3]: https://scikit-learn.org/stable/modules/cross_validation.html
     """
     if z_score:
         X_mean = torch.mean(X, axis=0)

From 086c842bebc96a05924b6bf3fe436c3fcb17947f Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Wed, 26 Jan 2022 12:19:26 +0100
Subject: [PATCH 09/19] added return value description

---
 sbi/utils/metrics.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sbi/utils/metrics.py b/sbi/utils/metrics.py
index 4873977bf..73a139b83 100644
--- a/sbi/utils/metrics.py
+++ b/sbi/utils/metrics.py
@@ -56,6 +56,10 @@ def c2st(
         clf_class: a scikit-learn classifier class
         clf_kwargs: key-value arguments dictionary to the class specified by clf_class, e.g. sklearn.ensemble.RandomForestClassifier
 
+    Return:
+        torch.tensor offering the mean accuracy score over the test sets
+        from cross-validation
+
     Example:
     ``` py
     > c2st(X,Y)

From 4ec23922fdfe96e5010fe73ad3112c7e5a49d0cb Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Wed, 26 Jan 2022 14:42:45 +0100
Subject: [PATCH 10/19] adding missing import

---
 tests/metrics_test.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/metrics_test.py b/tests/metrics_test.py
index b9868e63d..38eedfbdd 100644
--- a/tests/metrics_test.py
+++ b/tests/metrics_test.py
@@ -4,6 +4,7 @@
 from __future__ import annotations
 
 import numpy as np
+import pytest
 import torch
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import KFold, cross_val_score
@@ -13,8 +14,8 @@
 
 from sbi.utils.metrics import c2st
 
-## c2st related
-## for a small study about c2st see https://github.com/psteinb/c2st/
+## c2st related:
+## for a study about c2st see https://github.com/psteinb/c2st/
 
 
 def nn_c2st(

From d47acfb2e21e0fdab1f70c9da12ab76c6c21ac65 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Wed, 26 Jan 2022 14:54:41 +0100
Subject: [PATCH 11/19] help black format the code better

---
 sbi/utils/metrics.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/sbi/utils/metrics.py b/sbi/utils/metrics.py
index 73a139b83..802a4385a 100644
--- a/sbi/utils/metrics.py
+++ b/sbi/utils/metrics.py
@@ -90,12 +90,7 @@ def c2st(
     # prepare data
     data = np.concatenate((X, Y))
     # labels
-    target = np.concatenate(
-        (
-            np.zeros((X.shape[0],)),
-            np.ones((Y.shape[0],)),
-        )
-    )
+    target = np.concatenate((np.zeros((X.shape[0],)), np.ones((Y.shape[0],))))
 
     shuffle = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
     scores = cross_val_score(

From 9003bb907f59b32f474b8d617754ce56e34164cb Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Wed, 26 Jan 2022 15:25:07 +0100
Subject: [PATCH 12/19] reducing the runtime of tests

- reducing the sample count based on studies in
github.com/psteinb/c2st
- expanding the bounds for assert statements to comply to
uncertainties obtained there
- renaming tests to better match their scope
---
 tests/metrics_test.py | 66 +++++++++++++++----------------------------
 1 file changed, 22 insertions(+), 44 deletions(-)

diff --git a/tests/metrics_test.py b/tests/metrics_test.py
index 38eedfbdd..959f3523f 100644
--- a/tests/metrics_test.py
+++ b/tests/metrics_test.py
@@ -52,10 +52,10 @@ def nn_c2st(
     )
 
 
-def test_same_distributions_alt():
+def test_same_distributions():
 
     ndim = 10
-    nsamples = 4048
+    nsamples = 1024
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
 
@@ -65,14 +65,13 @@ def test_same_distributions_alt():
     obs_c2st = c2st(X, Y)
 
     assert obs_c2st != None
-    assert 0.49 < obs_c2st[0] < 0.51  # only by chance we differentiate the 2 samples
-    print(obs_c2st)
+    assert 0.45 < obs_c2st[0] < 0.55  # only by chance we differentiate the 2 samples
 
 
-def test_diff_distributions_alt():
+def test_diff_distributions():
 
     ndim = 10
-    nsamples = 4048
+    nsamples = 1024
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
     ynormal = tmvn(loc=20.0 * torch.ones(ndim), covariance_matrix=torch.eye(ndim))
@@ -89,13 +88,13 @@ def test_diff_distributions_alt():
     print(obs_c2st)
 
 
-def test_distributions_overlap_by_one_sigma_alt():
+def test_distributions_overlap_by_one_sigma():
 
     ndim = 10
-    nsamples = 4048
+    nsamples = 1024
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
-    ynormal = tmvn(loc=1.0 * torch.ones(ndim), covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(loc=torch.ones(ndim), covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
@@ -105,15 +104,15 @@ def test_distributions_overlap_by_one_sigma_alt():
     assert obs_c2st != None
     print(obs_c2st)
     assert (
-        0.8 < obs_c2st[0]
+        0.85 < obs_c2st[0]
     )  # distributions do not overlap, classifiers label with high accuracy
 
 
 @pytest.mark.slow
-def test_same_distributions_default():
+def test_same_distributions_nn():
 
     ndim = 10
-    nsamples = 4048
+    nsamples = 1024
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
 
@@ -123,24 +122,25 @@ def test_same_distributions_default():
     obs_c2st = nn_c2st(X, Y)
 
     assert obs_c2st != None
-    assert 0.49 < obs_c2st[0] < 0.51  # only by chance we differentiate the 2 samples
+    assert 0.45 < obs_c2st[0] < 0.55  # only by chance we differentiate the 2 samples
 
 
 @pytest.mark.slow
-def test_same_distributions_default_flexible_alt():
+def test_diff_distributions_flexible():
 
     ndim = 10
-    nsamples = 4048
+    nsamples = 1024
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(loc=20.0 * torch.ones(ndim), covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
-    Y = xnormal.sample((nsamples,))
+    Y = ynormal.sample((nsamples,))
 
     obs_c2st = nn_c2st(X, Y, seed=42)
 
     assert obs_c2st != None
-    assert 0.49 < obs_c2st[0] < 0.51  # only by chance we differentiate the 2 samples
+    assert 0.95 < obs_c2st[0]
 
     clf_class = MLPClassifier
     clf_kwargs = {
@@ -153,39 +153,18 @@ def test_same_distributions_default_flexible_alt():
     obs2_c2st = c2st(X, Y, seed=42, clf_class=clf_class, clf_kwargs=clf_kwargs)
 
     assert obs2_c2st != None
-    assert 0.49 < obs2_c2st[0] < 0.51  # only by chance we differentiate the 2 samples
+    assert 0.95 < obs2_c2st[0]  # only by chance we differentiate the 2 samples
     assert np.allclose(obs2_c2st, obs_c2st)
 
 
 @pytest.mark.slow
-def test_diff_distributions_default():
-
-    ndim = 10
-    nsamples = 4048
-
-    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
-    ynormal = tmvn(loc=20.0 * torch.ones(ndim), covariance_matrix=torch.eye(ndim))
-
-    X = xnormal.sample((nsamples,))
-    Y = ynormal.sample((nsamples,))
-
-    obs_c2st = nn_c2st(X, Y)
-
-    assert obs_c2st != None
-    print(obs_c2st)
-    assert (
-        0.98 < obs_c2st[0]
-    )  # distributions do not overlap, classifiers label with high accuracy
-
-
-@pytest.mark.slow
-def test_distributions_overlap_by_two_sigma_default():
+def test_distributions_overlap_by_two_sigma_mlp():
 
     ndim = 10
-    nsamples = 4048
+    nsamples = 1024
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
-    ynormal = tmvn(loc=1.0 * torch.ones(ndim), covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(loc=torch.ones(ndim), covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
@@ -199,7 +178,7 @@ def test_distributions_overlap_by_two_sigma_default():
     )  # distributions do not overlap, classifiers label with high accuracy
 
 
-def test_with_different_classifyer():
+def test_interface_with_different_classifyer():
 
     ndim = 10
     nsamples = 256
@@ -224,6 +203,5 @@ def test_with_different_classifyer():
     obs_c2st = c2st(X, Y, clf_class=clf_class_, clf_kwargs=clf_kwargs_)
 
     assert obs_c2st != None
-    assert obs_c2st[0] < 1
     assert 0.9 < obs_c2st[0]
     assert torch.allclose(exp_c2st, obs_c2st, rtol=0.1)

From 4f0976a313c4751d950879a7200da18526468e71 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Wed, 26 Jan 2022 15:44:21 +0100
Subject: [PATCH 13/19] constistent naming scheme and removing one more test

---
 tests/metrics_test.py | 37 ++++---------------------------------
 1 file changed, 4 insertions(+), 33 deletions(-)

diff --git a/tests/metrics_test.py b/tests/metrics_test.py
index 959f3523f..5b2c51023 100644
--- a/tests/metrics_test.py
+++ b/tests/metrics_test.py
@@ -88,7 +88,7 @@ def test_diff_distributions():
     print(obs_c2st)
 
 
-def test_distributions_overlap_by_one_sigma():
+def test_onesigma_apart_distributions():
 
     ndim = 10
     nsamples = 1024
@@ -109,7 +109,7 @@ def test_distributions_overlap_by_one_sigma():
 
 
 @pytest.mark.slow
-def test_same_distributions_nn():
+def test_same_distributions_mlp():
 
     ndim = 10
     nsamples = 1024
@@ -126,7 +126,7 @@ def test_same_distributions_nn():
 
 
 @pytest.mark.slow
-def test_diff_distributions_flexible():
+def test_diff_distributions_flexible_mlp():
 
     ndim = 10
     nsamples = 1024
@@ -158,7 +158,7 @@ def test_diff_distributions_flexible():
 
 
 @pytest.mark.slow
-def test_distributions_overlap_by_two_sigma_mlp():
+def test_onesigma_apart_distributions_mlp():
 
     ndim = 10
     nsamples = 1024
@@ -176,32 +176,3 @@ def test_distributions_overlap_by_two_sigma_mlp():
     assert (
         0.8 < obs_c2st[0]
     )  # distributions do not overlap, classifiers label with high accuracy
-
-
-def test_interface_with_different_classifyer():
-
-    ndim = 10
-    nsamples = 256
-
-    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
-    ynormal = tmvn(loc=10 + torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
-
-    X = xnormal.sample((nsamples,))
-    Y = ynormal.sample((nsamples,))
-
-    exp_c2st = c2st(X, Y)
-    assert 0.9 < exp_c2st[0]
-
-    clf_class_ = MLPClassifier
-    clf_kwargs_ = {
-        "activation": "relu",
-        "hidden_layer_sizes": (10 * ndim, 5 * ndim),
-        "max_iter": 1000,
-        "solver": "adam",
-    }
-
-    obs_c2st = c2st(X, Y, clf_class=clf_class_, clf_kwargs=clf_kwargs_)
-
-    assert obs_c2st != None
-    assert 0.9 < obs_c2st[0]
-    assert torch.allclose(exp_c2st, obs_c2st, rtol=0.1)

From 3e5f7d68bb9686ae11ef9305e54c36d496128bf7 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Wed, 26 Jan 2022 15:49:41 +0100
Subject: [PATCH 14/19] seeding all test cases

---
 tests/metrics_test.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/tests/metrics_test.py b/tests/metrics_test.py
index 5b2c51023..d003d5953 100644
--- a/tests/metrics_test.py
+++ b/tests/metrics_test.py
@@ -52,7 +52,7 @@ def nn_c2st(
     )
 
 
-def test_same_distributions():
+def test_same_distributions(set_seed):
 
     ndim = 10
     nsamples = 1024
@@ -62,13 +62,13 @@ def test_same_distributions():
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
 
-    obs_c2st = c2st(X, Y)
+    obs_c2st = c2st(X, Y, seed=set_seed)
 
     assert obs_c2st != None
     assert 0.45 < obs_c2st[0] < 0.55  # only by chance we differentiate the 2 samples
 
 
-def test_diff_distributions():
+def test_diff_distributions(set_seed):
 
     ndim = 10
     nsamples = 1024
@@ -79,7 +79,7 @@ def test_diff_distributions():
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = c2st(X, Y)
+    obs_c2st = c2st(X, Y, seed=set_seed)
 
     assert obs_c2st != None
     assert (
@@ -88,7 +88,7 @@ def test_diff_distributions():
     print(obs_c2st)
 
 
-def test_onesigma_apart_distributions():
+def test_onesigma_apart_distributions(set_seed):
 
     ndim = 10
     nsamples = 1024
@@ -99,7 +99,7 @@ def test_onesigma_apart_distributions():
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = c2st(X, Y)
+    obs_c2st = c2st(X, Y, seed=set_seed))
 
     assert obs_c2st != None
     print(obs_c2st)
@@ -109,7 +109,7 @@ def test_onesigma_apart_distributions():
 
 
 @pytest.mark.slow
-def test_same_distributions_mlp():
+def test_same_distributions_mlp(set_seed):
 
     ndim = 10
     nsamples = 1024
@@ -119,14 +119,14 @@ def test_same_distributions_mlp():
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
 
-    obs_c2st = nn_c2st(X, Y)
+    obs_c2st = nn_c2st(X, Y, seed=set_seed))
 
     assert obs_c2st != None
     assert 0.45 < obs_c2st[0] < 0.55  # only by chance we differentiate the 2 samples
 
 
 @pytest.mark.slow
-def test_diff_distributions_flexible_mlp():
+def test_diff_distributions_flexible_mlp(set_seed):
 
     ndim = 10
     nsamples = 1024
@@ -137,7 +137,7 @@ def test_diff_distributions_flexible_mlp():
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = nn_c2st(X, Y, seed=42)
+    obs_c2st = nn_c2st(X, Y, seed=set_seed)
 
     assert obs_c2st != None
     assert 0.95 < obs_c2st[0]
@@ -150,7 +150,7 @@ def test_diff_distributions_flexible_mlp():
         "solver": "adam",
     }
 
-    obs2_c2st = c2st(X, Y, seed=42, clf_class=clf_class, clf_kwargs=clf_kwargs)
+    obs2_c2st = c2st(X, Y, seed=set_seed, clf_class=clf_class, clf_kwargs=clf_kwargs)
 
     assert obs2_c2st != None
     assert 0.95 < obs2_c2st[0]  # only by chance we differentiate the 2 samples
@@ -158,7 +158,7 @@ def test_diff_distributions_flexible_mlp():
 
 
 @pytest.mark.slow
-def test_onesigma_apart_distributions_mlp():
+def test_onesigma_apart_distributions_mlp(set_seed):
 
     ndim = 10
     nsamples = 1024
@@ -169,7 +169,7 @@ def test_onesigma_apart_distributions_mlp():
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = nn_c2st(X, Y)
+    obs_c2st = nn_c2st(X, Y, seed=set_seed))
 
     assert obs_c2st != None
     print(obs_c2st)

From 785f540edd7c9a3687ef4d83cc9c70eabf417730 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Wed, 26 Jan 2022 16:14:31 +0100
Subject: [PATCH 15/19] removing double bracket

---
 tests/metrics_test.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/metrics_test.py b/tests/metrics_test.py
index d003d5953..a7fbf39b2 100644
--- a/tests/metrics_test.py
+++ b/tests/metrics_test.py
@@ -99,7 +99,7 @@ def test_onesigma_apart_distributions(set_seed):
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = c2st(X, Y, seed=set_seed))
+    obs_c2st = c2st(X, Y, seed=set_seed)
 
     assert obs_c2st != None
     print(obs_c2st)
@@ -119,7 +119,7 @@ def test_same_distributions_mlp(set_seed):
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
 
-    obs_c2st = nn_c2st(X, Y, seed=set_seed))
+    obs_c2st = nn_c2st(X, Y, seed=set_seed)
 
     assert obs_c2st != None
     assert 0.45 < obs_c2st[0] < 0.55  # only by chance we differentiate the 2 samples
@@ -169,7 +169,7 @@ def test_onesigma_apart_distributions_mlp(set_seed):
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = nn_c2st(X, Y, seed=set_seed))
+    obs_c2st = nn_c2st(X, Y, seed=set_seed)
 
     assert obs_c2st != None
     print(obs_c2st)

From 5fd41c5082ee116943864d52257b860ab3146e7a Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Thu, 27 Jan 2022 10:43:38 +0100
Subject: [PATCH 16/19] refactored c2st interface to support quick change of
 classifier

- 2 functions available: c2st and c2st_scores
- use early stopping for MLP implementation
- c2st is the easier interface
  + allows changing the classifier using a string argument
  + returns the mean score from cross validation
- c2st_scores is more versatile
  + can consume sklearn compatible classifier class
  + returns all scores from the cross validation
---
 sbi/utils/metrics.py  | 119 ++++++++++++++++++++++++++++++++++++++----
 tests/metrics_test.py |  84 ++++++++++-------------------
 2 files changed, 137 insertions(+), 66 deletions(-)

diff --git a/sbi/utils/metrics.py b/sbi/utils/metrics.py
index 802a4385a..e8e185fe2 100644
--- a/sbi/utils/metrics.py
+++ b/sbi/utils/metrics.py
@@ -7,12 +7,104 @@
 import torch
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import KFold, cross_val_score
+from sklearn.neural_network import MLPClassifier
 from torch import Tensor
 
-from sbi.utils import tensor2numpy
+# from sbi.utils import tensor2numpy
 
 
 def c2st(
+    X: Tensor,
+    Y: Tensor,
+    seed: int = 1,
+    n_folds: int = 5,
+    scoring: str = "accuracy",
+    z_score: bool = True,
+    noise_scale: Optional[float] = None,
+    verbosity: int = 0,
+    classifier: str = "rf",
+) -> Tensor:
+    """
+    Return accuracy of classifier trained to distinguish samples from supposedly
+    two distributions <X> and <Y>. For details on the method, see [1,2].
+    If the returned accuracy is 0.5, <X> and <Y> are considered to be from the
+    same generating PDF, i.e. they can not be differentiated.
+    If the returned accuracy is around 1., <X> and <Y> are considered to be from
+    two different generating PDFs.
+
+    Training of the classifier with N-fold cross-validation [3] using sklearn.
+    By default, a `RandomForestClassifier` by from `sklearn.ensemble` is used
+    (<classifier> = 'rf'). Alternatively, a multi-layer perceptron is available
+    (<classifier> = 'mlp'). For a small study on the pros and cons for this
+    choice see [4].
+    If you need a more flexible interface which is able to take a sklearn
+    compatible classifier, see the `c2st_` method in this module.
+
+    Args:
+        X: Samples from one distribution.
+        Y: Samples from another distribution.
+        seed: Seed for the sklearn classifier and the KFold cross-validation
+        n_folds: Number of folds to use
+        z_score: Z-scoring using X, i.e. mean and std deviation of X is used to normalize Y using (Y - mean)/std
+        noise_scale: If passed, will add Gaussian noise with std noise_scale to samples of X and of Y
+        verbosity: control the verbosity of sklearn.model_selection.cross_val_score
+        classifier: classification architecture to use, possible values: 'rf' or 'mlp'
+
+    Return:
+        torch.tensor containing the mean accuracy score over the test sets
+        from cross-validation
+
+    Example:
+    ``` py
+    > c2st(X,Y)
+    [0.51904464] #X and Y likely come from the same PDF or ensemble
+    > c2st(P,Q)
+    [0.998456] #P and Q likely come from two different PDFs or ensembles
+    ```
+
+    References:
+        [1]: http://arxiv.org/abs/1610.06545
+        [2]: https://www.osti.gov/biblio/826696/
+        [3]: https://scikit-learn.org/stable/modules/cross_validation.html
+        [4]: https://github.com/psteinb/c2st/
+    """
+
+    clf_class = RandomForestClassifier
+    clf_kwargs = {}
+    if "mlp" in classifier.lower():
+        ndim = X.shape[-1]
+        clf_class = MLPClassifier
+        clf_kwargs = {
+            "activation": "relu",
+            "hidden_layer_sizes": (10 * ndim, 10 * ndim),
+            "max_iter": 1000,
+            "solver": "adam",
+            "early_stopping": True,
+            "n_iter_no_change": 50,
+        }
+    elif "rf" in classifier.lower():
+        # keep this section for later use
+        pass
+
+    scores_ = c2st_scores(
+        X,
+        Y,
+        seed=seed,
+        n_folds=n_folds,
+        scoring=scoring,
+        z_score=z_score,
+        noise_scale=noise_scale,
+        verbosity=verbosity,
+        clf_class=clf_class,
+        clf_kwargs=clf_kwargs,
+    )
+
+    scores = np.asarray(np.mean(scores_)).astype(np.float32)
+    value = torch.from_numpy(np.atleast_1d(scores))
+    return value
+
+
+def c2st_scores(
     X: Tensor,
     Y: Tensor,
     seed: int = 1,
@@ -32,9 +124,11 @@ def c2st(
     If the returned accuracy is around 1., <X> and <Y> are considered to be from
     two different generating PDFs.
 
-    Training of the classifier with N-fold cross-validation [3] using sklearn.
-    By default, a `RandomForestClassifier` by from `sklearn.ensemble` is used.
-    This can be changed using <clf_class>.
+    This function performs training of the classifier with N-fold cross-validation [3] using sklearn.
+    By default, a `RandomForestClassifier` by from `sklearn.ensemble` is used which
+    is recommended based on the study performed in [4].
+    This can be changed using <clf_class>. This class is used in the following
+    fashion:
 
     ``` py
     clf = clf_class(random_state=seed, **clf_kwargs)
@@ -44,6 +138,8 @@ def c2st(
     )
     ```
     Further configuration of the classifier can be performed using <clf_kwargs>.
+    If you like to provide a custom class for training, it has to satisfy the
+    internal requirements of `sklearn.model_selection.cross_val_score`.
 
     Args:
         X: Samples from one distribution.
@@ -57,20 +153,24 @@ def c2st(
         clf_kwargs: key-value arguments dictionary to the class specified by clf_class, e.g. sklearn.ensemble.RandomForestClassifier
 
     Return:
-        torch.tensor offering the mean accuracy score over the test sets
-        from cross-validation
+        np.ndarray containing the calculated <scoring> scores over the test set
+        folds from cross-validation
 
     Example:
     ``` py
-    > c2st(X,Y)
-    [0.51904464]
+    > c2st_scores(X,Y)
+    [0.51904464,0.5309201,0.4959452,0.5487709,0.50682926]
     #X and Y likely come from the same PDF or ensemble
+    > c2st_scores(P,Q)
+    [0.998456,0.9982912,0.9980476,0.9980488,0.99805826]
+    #P and Q likely come from two different PDFs or ensembles
     ```
 
     References:
         [1]: http://arxiv.org/abs/1610.06545
         [2]: https://www.osti.gov/biblio/826696/
         [3]: https://scikit-learn.org/stable/modules/cross_validation.html
+        [4]: https://github.com/psteinb/c2st/
     """
     if z_score:
         X_mean = torch.mean(X, axis=0)
@@ -97,8 +197,7 @@ def c2st(
         clf, data, target, cv=shuffle, scoring=scoring, verbose=verbosity
     )
 
-    scores = np.asarray(np.mean(scores)).astype(np.float32)
-    return torch.from_numpy(np.atleast_1d(scores))
+    return scores
 
 
 def unbiased_mmd_squared(x, y):
diff --git a/tests/metrics_test.py b/tests/metrics_test.py
index a7fbf39b2..cbb587674 100644
--- a/tests/metrics_test.py
+++ b/tests/metrics_test.py
@@ -6,52 +6,15 @@
 import numpy as np
 import pytest
 import torch
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import KFold, cross_val_score
 from sklearn.neural_network import MLPClassifier
-from torch import Tensor
 from torch.distributions import MultivariateNormal as tmvn
 
-from sbi.utils.metrics import c2st
+from sbi.utils.metrics import c2st, c2st_scores
 
 ## c2st related:
 ## for a study about c2st see https://github.com/psteinb/c2st/
 
 
-def nn_c2st(
-    X: np.ndarray,
-    Y: np.ndarray,
-    seed: int = 1,
-    n_folds: int = 5,
-    scoring: str = "accuracy",
-    z_score: bool = True,
-    noise_scale: Optional[float] = None,
-    verbosity: int = 0,
-) -> np.ndarray:
-
-    ndim = X.shape[1]
-    clf_class = MLPClassifier
-    clf_kwargs = {
-        "activation": "relu",
-        "hidden_layer_sizes": (10 * ndim, 10 * ndim),
-        "max_iter": 1000,
-        "solver": "adam",
-    }
-
-    return c2st(
-        X,
-        Y,
-        seed,
-        n_folds,
-        scoring,
-        z_score,
-        noise_scale,
-        verbosity,
-        clf_class,
-        clf_kwargs,
-    )
-
-
 def test_same_distributions(set_seed):
 
     ndim = 10
@@ -119,7 +82,7 @@ def test_same_distributions_mlp(set_seed):
     X = xnormal.sample((nsamples,))
     Y = xnormal.sample((nsamples,))
 
-    obs_c2st = nn_c2st(X, Y, seed=set_seed)
+    obs_c2st = c2st(X, Y, seed=set_seed, classifier="mlp")
 
     assert obs_c2st != None
     assert 0.45 < obs_c2st[0] < 0.55  # only by chance we differentiate the 2 samples
@@ -137,42 +100,51 @@ def test_diff_distributions_flexible_mlp(set_seed):
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = nn_c2st(X, Y, seed=set_seed)
+    obs_c2st = c2st(X, Y, seed=set_seed, classifier="rf")
 
     assert obs_c2st != None
     assert 0.95 < obs_c2st[0]
 
-    clf_class = MLPClassifier
-    clf_kwargs = {
-        "activation": "relu",
-        "hidden_layer_sizes": (10 * X.shape[1], 10 * X.shape[1]),
-        "max_iter": 1000,
-        "solver": "adam",
-    }
-
-    obs2_c2st = c2st(X, Y, seed=set_seed, clf_class=clf_class, clf_kwargs=clf_kwargs)
+    obs2_c2st = c2st(X, Y, seed=set_seed, classifier="mlp")
 
     assert obs2_c2st != None
     assert 0.95 < obs2_c2st[0]  # only by chance we differentiate the 2 samples
-    assert np.allclose(obs2_c2st, obs_c2st)
+    assert np.allclose(obs2_c2st, obs_c2st, atol=0.01)
 
 
-@pytest.mark.slow
-def test_onesigma_apart_distributions_mlp(set_seed):
+def test_c2st_scores(set_seed):
 
     ndim = 10
     nsamples = 1024
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
-    ynormal = tmvn(loc=torch.ones(ndim), covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(loc=20 * torch.ones(ndim), covariance_matrix=torch.eye(ndim))
 
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = nn_c2st(X, Y, seed=set_seed)
+    obs_c2st = c2st_scores(X, Y, seed=set_seed)
 
-    assert obs_c2st != None
     print(obs_c2st)
     assert (
-        0.8 < obs_c2st[0]
+        0.9 < obs_c2st.mean()
+    )  # distributions do not overlap, classifiers label with high accuracy
+
+    clf_class = MLPClassifier
+    clf_kwargs = {
+        "activation": "relu",
+        "hidden_layer_sizes": (8 * X.shape[1], X.shape[1]),
+        "max_iter": 100,
+        "solver": "adam",
+        "early_stopping": True,
+        "n_iter_no_change": 20,
+    }
+
+    obs2_c2st = c2st_scores(
+        X, Y, seed=set_seed, clf_class=clf_class, clf_kwargs=clf_kwargs
+    )
+
+    assert (
+        0.9 < obs2_c2st.mean()
     )  # distributions do not overlap, classifiers label with high accuracy
+    assert np.allclose(obs2_c2st, obs_c2st, atol=0.05)

From cd68e87807ca26aaab661a9baff64820a47289da Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Thu, 27 Jan 2022 18:01:05 +0100
Subject: [PATCH 17/19] remove unneeded import

---
 sbi/utils/metrics.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sbi/utils/metrics.py b/sbi/utils/metrics.py
index e8e185fe2..de6639586 100644
--- a/sbi/utils/metrics.py
+++ b/sbi/utils/metrics.py
@@ -10,8 +10,6 @@
 from sklearn.neural_network import MLPClassifier
 from torch import Tensor
 
-# from sbi.utils import tensor2numpy
-
 
 def c2st(
     X: Tensor,

From d456162b4035226df6dfdae07b848ec3d51362c9 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Fri, 28 Jan 2022 11:11:30 +0100
Subject: [PATCH 18/19] several improvements:

- refactored unit tests into parametrized tests
- all tests below 5s runtime, one pytest call takes below 45s on a 4c laptop
- refactored c2st to offer simplistic interface
- refactored c2st_scores to offer more control
- renamed parameter scoring to metric to prevent confusion with z_score
- removed elif statement
- added type hints
---
 sbi/utils/metrics.py  |  50 ++++++-----
 tests/metrics_test.py | 189 ++++++++++++++++++++++--------------------
 2 files changed, 126 insertions(+), 113 deletions(-)

diff --git a/sbi/utils/metrics.py b/sbi/utils/metrics.py
index de6639586..08919f196 100644
--- a/sbi/utils/metrics.py
+++ b/sbi/utils/metrics.py
@@ -1,7 +1,7 @@
 # This file is part of sbi, a toolkit for simulation-based inference. sbi is licensed
 # under the Affero General Public License v3, see <https://www.gnu.org/licenses/>.
 
-from typing import Optional
+from typing import Any, Dict, Optional
 
 import numpy as np
 import torch
@@ -16,10 +16,7 @@ def c2st(
     Y: Tensor,
     seed: int = 1,
     n_folds: int = 5,
-    scoring: str = "accuracy",
-    z_score: bool = True,
-    noise_scale: Optional[float] = None,
-    verbosity: int = 0,
+    metric: str = "accuracy",
     classifier: str = "rf",
 ) -> Tensor:
     """
@@ -34,18 +31,20 @@ def c2st(
     By default, a `RandomForestClassifier` by from `sklearn.ensemble` is used
     (<classifier> = 'rf'). Alternatively, a multi-layer perceptron is available
     (<classifier> = 'mlp'). For a small study on the pros and cons for this
-    choice see [4].
+    choice see [4]. Before both samples are ingested, they are normalized (z scored)
+    under the assumption that each dimension in X follows a normal distribution, i.e.
+    the mean(X) is subtracted from X and this difference is divided by std(X)
+    for every dimension.
+
     If you need a more flexible interface which is able to take a sklearn
-    compatible classifier, see the `c2st_` method in this module.
+    compatible classifier and more, see the `c2st_` method in this module.
 
     Args:
         X: Samples from one distribution.
         Y: Samples from another distribution.
         seed: Seed for the sklearn classifier and the KFold cross-validation
         n_folds: Number of folds to use
-        z_score: Z-scoring using X, i.e. mean and std deviation of X is used to normalize Y using (Y - mean)/std
-        noise_scale: If passed, will add Gaussian noise with std noise_scale to samples of X and of Y
-        verbosity: control the verbosity of sklearn.model_selection.cross_val_score
+        metric: sklearn compliant metric to use for the scoring parameter of cross_val_score
         classifier: classification architecture to use, possible values: 'rf' or 'mlp'
 
     Return:
@@ -67,8 +66,10 @@ def c2st(
         [4]: https://github.com/psteinb/c2st/
     """
 
+    # the default configuration
     clf_class = RandomForestClassifier
     clf_kwargs = {}
+
     if "mlp" in classifier.lower():
         ndim = X.shape[-1]
         clf_class = MLPClassifier
@@ -80,16 +81,17 @@ def c2st(
             "early_stopping": True,
             "n_iter_no_change": 50,
         }
-    elif "rf" in classifier.lower():
-        # keep this section for later use
-        pass
+
+    noise_scale = None
+    z_score = True
+    verbosity = 0
 
     scores_ = c2st_scores(
         X,
         Y,
         seed=seed,
         n_folds=n_folds,
-        scoring=scoring,
+        metric=metric,
         z_score=z_score,
         noise_scale=noise_scale,
         verbosity=verbosity,
@@ -97,6 +99,7 @@ def c2st(
         clf_kwargs=clf_kwargs,
     )
 
+    # TODO: unclear why np.asarray needs to be used here
     scores = np.asarray(np.mean(scores_)).astype(np.float32)
     value = torch.from_numpy(np.atleast_1d(scores))
     return value
@@ -107,12 +110,12 @@ def c2st_scores(
     Y: Tensor,
     seed: int = 1,
     n_folds: int = 5,
-    scoring: str = "accuracy",
+    metric: str = "accuracy",
     z_score: bool = True,
     noise_scale: Optional[float] = None,
     verbosity: int = 0,
-    clf_class=RandomForestClassifier,
-    clf_kwargs={},
+    clf_class: Any = RandomForestClassifier,
+    clf_kwargs: Dict[str, Any] = {},
 ) -> Tensor:
     """
     Return accuracy of classifier trained to distinguish samples from supposedly
@@ -142,16 +145,17 @@ def c2st_scores(
     Args:
         X: Samples from one distribution.
         Y: Samples from another distribution.
-        seed: Seed for the sklearn classifier and the KFold cross-validation
-        n_folds: Number of folds to use
-        z_score: Z-scoring using X, i.e. mean and std deviation of X is used to normalize Y using (Y - mean)/std
-        noise_scale: If passed, will add Gaussian noise with std noise_scale to samples of X and of Y
+        seed: Seed for the sklearn classifier and the KFold cross validation
+        n_folds: Number of folds to use for cross validation
+        metric: sklearn compliant metric to use for the scoring parameter of cross_val_score
+        z_score: Z-scoring using X, i.e. mean and std deviation of X is used to normalize Y, i.e. Y=(Y - mean)/std
+        noise_scale: If passed, will add Gaussian noise with standard deviation <noise_scale> to samples of X and of Y
         verbosity: control the verbosity of sklearn.model_selection.cross_val_score
         clf_class: a scikit-learn classifier class
         clf_kwargs: key-value arguments dictionary to the class specified by clf_class, e.g. sklearn.ensemble.RandomForestClassifier
 
     Return:
-        np.ndarray containing the calculated <scoring> scores over the test set
+        np.ndarray containing the calculated <metric> scores over the test set
         folds from cross-validation
 
     Example:
@@ -192,7 +196,7 @@ def c2st_scores(
 
     shuffle = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
     scores = cross_val_score(
-        clf, data, target, cv=shuffle, scoring=scoring, verbose=verbosity
+        clf, data, target, cv=shuffle, scoring=metric, verbose=verbosity
     )
 
     return scores
diff --git a/tests/metrics_test.py b/tests/metrics_test.py
index cbb587674..2e1bd310f 100644
--- a/tests/metrics_test.py
+++ b/tests/metrics_test.py
@@ -15,120 +15,130 @@
 ## for a study about c2st see https://github.com/psteinb/c2st/
 
 
-def test_same_distributions(set_seed):
+@pytest.mark.parametrize(
+    "dist_sigma, c2st_lowerbound, c2st_upperbound,",
+    [
+        (
+            0.0,
+            0.45,
+            0.55,
+        ),  # both samples are identical, the mean accuracy should be around 0.5
+        (
+            1.0,
+            0.85,
+            1.0,
+        ),  # both samples are rather close, the mean accuracy should be larger than 0.5 and be lower than 1.
+        (
+            20.0,
+            0.98,
+            1.0,
+        ),  # both samples are very far apart, the mean accuracy should close to 1.
+    ],
+)
+def test_c2st_with_different_distributions(
+    dist_sigma, c2st_lowerbound, c2st_upperbound, set_seed
+):
 
     ndim = 10
     nsamples = 1024
 
-    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
-
-    X = xnormal.sample((nsamples,))
-    Y = xnormal.sample((nsamples,))
-
-    obs_c2st = c2st(X, Y, seed=set_seed)
-
-    assert obs_c2st != None
-    assert 0.45 < obs_c2st[0] < 0.55  # only by chance we differentiate the 2 samples
-
-
-def test_diff_distributions(set_seed):
-
-    ndim = 10
-    nsamples = 1024
-
-    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
-    ynormal = tmvn(loc=20.0 * torch.ones(ndim), covariance_matrix=torch.eye(ndim))
-
-    X = xnormal.sample((nsamples,))
-    Y = ynormal.sample((nsamples,))
-
-    obs_c2st = c2st(X, Y, seed=set_seed)
-
-    assert obs_c2st != None
-    assert (
-        0.98 < obs_c2st[0]
-    )  # distributions do not overlap, classifiers label with high accuracy
-    print(obs_c2st)
-
-
-def test_onesigma_apart_distributions(set_seed):
-
-    ndim = 10
-    nsamples = 1024
-
-    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
-    ynormal = tmvn(loc=torch.ones(ndim), covariance_matrix=torch.eye(ndim))
+    refdist = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
+    otherdist = tmvn(
+        loc=dist_sigma + torch.zeros(ndim), covariance_matrix=torch.eye(ndim)
+    )
 
-    X = xnormal.sample((nsamples,))
-    Y = ynormal.sample((nsamples,))
+    X = refdist.sample((nsamples,))
+    Y = otherdist.sample((nsamples,))
 
-    obs_c2st = c2st(X, Y, seed=set_seed)
+    obs_c2st = c2st(X, Y)
 
-    assert obs_c2st != None
-    print(obs_c2st)
-    assert (
-        0.85 < obs_c2st[0]
-    )  # distributions do not overlap, classifiers label with high accuracy
+    assert len(obs_c2st) > 0
+    assert c2st_lowerbound < obs_c2st[0]
+    assert obs_c2st[0] <= c2st_upperbound
 
 
 @pytest.mark.slow
-def test_same_distributions_mlp(set_seed):
+@pytest.mark.parametrize(
+    "dist_sigma, c2st_lowerbound, c2st_upperbound,",
+    [
+        (
+            0.0,
+            0.45,
+            0.55,
+        ),  # both samples are identical, the mean accuracy should be around 0.5
+        (
+            1.0,
+            0.85,
+            1.0,
+        ),  # both samples are rather close, the mean accuracy should be larger than 0.5 and be lower than 1.
+        (
+            20.0,
+            0.98,
+            1.0,
+        ),  # both samples are very far apart, the mean accuracy should close to 1.
+    ],
+)
+def test_c2st_with_different_distributions_mlp(
+    dist_sigma, c2st_lowerbound, c2st_upperbound, set_seed
+):
 
     ndim = 10
     nsamples = 1024
 
-    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
+    refdist = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
+    otherdist = tmvn(
+        loc=dist_sigma + torch.zeros(ndim), covariance_matrix=torch.eye(ndim)
+    )
 
-    X = xnormal.sample((nsamples,))
-    Y = xnormal.sample((nsamples,))
+    X = refdist.sample((nsamples,))
+    Y = otherdist.sample((nsamples,))
 
-    obs_c2st = c2st(X, Y, seed=set_seed, classifier="mlp")
+    obs_c2st = c2st(X, Y, classifier="mlp")
 
-    assert obs_c2st != None
-    assert 0.45 < obs_c2st[0] < 0.55  # only by chance we differentiate the 2 samples
+    assert len(obs_c2st) > 0
+    assert c2st_lowerbound < obs_c2st[0]
+    assert obs_c2st[0] <= c2st_upperbound
 
 
 @pytest.mark.slow
-def test_diff_distributions_flexible_mlp(set_seed):
+@pytest.mark.parametrize(
+    "dist_sigma, c2st_lowerbound, c2st_upperbound,",
+    [
+        (
+            0.0,
+            0.45,
+            0.55,
+        ),  # both samples are identical, the mean accuracy should be around 0.5
+        (
+            1.0,
+            0.85,
+            1.0,
+        ),  # both samples are rather close, the mean accuracy should be larger than 0.5 and be lower than 1.
+        (
+            20.0,
+            0.98,
+            1.0,
+        ),  # both samples are very far apart, the mean accuracy should close to 1.
+    ],
+)
+def test_c2st_scores(dist_sigma, c2st_lowerbound, c2st_upperbound, set_seed):
 
     ndim = 10
     nsamples = 1024
 
     xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
-    ynormal = tmvn(loc=20.0 * torch.ones(ndim), covariance_matrix=torch.eye(ndim))
-
-    X = xnormal.sample((nsamples,))
-    Y = ynormal.sample((nsamples,))
-
-    obs_c2st = c2st(X, Y, seed=set_seed, classifier="rf")
-
-    assert obs_c2st != None
-    assert 0.95 < obs_c2st[0]
-
-    obs2_c2st = c2st(X, Y, seed=set_seed, classifier="mlp")
-
-    assert obs2_c2st != None
-    assert 0.95 < obs2_c2st[0]  # only by chance we differentiate the 2 samples
-    assert np.allclose(obs2_c2st, obs_c2st, atol=0.01)
-
-
-def test_c2st_scores(set_seed):
-
-    ndim = 10
-    nsamples = 1024
-
-    xnormal = tmvn(loc=torch.zeros(ndim), covariance_matrix=torch.eye(ndim))
-    ynormal = tmvn(loc=20 * torch.ones(ndim), covariance_matrix=torch.eye(ndim))
+    ynormal = tmvn(
+        loc=dist_sigma + torch.zeros(ndim), covariance_matrix=torch.eye(ndim)
+    )
 
     X = xnormal.sample((nsamples,))
     Y = ynormal.sample((nsamples,))
 
-    obs_c2st = c2st_scores(X, Y, seed=set_seed)
+    obs_c2st = c2st_scores(X, Y)
 
-    print(obs_c2st)
-    assert (
-        0.9 < obs_c2st.mean()
-    )  # distributions do not overlap, classifiers label with high accuracy
+    assert hasattr(obs_c2st, "mean")
+    assert c2st_lowerbound < obs_c2st.mean()
+    assert obs_c2st.mean() <= c2st_upperbound
 
     clf_class = MLPClassifier
     clf_kwargs = {
@@ -140,11 +150,10 @@ def test_c2st_scores(set_seed):
         "n_iter_no_change": 20,
     }
 
-    obs2_c2st = c2st_scores(
-        X, Y, seed=set_seed, clf_class=clf_class, clf_kwargs=clf_kwargs
-    )
+    obs2_c2st = c2st_scores(X, Y, clf_class=clf_class, clf_kwargs=clf_kwargs)
+
+    assert hasattr(obs2_c2st, "mean")
+    assert c2st_lowerbound < obs2_c2st.mean()
+    assert obs2_c2st.mean() <= c2st_upperbound
 
-    assert (
-        0.9 < obs2_c2st.mean()
-    )  # distributions do not overlap, classifiers label with high accuracy
     assert np.allclose(obs2_c2st, obs_c2st, atol=0.05)

From 5456e092d5d82c198ea668f97a14f7f314cf20e6 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <p.steinbach@hzdr.de>
Date: Fri, 28 Jan 2022 11:19:24 +0100
Subject: [PATCH 19/19] refactored reoccurring test case configs out

---
 tests/metrics_test.py | 75 ++++++++++++++-----------------------------
 1 file changed, 24 insertions(+), 51 deletions(-)

diff --git a/tests/metrics_test.py b/tests/metrics_test.py
index 2e1bd310f..f7fc5e21a 100644
--- a/tests/metrics_test.py
+++ b/tests/metrics_test.py
@@ -14,26 +14,31 @@
 ## c2st related:
 ## for a study about c2st see https://github.com/psteinb/c2st/
 
+TESTCASECONFIG = [
+    (
+        # both samples are identical, the mean accuracy should be around 0.5
+        0.0,  # dist_sigma
+        0.45,  # c2st_lowerbound
+        0.55,  # c2st_upperbound
+    ),
+    (
+        # both samples are rather close, the mean accuracy should be larger than 0.5 and be lower than 1.
+        1.0,
+        0.85,
+        1.0,
+    ),
+    (
+        # both samples are very far apart, the mean accuracy should close to 1.
+        20.0,
+        0.98,
+        1.0,
+    ),
+]
+
 
 @pytest.mark.parametrize(
     "dist_sigma, c2st_lowerbound, c2st_upperbound,",
-    [
-        (
-            0.0,
-            0.45,
-            0.55,
-        ),  # both samples are identical, the mean accuracy should be around 0.5
-        (
-            1.0,
-            0.85,
-            1.0,
-        ),  # both samples are rather close, the mean accuracy should be larger than 0.5 and be lower than 1.
-        (
-            20.0,
-            0.98,
-            1.0,
-        ),  # both samples are very far apart, the mean accuracy should close to 1.
-    ],
+    TESTCASECONFIG,
 )
 def test_c2st_with_different_distributions(
     dist_sigma, c2st_lowerbound, c2st_upperbound, set_seed
@@ -60,23 +65,7 @@ def test_c2st_with_different_distributions(
 @pytest.mark.slow
 @pytest.mark.parametrize(
     "dist_sigma, c2st_lowerbound, c2st_upperbound,",
-    [
-        (
-            0.0,
-            0.45,
-            0.55,
-        ),  # both samples are identical, the mean accuracy should be around 0.5
-        (
-            1.0,
-            0.85,
-            1.0,
-        ),  # both samples are rather close, the mean accuracy should be larger than 0.5 and be lower than 1.
-        (
-            20.0,
-            0.98,
-            1.0,
-        ),  # both samples are very far apart, the mean accuracy should close to 1.
-    ],
+    TESTCASECONFIG,
 )
 def test_c2st_with_different_distributions_mlp(
     dist_sigma, c2st_lowerbound, c2st_upperbound, set_seed
@@ -103,23 +92,7 @@ def test_c2st_with_different_distributions_mlp(
 @pytest.mark.slow
 @pytest.mark.parametrize(
     "dist_sigma, c2st_lowerbound, c2st_upperbound,",
-    [
-        (
-            0.0,
-            0.45,
-            0.55,
-        ),  # both samples are identical, the mean accuracy should be around 0.5
-        (
-            1.0,
-            0.85,
-            1.0,
-        ),  # both samples are rather close, the mean accuracy should be larger than 0.5 and be lower than 1.
-        (
-            20.0,
-            0.98,
-            1.0,
-        ),  # both samples are very far apart, the mean accuracy should close to 1.
-    ],
+    TESTCASECONFIG,
 )
 def test_c2st_scores(dist_sigma, c2st_lowerbound, c2st_upperbound, set_seed):