Merge pull request #114 from el-hult/nanhunt

Improve calibration
scikit-learn-contrib · Jul 9, 2024 · 0dc01e2 · 0dc01e2
2 parents ffa7227 + 6ae1d9e
commit 0dc01e2
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 37 deletions.
diff --git a/.gitignore b/.gitignore
@@ -74,3 +74,6 @@ paper/*.txt
 
 # Jupiter notebooks
 .ipynb_checkpoints
+
+# pyenv environment
+.python-version
diff --git a/forestci/calibration.py b/forestci/calibration.py
@@ -5,8 +5,7 @@
 random forest is small.
 
 """
-import functools
-import itertools
+import warnings
 import numpy as np
 from scipy.optimize import minimize
 from scipy.signal import fftconvolve
@@ -32,7 +31,7 @@
           path='forestci')
 
 
-def gfit(X, sigma, p=5, nbin=200, unif_fraction=0.1):
+def gfit(X, sigma, p=2, nbin=1000, unif_fraction=0.1):
     """
     Fit empirical Bayes prior in the hierarchical model [Efron2014]_.
 
@@ -47,62 +46,66 @@ def gfit(X, sigma, p=5, nbin=200, unif_fraction=0.1):
     sigma: float
         Noise estimate on X.
     p: int
-        Number of parameters used to fit G. Default: 5
+        Number of parameters used to fit G.
     nbin: int
         Number of bins used for discrete approximation.
-        Default: 200
     unif_fraction: float
-        Fraction of G modeled as "slab". Default: 0.1
+        Fraction of G modeled as "slab".
 
     Returns
     -------
     An array of the posterior density estimate g.
     """
-    min_x = min(min(X) - 2 * np.std(X, ddof=1), 0)
+    min_x = max(min(X) - 2 * np.std(X, ddof=1), 0)
     max_x = max(max(X) + 2 * np.std(X, ddof=1),
                 np.std(X, ddof=1))
     xvals = np.linspace(min_x, max_x, nbin)
-    binw = (max_x - min_x) / (nbin - 1)
 
-    zero_idx = max(np.where(xvals <= 0)[0])
-    noise_kernel = norm().pdf(xvals / sigma) * binw / sigma
+    noise_kernel = norm(scale=sigma,loc=xvals.mean()).pdf(xvals)
+    noise_kernel /= noise_kernel.sum()
 
-    if zero_idx > 0:
-        noise_rotate = noise_kernel[list(np.arange(zero_idx, len(xvals))) +
-                                    list(np.arange(0, zero_idx))]
-    else:
-        noise_rotate = noise_kernel
+    mask = xvals > 0
+    assert sum(mask) > 0
+    g_eta_slab = mask / sum(mask)
 
-    XX = np.zeros((p, len(xvals)), dtype="float")
-    for ind, exp in enumerate(range(1, p+1)):
-        mask = np.ones_like(xvals)
-        mask[np.where(xvals <= 0)[0]] = 0
-        XX[ind, :] = pow(xvals, exp) * mask
-    XX = XX.T
+    XX = np.column_stack([ pow(xvals, exp) for exp in range(1, p+1)])
+    XX /= np.sum(XX,axis = 0, keepdims=True) # normalize each feature column for better numerical stability
 
     def neg_loglik(eta):
-        mask = np.ones_like(xvals)
-        mask[np.where(xvals <= 0)[0]] = 0
-        g_eta_raw = np.exp(np.dot(XX, eta)) * mask
+        with np.errstate(over='ignore'):
+            # if eta > 0 the exponential will likely get overflow. that is fine.
+            g_eta_raw = np.exp(np.dot(XX, eta)) * mask
+
         if ((np.sum(g_eta_raw) == np.inf) |
             (np.sum(g_eta_raw) <=
                 100 * np.finfo(np.double).tiny)):
                 return (1000 * (len(X) + sum(eta ** 2)))
 
+        assert sum(g_eta_raw) > 0, "Unexpected error"
+        assert np.isfinite(sum(g_eta_raw)), "Unexpected error"
         g_eta_main = g_eta_raw / sum(g_eta_raw)
-        g_eta = ((1 - unif_fraction) * g_eta_main +
-                 unif_fraction * mask / sum(mask))
-        f_eta = fftconvolve(g_eta, noise_rotate, mode='same')
+        g_eta = (
+        (1 - unif_fraction) * g_eta_main +
+             unif_fraction * g_eta_slab)
+        f_eta = fftconvolve(g_eta, noise_kernel, mode='same')
         return np.sum(np.interp(X, xvals,
                       -np.log(np.maximum(f_eta, 0.0000001))))
 
-    eta_hat = minimize(neg_loglik,
-                       list(itertools.repeat(-1, p))).x
+    res = minimize(
+        neg_loglik,
+        np.full(p, -1, dtype='float'),
+        tol=5e-5 # adjusted so that the MPG example in the docs passes
+    )
+    if not res.success:
+        warnings.warn("Fitting the empirical bayes prior failed with message %s." % res.message)
+    eta_hat = res.x
     g_eta_raw = np.exp(np.dot(XX, eta_hat)) * mask
     g_eta_main = g_eta_raw / sum(g_eta_raw)
-    g_eta = ((1 - unif_fraction) * g_eta_main +
-             unif_fraction * mask) / sum(mask)
+    g_eta = (
+        (1 - unif_fraction) * g_eta_main +
+             unif_fraction * g_eta_slab)
 
+    assert np.all(np.isfinite(g_eta)), "Fitting the empirical bayes prior failed."
     return xvals, g_eta
 
 
@@ -114,8 +117,10 @@ def gbayes(x0, g_est, sigma):
     ----------
     x0: ndarray
         an observation
-    g_est: float
+    g_est: (ndarray,ndarray)
         a prior density, as returned by gfit
+        g_est[0] is the x-positions
+        g_est[1] is the densities
     sigma: int
         noise estimate
 
@@ -147,18 +152,17 @@ def calibrateEB(variances, sigma2):
     """
     if (sigma2 <= 0 or min(variances) == max(variances)):
         return(np.maximum(variances, 0))
+
     sigma = np.sqrt(sigma2)
     eb_prior = gfit(variances, sigma)
-    # Set up a partial execution of the function
-    part = functools.partial(gbayes, g_est=eb_prior,
-                             sigma=sigma)
+
     if len(variances) >= 200:
         # Interpolate to speed up computations:
         calib_x = np.percentile(variances,
                                 np.arange(0, 102, 2))
-        calib_y = list(map(part, calib_x))
+        calib_y = [gbayes(x,g_est=eb_prior,sigma=sigma) for x in calib_x]
         calib_all = np.interp(variances, calib_x, calib_y)
     else:
-        calib_all = list(map(part, variances))
+        calib_all = [gbayes(x,g_est=eb_prior,sigma=sigma) for x in variances]
 
     return np.asarray(calib_all)