Skip to content

Commit

Permalink
Merge pull request #114 from el-hult/nanhunt
Browse files Browse the repository at this point in the history
Improve calibration
  • Loading branch information
danieleongari authored Jul 9, 2024
2 parents ffa7227 + 6ae1d9e commit 0dc01e2
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 37 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,6 @@ paper/*.txt

# Jupiter notebooks
.ipynb_checkpoints

# pyenv environment
.python-version
78 changes: 41 additions & 37 deletions forestci/calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
random forest is small.
"""
import functools
import itertools
import warnings
import numpy as np
from scipy.optimize import minimize
from scipy.signal import fftconvolve
Expand All @@ -32,7 +31,7 @@
path='forestci')


def gfit(X, sigma, p=5, nbin=200, unif_fraction=0.1):
def gfit(X, sigma, p=2, nbin=1000, unif_fraction=0.1):
"""
Fit empirical Bayes prior in the hierarchical model [Efron2014]_.
Expand All @@ -47,62 +46,66 @@ def gfit(X, sigma, p=5, nbin=200, unif_fraction=0.1):
sigma: float
Noise estimate on X.
p: int
Number of parameters used to fit G. Default: 5
Number of parameters used to fit G.
nbin: int
Number of bins used for discrete approximation.
Default: 200
unif_fraction: float
Fraction of G modeled as "slab". Default: 0.1
Fraction of G modeled as "slab".
Returns
-------
An array of the posterior density estimate g.
"""
min_x = min(min(X) - 2 * np.std(X, ddof=1), 0)
min_x = max(min(X) - 2 * np.std(X, ddof=1), 0)
max_x = max(max(X) + 2 * np.std(X, ddof=1),
np.std(X, ddof=1))
xvals = np.linspace(min_x, max_x, nbin)
binw = (max_x - min_x) / (nbin - 1)

zero_idx = max(np.where(xvals <= 0)[0])
noise_kernel = norm().pdf(xvals / sigma) * binw / sigma
noise_kernel = norm(scale=sigma,loc=xvals.mean()).pdf(xvals)
noise_kernel /= noise_kernel.sum()

if zero_idx > 0:
noise_rotate = noise_kernel[list(np.arange(zero_idx, len(xvals))) +
list(np.arange(0, zero_idx))]
else:
noise_rotate = noise_kernel
mask = xvals > 0
assert sum(mask) > 0
g_eta_slab = mask / sum(mask)

XX = np.zeros((p, len(xvals)), dtype="float")
for ind, exp in enumerate(range(1, p+1)):
mask = np.ones_like(xvals)
mask[np.where(xvals <= 0)[0]] = 0
XX[ind, :] = pow(xvals, exp) * mask
XX = XX.T
XX = np.column_stack([ pow(xvals, exp) for exp in range(1, p+1)])
XX /= np.sum(XX,axis = 0, keepdims=True) # normalize each feature column for better numerical stability

def neg_loglik(eta):
mask = np.ones_like(xvals)
mask[np.where(xvals <= 0)[0]] = 0
g_eta_raw = np.exp(np.dot(XX, eta)) * mask
with np.errstate(over='ignore'):
# if eta > 0 the exponential will likely get overflow. that is fine.
g_eta_raw = np.exp(np.dot(XX, eta)) * mask

if ((np.sum(g_eta_raw) == np.inf) |
(np.sum(g_eta_raw) <=
100 * np.finfo(np.double).tiny)):
return (1000 * (len(X) + sum(eta ** 2)))

assert sum(g_eta_raw) > 0, "Unexpected error"
assert np.isfinite(sum(g_eta_raw)), "Unexpected error"
g_eta_main = g_eta_raw / sum(g_eta_raw)
g_eta = ((1 - unif_fraction) * g_eta_main +
unif_fraction * mask / sum(mask))
f_eta = fftconvolve(g_eta, noise_rotate, mode='same')
g_eta = (
(1 - unif_fraction) * g_eta_main +
unif_fraction * g_eta_slab)
f_eta = fftconvolve(g_eta, noise_kernel, mode='same')
return np.sum(np.interp(X, xvals,
-np.log(np.maximum(f_eta, 0.0000001))))

eta_hat = minimize(neg_loglik,
list(itertools.repeat(-1, p))).x
res = minimize(
neg_loglik,
np.full(p, -1, dtype='float'),
tol=5e-5 # adjusted so that the MPG example in the docs passes
)
if not res.success:
warnings.warn("Fitting the empirical bayes prior failed with message %s." % res.message)
eta_hat = res.x
g_eta_raw = np.exp(np.dot(XX, eta_hat)) * mask
g_eta_main = g_eta_raw / sum(g_eta_raw)
g_eta = ((1 - unif_fraction) * g_eta_main +
unif_fraction * mask) / sum(mask)
g_eta = (
(1 - unif_fraction) * g_eta_main +
unif_fraction * g_eta_slab)

assert np.all(np.isfinite(g_eta)), "Fitting the empirical bayes prior failed."
return xvals, g_eta


Expand All @@ -114,8 +117,10 @@ def gbayes(x0, g_est, sigma):
----------
x0: ndarray
an observation
g_est: float
g_est: (ndarray,ndarray)
a prior density, as returned by gfit
g_est[0] is the x-positions
g_est[1] is the densities
sigma: int
noise estimate
Expand Down Expand Up @@ -147,18 +152,17 @@ def calibrateEB(variances, sigma2):
"""
if (sigma2 <= 0 or min(variances) == max(variances)):
return(np.maximum(variances, 0))

sigma = np.sqrt(sigma2)
eb_prior = gfit(variances, sigma)
# Set up a partial execution of the function
part = functools.partial(gbayes, g_est=eb_prior,
sigma=sigma)

if len(variances) >= 200:
# Interpolate to speed up computations:
calib_x = np.percentile(variances,
np.arange(0, 102, 2))
calib_y = list(map(part, calib_x))
calib_y = [gbayes(x,g_est=eb_prior,sigma=sigma) for x in calib_x]
calib_all = np.interp(variances, calib_x, calib_y)
else:
calib_all = list(map(part, variances))
calib_all = [gbayes(x,g_est=eb_prior,sigma=sigma) for x in variances]

return np.asarray(calib_all)

0 comments on commit 0dc01e2

Please sign in to comment.