From 4ec5dc291a0326147a0b0541bfe64dfeb141baf6 Mon Sep 17 00:00:00 2001 From: Ludvig Hult Date: Wed, 19 Jun 2024 13:25:41 +0200 Subject: [PATCH 1/3] attemped fix of the EB computation --- .gitignore | 3 ++ forestci/calibration.py | 79 ++++++++++++++++++++++------------------- 2 files changed, 45 insertions(+), 37 deletions(-) diff --git a/.gitignore b/.gitignore index 1713642..87a1e71 100644 --- a/.gitignore +++ b/.gitignore @@ -74,3 +74,6 @@ paper/*.txt # Jupiter notebooks .ipynb_checkpoints + +# pyenv environment +.python-version \ No newline at end of file diff --git a/forestci/calibration.py b/forestci/calibration.py index 3a42f61..5da7ec6 100644 --- a/forestci/calibration.py +++ b/forestci/calibration.py @@ -5,8 +5,7 @@ random forest is small. """ -import functools -import itertools +import warnings import numpy as np from scipy.optimize import minimize from scipy.signal import fftconvolve @@ -32,7 +31,7 @@ path='forestci') -def gfit(X, sigma, p=5, nbin=200, unif_fraction=0.1): +def gfit(X, sigma, p=2, nbin=1000, unif_fraction=0.1): """ Fit empirical Bayes prior in the hierarchical model [Efron2014]_. @@ -47,62 +46,67 @@ def gfit(X, sigma, p=5, nbin=200, unif_fraction=0.1): sigma: float Noise estimate on X. p: int - Number of parameters used to fit G. Default: 5 + Number of parameters used to fit G. nbin: int Number of bins used for discrete approximation. - Default: 200 unif_fraction: float - Fraction of G modeled as "slab". Default: 0.1 + Fraction of G modeled as "slab". Returns ------- An array of the posterior density estimate g. """ - min_x = min(min(X) - 2 * np.std(X, ddof=1), 0) + min_x = max(min(X) - 2 * np.std(X, ddof=1), 0) max_x = max(max(X) + 2 * np.std(X, ddof=1), np.std(X, ddof=1)) xvals = np.linspace(min_x, max_x, nbin) - binw = (max_x - min_x) / (nbin - 1) - zero_idx = max(np.where(xvals <= 0)[0]) - noise_kernel = norm().pdf(xvals / sigma) * binw / sigma + noise_kernel = norm(scale=sigma,loc=xvals.mean()).pdf(xvals) + noise_kernel /= noise_kernel.sum() - if zero_idx > 0: - noise_rotate = noise_kernel[list(np.arange(zero_idx, len(xvals))) + - list(np.arange(0, zero_idx))] - else: - noise_rotate = noise_kernel + mask = xvals > 0 + assert sum(mask) > 0 + g_eta_slab = mask / sum(mask) - XX = np.zeros((p, len(xvals)), dtype="float") - for ind, exp in enumerate(range(1, p+1)): - mask = np.ones_like(xvals) - mask[np.where(xvals <= 0)[0]] = 0 - XX[ind, :] = pow(xvals, exp) * mask - XX = XX.T + XX = np.column_stack([ np.pow(xvals, exp) for exp in range(1, p+1)]) + XX /= np.sum(XX,axis = 0, keepdims=True) # normalize each feature column for better numerical stability def neg_loglik(eta): - mask = np.ones_like(xvals) - mask[np.where(xvals <= 0)[0]] = 0 - g_eta_raw = np.exp(np.dot(XX, eta)) * mask + with np.errstate(over='ignore'): + # if eta > 0 the exponential will likely get overflow. that is fine. + g_eta_raw = np.exp(np.dot(XX, eta)) * mask + if ((np.sum(g_eta_raw) == np.inf) | (np.sum(g_eta_raw) <= 100 * np.finfo(np.double).tiny)): return (1000 * (len(X) + sum(eta ** 2))) + assert sum(g_eta_raw) > 0, "Unexpected error" + assert np.isfinite(sum(g_eta_raw)), "Unexpected error" g_eta_main = g_eta_raw / sum(g_eta_raw) - g_eta = ((1 - unif_fraction) * g_eta_main + - unif_fraction * mask / sum(mask)) - f_eta = fftconvolve(g_eta, noise_rotate, mode='same') + g_eta = ( + (1 - unif_fraction) * g_eta_main + + unif_fraction * g_eta_slab) + f_eta = fftconvolve(g_eta, noise_kernel, mode='same') return np.sum(np.interp(X, xvals, -np.log(np.maximum(f_eta, 0.0000001)))) - eta_hat = minimize(neg_loglik, - list(itertools.repeat(-1, p))).x + res = minimize( + neg_loglik, + np.full(p, -1, dtype='float'), + tol=5e-5 # adjusted so that the MPG example in the docs passes + ) + if not res.success: + warnings.warn("Fitting the empirical bayes prior failed with message %s." % res.message) + eta_hat = res.x + print(eta_hat) g_eta_raw = np.exp(np.dot(XX, eta_hat)) * mask g_eta_main = g_eta_raw / sum(g_eta_raw) - g_eta = ((1 - unif_fraction) * g_eta_main + - unif_fraction * mask) / sum(mask) + g_eta = ( + (1 - unif_fraction) * g_eta_main + + unif_fraction * g_eta_slab) + assert np.all(np.isfinite(g_eta)), "Fitting the empirical bayes prior failed." return xvals, g_eta @@ -114,8 +118,10 @@ def gbayes(x0, g_est, sigma): ---------- x0: ndarray an observation - g_est: float + g_est: (ndarray,ndarray) a prior density, as returned by gfit + g_est[0] is the x-positions + g_est[1] is the densities sigma: int noise estimate @@ -147,18 +153,17 @@ def calibrateEB(variances, sigma2): """ if (sigma2 <= 0 or min(variances) == max(variances)): return(np.maximum(variances, 0)) + sigma = np.sqrt(sigma2) eb_prior = gfit(variances, sigma) - # Set up a partial execution of the function - part = functools.partial(gbayes, g_est=eb_prior, - sigma=sigma) + if len(variances) >= 200: # Interpolate to speed up computations: calib_x = np.percentile(variances, np.arange(0, 102, 2)) - calib_y = list(map(part, calib_x)) + calib_y = [gbayes(x,g_est=eb_prior,sigma=sigma) for x in calib_x] calib_all = np.interp(variances, calib_x, calib_y) else: - calib_all = list(map(part, variances)) + calib_all = [gbayes(x,g_est=eb_prior,sigma=sigma) for x in variances] return np.asarray(calib_all) From a6f1b04df46b58d5bef2357bd8fc524dd4b46aee Mon Sep 17 00:00:00 2001 From: Ludvig Hult Date: Wed, 19 Jun 2024 13:44:42 +0200 Subject: [PATCH 2/3] remove forgotten debug-print statement --- forestci/calibration.py | 1 - 1 file changed, 1 deletion(-) diff --git a/forestci/calibration.py b/forestci/calibration.py index 5da7ec6..e5e6846 100644 --- a/forestci/calibration.py +++ b/forestci/calibration.py @@ -99,7 +99,6 @@ def neg_loglik(eta): if not res.success: warnings.warn("Fitting the empirical bayes prior failed with message %s." % res.message) eta_hat = res.x - print(eta_hat) g_eta_raw = np.exp(np.dot(XX, eta_hat)) * mask g_eta_main = g_eta_raw / sum(g_eta_raw) g_eta = ( From 6ae1d9e6089235011a4da70854f03df5b406d999 Mon Sep 17 00:00:00 2001 From: Ludvig Hult Date: Wed, 19 Jun 2024 14:39:12 +0200 Subject: [PATCH 3/3] use builtin pow, since np.pow is only for numpy version 2.0.0 and later --- forestci/calibration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forestci/calibration.py b/forestci/calibration.py index e5e6846..828cd36 100644 --- a/forestci/calibration.py +++ b/forestci/calibration.py @@ -68,7 +68,7 @@ def gfit(X, sigma, p=2, nbin=1000, unif_fraction=0.1): assert sum(mask) > 0 g_eta_slab = mask / sum(mask) - XX = np.column_stack([ np.pow(xvals, exp) for exp in range(1, p+1)]) + XX = np.column_stack([ pow(xvals, exp) for exp in range(1, p+1)]) XX /= np.sum(XX,axis = 0, keepdims=True) # normalize each feature column for better numerical stability def neg_loglik(eta):