Fast l1 regularization path with influence adapted boosting
Blunde1 committed Feb 20, 2024
1 parent f936b9e commit e584bcd
Showing 2 changed files with 177 additions and 3 deletions.
10 changes: 7 additions & 3 deletions src/ert/analysis/_es_update.py
@@ -33,7 +33,7 @@
from ..config.analysis_module import ESSettings, IESSettings
from . import misfit_preprocessor
from .event import AnalysisEvent, AnalysisStatusEvent, AnalysisTimeEvent
from .linear_l1_regression import linear_l1_regression
from .linear_l1_regression import linear_boost_ic_regression
from .row_scaling import RowScaling
from .update import RowScalingParameter

@@ -791,7 +791,9 @@ def analysis_ES(
X_local = temp_storage[param_group.name][active_indices, :]

# Estimate Kalman gain
K_lasso = linear_l1_regression(D=Y_noisy.T, X=X_local.T)
# K_lasso = linear_l1_regression(D=Y_noisy.T, X=X_local.T)
# This is an approximate l1 regularization path
K_lasso = linear_boost_ic_regression(D=Y_noisy.T, X=X_local.T)

observation_values_reshaped = observation_values[
:, np.newaxis
@@ -830,7 +832,9 @@ def analysis_ES(
X_local = temp_storage[param_group.name]

# Estimate Kalman gain
K_lasso = linear_l1_regression(D=Y_noisy.T, X=X_local.T)
# K_lasso = linear_l1_regression(D=Y_noisy.T, X=X_local.T)
# This is an approximate l1 regularization path
K_lasso = linear_boost_ic_regression(D=Y_noisy.T, X=X_local.T)

observation_values_reshaped = observation_values[
:, np.newaxis
170 changes: 170 additions & 0 deletions src/ert/analysis/linear_l1_regression.py
@@ -1,4 +1,6 @@
import numpy as np
from scipy.integrate import quad
from scipy.stats import chi2
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
@@ -67,3 +69,171 @@ def linear_l1_regression(D, X, max_iter=100, cv_folds=5):
assert H.shape == (m, p), "Shape of H_sparse must be (m, p)"

return H


def expected_max_chisq(p):
"""Expected maximum of p central chi-square(1) random variables"""

def dmaxchisq(x):
return 1.0 - np.exp(p * chi2.logcdf(x, df=1))

expectation, _ = quad(dmaxchisq, 0, np.inf)
return expectation


def mse(residuals):
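    # Half of the mean squared residual; the 0.5 factor matches the score
    # psi = -residual * x used in calculate_psi_M below.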
return 0.5 * np.mean(residuals**2)


def calculate_psi_M(x, y, beta_estimate):
"""The psi/score function for mse: 0.5*residual**2"""
residuals = y - beta_estimate * x
psi = -residuals * x
M = -np.mean(x**2)
return psi, M


def calculate_influence(x, y, beta_estimate):
"""The influence of (x, y) on beta_estimate as an mse M-estimator"""
psi, M = calculate_psi_M(x, y, beta_estimate)
return psi / M


def boost_linear_regression(X, y, learning_rate=0.3, tol=1e-6, max_iter=10000):
"""Boost coefficients of linearly regressing y on standardized X.
The coefficient selection utilizes information theoretic weighting.
The stopping criterion utilizes information theoretic loss-reduction.
"""
n_samples, n_features = X.shape
coefficients = np.zeros(n_features)
residuals, residuals_loo = y.copy(), y.copy()
y_mse = mse(y)
previous_residual_mse = y_mse
mse_factor = expected_max_chisq(n_features)
comparison_factor_aic = 1 + mse_factor / n_samples
    # The (fast) AIC comparison factor treats the AIC penalty on the mse as
    # n_features / n_samples.
    # A stricter criterion is the loo adjustment, mse(residuals_loo) - mse(residuals),
    # which converges to the TIC and, under certain conditions, equals the AIC.
    # At worst, the greedy selection is maximizing over squared statistics
    # (see Lunde 2020, Appendix A), which needs to be adjusted for;
    # mse_factor, the expected maximum of n_features chi-square(1) variables,
    # makes that adjustment.

for iteration in range(max_iter):
coef_changes = np.zeros(n_features)

# Calculate coefficient change for each feature
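        # With standardized (unit-variance) columns, as produced in
        # linear_boost_ic_regression below, dot(x_j, residuals) / n_samples is the
        # univariate least-squares coefficient of the residuals on x_j.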
for feature in range(n_features):
coef_changes[feature] = np.dot(X[:, feature], residuals) / n_samples

# adjust based on information-criterion penalty
feature_evaluation = np.zeros(n_features)
for feature in range(n_features):
beta_estimate_j = coef_changes[feature]
residuals_j = residuals - beta_estimate_j * X[:, feature]
residual_mse_j = mse(residuals_j) # np.cov(residuals_j, rowvar=False)
if coefficients[feature] == 0:
                # Add the IC penalty: for the mse this is one extra conditional variance
                # (in the (fast) AIC context), because the feature is not yet in the model.
                # The IC penalty for already-added features is constant across both models
                # being compared, and is therefore not included.
feature_evaluation[feature] = residual_mse_j * comparison_factor_aic
else:
feature_evaluation[feature] = residual_mse_j

# Select feature based on loss criterion
best_feature = np.argmin(feature_evaluation)
beta_estimate = coef_changes[best_feature]
coef_change = beta_estimate * learning_rate

        # Adjust coef_change to leave-one-out (loo) estimates
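        # influence has one entry per sample; subtracting influence / n_samples is a
        # first-order jackknife approximation of the coefficient fitted without that
        # sample, giving per-sample leave-one-out residuals below.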
influence = calculate_influence(X[:, best_feature], residuals, beta_estimate)
beta_estimate_loo = beta_estimate - influence / n_samples
coef_change_loo = beta_estimate_loo * learning_rate

# Make sure boosting should start at all
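        # Compare the adjusted loss of the first full (un-shrunk) step against the
        # loss of the null model; if it is not an improvement, return all-zero coefficients.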
if iteration == 0:
residuals_full = y - beta_estimate * X[:, best_feature]
residuals_full_loo = y - beta_estimate_loo * X[:, best_feature]
if y_mse < mse(residuals_full) + mse_factor * (
mse(residuals_full_loo) - mse(residuals_full)
):
break

# Update residuals
residuals -= coef_change * X[:, best_feature]
residuals_loo -= coef_change_loo * X[:, best_feature]

        # Leave-one-out, max-chi-square adjusted estimate of the residual mse
new_residual_mse = mse(residuals) + mse_factor * (
mse(residuals_loo) - mse(residuals)
)

# Check for convergence
if np.abs(coef_change) < tol or previous_residual_mse < new_residual_mse:
break
else:
# Update
previous_residual_mse = new_residual_mse
coefficients[best_feature] += coef_change

return coefficients


def linear_boost_ic_regression(D, X):
"""
Performs LASSO regression for each response in X against predictors in D,
constructing a sparse matrix of regression coefficients.
The function scales features in D using standard scaling before applying
LASSO, then re-scales the coefficients to the original scale of D. This
extracts the effect of each feature in D on each response in X, ignoring
intercepts and constant terms.
Parameters
----------
D : np.ndarray
2D array of predictors with shape (n, p).
X : np.ndarray
2D array of responses with shape (n, m).
Returns
-------
H : np.ndarray
2D array of responses with shape (m, p) with re-scaled LASSO
regression coefficients for each response in X.
Raises
------
AssertionError
If the number of samples in D and X do not match, or if the shape of
H is not (m, p).
"""
n, p = D.shape # p: number of features
n_y, m = X.shape # m: number of y responses

# Assert that the first dimension of D and X are the same
assert n == n_y, "Number of samples in D and X must be the same"

scaler_d = StandardScaler()
D_scaled = scaler_d.fit_transform(D)

scaler_x = StandardScaler()
X_scaled = scaler_x.fit_transform(X)

# Loop over features
H = np.zeros((m, p))
for j in tqdm(range(m), desc="Learning sparse linear map for each response"):
x_j = X_scaled[:, j]

# Learn individual fit
coefficients_j = boost_linear_regression(D_scaled, x_j)

# Extract coefficients
for non_zero_ind in coefficients_j.nonzero()[0]:
H[j, non_zero_ind] = (
scaler_x.scale_[j]
* coefficients_j[non_zero_ind]
/ scaler_d.scale_[non_zero_ind]
)

# Assert shape of H_sparse
assert H.shape == (m, p), "Shape of H_sparse must be (m, p)"

return H
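
A minimal usage sketch of the new entry point on synthetic data (illustrative only, not part of the commit; it assumes ert is installed so the module resolves as ert.analysis.linear_l1_regression, and uses a hypothetical sparse coefficient matrix B_true):

    import numpy as np

    from ert.analysis.linear_l1_regression import linear_boost_ic_regression

    rng = np.random.default_rng(0)
    n, p, m = 200, 10, 3  # samples, predictors, responses

    D = rng.standard_normal((n, p))  # predictors, shape (n, p)
    B_true = np.zeros((m, p))  # sparse "true" coefficient matrix (synthetic)
    B_true[0, 0], B_true[1, 3], B_true[2, 7] = 2.0, -1.5, 0.5
    X = D @ B_true.T + 0.1 * rng.standard_normal((n, m))  # responses, shape (n, m)

    H = linear_boost_ic_regression(D=D, X=X)  # coefficients, shape (m, p)
    assert H.shape == (m, p)
    # H should be mostly zero, with its large entries near the non-zero entries of B_true.
    print(np.round(H, 2))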
