diff --git a/.Rprofile b/.Rprofile
index 0201e1af9..4f507a9da 100644
--- a/.Rprofile
+++ b/.Rprofile
@@ -1,3 +1,7 @@
+if (requireNamespace("testthat", quietly = TRUE)) {
+ testthat::set_max_fails(Inf)
+}
+
#' Helper function for package development
#'
#' This is a manual extension of [testthat::snapshot_review()] which works for the \code{.rds} files used in
@@ -7,17 +11,19 @@
#' @param ... Additional arguments passed to [waldo::compare()]
#' Gives the relative path to the test files to review
#'
-snapshot_review_man <- function(path, tolerance = NULL, ...) {
- changed <- testthat:::snapshot_meta(path)
- these_rds <- (tools::file_ext(changed$name) == "rds")
- if (any(these_rds)) {
- for (i in which(these_rds)) {
- old <- readRDS(changed[i, "cur"])
- new <- readRDS(changed[i, "new"])
+snapshot_review_man <- function(path, tolerance = 10^(-5), max_diffs = 200, ...) {
+ if (requireNamespace("testthat", quietly = TRUE) && requireNamespace("waldo", quietly = TRUE)) {
+ changed <- testthat:::snapshot_meta(path)
+ these_rds <- (tools::file_ext(changed$name) == "rds")
+ if (any(these_rds)) {
+ for (i in which(these_rds)) {
+ old <- readRDS(changed[i, "cur"])
+ new <- readRDS(changed[i, "new"])
- cat(paste0("Difference for check ", changed[i, "name"], " in test ", changed[i, "test"], "\n"))
- print(waldo::compare(old, new, max_diffs = 50, tolerance = tolerance, ...))
- browser()
+ cat(paste0("Difference for check ", changed[i, "name"], " in test ", changed[i, "test"], "\n"))
+ print(waldo::compare(old, new, max_diffs = max_diffs, tolerance = tolerance, ...))
+ browser()
+ }
}
}
}
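
For context, a typical development-time call of the helper above might look like the following (the snapshot path is hypothetical):

    # Review the changed .rds snapshots under a given test directory (hypothetical path)
    snapshot_review_man("output/", tolerance = 1e-6)
    # For each changed .rds snapshot, this prints the waldo::compare() diff and then
    # drops into browser(), so `old` and `new` can be inspected interactively
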
diff --git a/DESCRIPTION b/DESCRIPTION
index a823f1e19..0b4a11358 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -27,7 +27,7 @@ Encoding: UTF-8
LazyData: true
ByteCompile: true
Language: en-US
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
Depends: R (>= 3.5.0)
Imports:
stats,
@@ -40,7 +40,7 @@ Suggests:
ranger,
xgboost,
mgcv,
- testthat (>= 3.0.0),
+ testthat,
knitr,
rmarkdown,
roxygen2,
@@ -66,7 +66,8 @@ Suggests:
yardstick,
hardhat,
rsample,
- rlang
+ rlang,
+ cli
LinkingTo:
RcppArmadillo,
Rcpp
diff --git a/NAMESPACE b/NAMESPACE
index 1fa9bc343..e377010ac 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -52,15 +52,22 @@ S3method(setup_approach,regression_separate)
S3method(setup_approach,regression_surrogate)
S3method(setup_approach,timeseries)
S3method(setup_approach,vaeac)
+export(additional_regression_setup)
export(aicc_full_single_cpp)
+export(check_convergence)
+export(coalition_matrix_cpp)
+export(compute_estimates)
export(compute_shapley_new)
+export(compute_time)
export(compute_vS)
+export(compute_vS_forecast)
export(correction_matrix_cpp)
+export(create_coalition_table)
export(explain)
export(explain_forecast)
-export(feature_combinations)
-export(feature_matrix_cpp)
export(finalize_explanation)
+export(finalize_explanation_forecast)
+export(get_adaptive_arguments_default)
export(get_cov_mat)
export(get_data_specs)
export(get_model_specs)
@@ -75,17 +82,23 @@ export(predict_model)
export(prepare_data)
export(prepare_data_copula_cpp)
export(prepare_data_gaussian_cpp)
+export(prepare_next_iteration)
+export(print_iter)
export(regression.train_model)
export(rss_cpp)
+export(save_results)
export(setup)
export(setup_approach)
export(setup_computation)
+export(shapley_setup)
+export(testing_cleanup)
export(vaeac_get_evaluation_criteria)
export(vaeac_get_extra_para_default)
export(vaeac_plot_eval_crit)
export(vaeac_plot_imputed_ggpairs)
export(vaeac_train_model)
export(vaeac_train_model_continue)
+export(weight_matrix)
export(weight_matrix_cpp)
importFrom(Rcpp,sourceCpp)
importFrom(data.table,":=")
@@ -110,6 +123,7 @@ importFrom(stats,as.formula)
importFrom(stats,contrasts)
importFrom(stats,embed)
importFrom(stats,formula)
+importFrom(stats,median)
importFrom(stats,model.frame)
importFrom(stats,model.matrix)
importFrom(stats,predict)
@@ -118,6 +132,7 @@ importFrom(stats,qt)
importFrom(stats,rnorm)
importFrom(stats,sd)
importFrom(stats,setNames)
+importFrom(utils,capture.output)
importFrom(utils,head)
importFrom(utils,methods)
importFrom(utils,modifyList)
diff --git a/NEWS.md b/NEWS.md
index e5f8cb3d1..20c71de62 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,8 @@
-# shapr (development version)
+# shapr 1.0.0
-* Release a Python wrapper (`shaprpyr`, [#325](https://github.com/NorskRegnesentral/shapr/pull/325)) for explaining predictions from Python models (from Python) utilizing almost all functionality of `shapr`. The wrapper moves back and forth back and forth between Python and R, doing the prediction in Python, and almost everything else in R. This simplifies maintenance of `shaprpy` significantly. The wrapper is available [here](https://github.com/NorskRegnesentral/shapr/tree/master/python).
+* (Just some notes so far)
+* Adaptive estimation/convergence detection
+* Verbosity: the `verbose` argument now takes a character vector (e.g. `"progress"`, `"vS_details"`) for finer control of printouts
* Complete restructuring motivated by introducing the Python wrapper. The restructuring splits the explanation tasks into smaller pieces, which was necessary to allow the Python wrapper to move back and forth between R and Python.
* As part of the restructuring, we also did a number of design changes, resulting in a series of breaking changes described below.
@@ -13,6 +15,8 @@
### New features
+* Adaptive sampling of the coalitions used in the Shapley value estimation
+* Release a Python wrapper (`shaprpyr`, [#325](https://github.com/NorskRegnesentral/shapr/pull/325)) for explaining predictions from Python models (from Python) utilizing almost all functionality of `shapr`. The wrapper moves back and forth between Python and R, doing the prediction in Python, and almost everything else in R. This simplifies maintenance of `shaprpy` significantly. The wrapper is available [here](https://github.com/NorskRegnesentral/shapr/tree/master/python).
* Introduce batch computation of conditional expectations ([#244](https://github.com/NorskRegnesentral/shapr/issues/244)).
This essentially computes $v(S)$ for a portion of the $S$-subsets at a time, to reduce the amount of data needed to be held in memory.
The user can control the number of batches herself, but we set a reasonable value by default ([#327](https://github.com/NorskRegnesentral/shapr/pull/327)).
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 1f27325fe..d907eea50 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -110,7 +110,7 @@ inv_gaussian_transform_cpp <- function(z, x) {
#' Generate (Gaussian) Copula MC samples
#'
-#' @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the
+#' @param MC_samples_mat arma::mat. Matrix of dimension (`n_MC_samples`, `n_features`) containing samples from the
#' univariate standard normal.
#' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations
#' to explain on the original scale.
@@ -118,7 +118,7 @@ inv_gaussian_transform_cpp <- function(z, x) {
#' observations to explain after being transformed using the Gaussian transform, i.e., the samples have been
#' transformed to a standardized normal distribution.
#' @param x_train_mat arma::mat. Matrix of dimension (`n_train`, `n_features`) containing the training observations.
-#' @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of
+#' @param S arma::mat. Matrix of dimension (`n_coalitions`, `n_features`) containing binary representations of
#' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
#' This is not a problem internally in shapr as the empty and grand coalitions are treated differently.
#' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature after being transformed
@@ -127,8 +127,8 @@ inv_gaussian_transform_cpp <- function(z, x) {
#' between all pairs of features after being transformed using the Gaussian transform, i.e., the samples have been
#' transformed to a standardized normal distribution.
#'
-#' @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where
-#' the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian
+#' @return An arma::cube/3D array of dimension (`n_MC_samples`, `n_explain` * `n_coalitions`, `n_features`), where
+#' the columns (_,j,_) are matrices of dimension (`n_MC_samples`, `n_features`) containing the conditional Gaussian
#' copula MC samples for each explicand and coalition on the original scale.
#'
#' @export
@@ -140,19 +140,19 @@ prepare_data_copula_cpp <- function(MC_samples_mat, x_explain_mat, x_explain_gau
#' Generate Gaussian MC samples
#'
-#' @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the
+#' @param MC_samples_mat arma::mat. Matrix of dimension (`n_MC_samples`, `n_features`) containing samples from the
#' univariate standard normal.
#' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations
#' to explain.
-#' @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of
+#' @param S arma::mat. Matrix of dimension (`n_coalitions`, `n_features`) containing binary representations of
#' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
#' This is not a problem internally in shapr as the empty and grand coalitions are treated differently.
#' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature.
#' @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance
#' between all pairs of features.
#'
-#' @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where
-#' the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian
+#' @return An arma::cube/3D array of dimension (`n_MC_samples`, `n_explain` * `n_coalitions`, `n_features`), where
+#' the columns (_,j,_) are matrices of dimension (`n_MC_samples`, `n_features`) containing the conditional Gaussian
#' MC samples for each explicand and coalition.
#'
#' @export
@@ -199,7 +199,7 @@ sample_features_cpp <- function(m, n_features) {
#'
#' @param xtest Numeric matrix. Represents a single test observation.
#'
-#' @param S Integer matrix of dimension \code{n_combinations x m}, where \code{n_combinations} equals
+#' @param S Integer matrix of dimension \code{n_coalitions x m}, where \code{n_coalitions} equals
#' the total number of sampled/non-sampled feature combinations and \code{m} equals
#' the total number of unique features. Note that \code{m = ncol(xtrain)}. See details
#' for more information.
@@ -228,34 +228,34 @@ observation_impute_cpp <- function(index_xtrain, index_s, xtrain, xtest, S) {
#' Calculate weight matrix
#'
-#' @param subsets List. Each of the elements equals an integer
+#' @param coalitions List. Each of the elements equals an integer
#' vector representing a valid combination of features/feature groups.
#' @param m Integer. Number of features/feature groups
#' @param n Integer. Number of combinations
#' @param w Numeric vector of length \code{n}, i.e. \code{w[i]} equals
#' the Shapley weight of feature/feature group combination \code{i}, represented by
-#' \code{subsets[[i]]}.
+#' \code{coalitions[[i]]}.
#'
#' @export
#' @keywords internal
#'
#' @return Matrix of dimension n x m + 1
-#' @author Nikolai Sellereite
-weight_matrix_cpp <- function(subsets, m, n, w) {
- .Call(`_shapr_weight_matrix_cpp`, subsets, m, n, w)
+#' @author Nikolai Sellereite, Martin Jullum
+weight_matrix_cpp <- function(coalitions, m, n, w) {
+ .Call(`_shapr_weight_matrix_cpp`, coalitions, m, n, w)
}
-#' Get feature matrix
+#' Get coalition matrix
#'
-#' @param features List
-#' @param m Positive integer. Total number of features
+#' @param coalitions List
+#' @param m Positive integer. Total number of features
#'
#' @export
#' @keywords internal
#'
#' @return Matrix
-#' @author Nikolai Sellereite
-feature_matrix_cpp <- function(features, m) {
- .Call(`_shapr_feature_matrix_cpp`, features, m)
+#' @author Nikolai Sellereite, Martin Jullum
+coalition_matrix_cpp <- function(coalitions, m) {
+ .Call(`_shapr_coalition_matrix_cpp`, coalitions, m)
}
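
A minimal sketch of what the renamed `coalition_matrix_cpp()` is expected to produce, assuming `m` counts features (the toy input below is illustrative only):

    # Three coalitions over m = 3 features: empty, {1, 3}, and the grand coalition
    coalitions <- list(integer(0), c(1L, 3L), 1:3)
    coalition_matrix_cpp(coalitions, m = 3)
    # One row per coalition, with 1 marking the features present:
    # 0 0 0
    # 1 0 1
    # 1 1 1
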
diff --git a/R/approach.R b/R/approach.R
index e0325ea3d..2b08c454f 100644
--- a/R/approach.R
+++ b/R/approach.R
@@ -9,17 +9,49 @@
#'
#' @export
setup_approach <- function(internal, ...) {
+ verbose <- internal$parameters$verbose
+
approach <- internal$parameters$approach
- this_class <- ""
+ iter <- length(internal$iter_list)
+ X <- internal$iter_list[[iter]]$X
- if (length(approach) > 1) {
- class(this_class) <- "combined"
+
+
+ needs_X <- c("regression_surrogate", "vaeac")
+
+ run_now <- (isFALSE(any(needs_X %in% approach)) && isTRUE(is.null(X))) ||
+ (isTRUE(any(needs_X %in% approach)) && isFALSE(is.null(X)))
+
+ if (isFALSE(run_now)) { # Do nothing
+ return(internal)
} else {
- class(this_class) <- approach
- }
+ if ("progress" %in% verbose) {
+ cli::cli_progress_step("Setting up approach(es)")
+ }
+ if ("vS_details" %in% verbose) {
+ if ("vaeac" %in% approach) {
+ pretrained_provided <- internal$parameters$vaeac.extra_parameters$vaeac.pretrained_vaeac_model_provided
+ if (isFALSE(pretrained_provided)) {
+ cli::cli_h2("Extra info about the training/tuning of the vaeac model")
+ } else {
+ cli::cli_h2("Extra info about the pretrained vaeac model")
+ }
+ }
+ }
+
+ this_class <- ""
+
+ if (length(approach) > 1) {
+ class(this_class) <- "combined"
+ } else {
+ class(this_class) <- approach
+ }
+
+ UseMethod("setup_approach", this_class)
- UseMethod("setup_approach", this_class)
+ internal$timing_list$setup_approach <- Sys.time()
+ }
}
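
The `run_now` gate above defers setup for approaches that need the coalition table `X`. A hedged illustration with toy values:

    needs_X <- c("regression_surrogate", "vaeac")
    approach <- "gaussian"; X <- NULL
    (!any(needs_X %in% approach) && is.null(X))  # TRUE: runs on the first call, before X exists
    approach <- "vaeac"
    (!any(needs_X %in% approach) && is.null(X)) ||
      (any(needs_X %in% approach) && !is.null(X))  # FALSE: deferred until X is available
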
#' @inheritParams default_doc
@@ -49,6 +81,10 @@ setup_approach.combined <- function(internal, ...) {
#' @export
#' @keywords internal
prepare_data <- function(internal, index_features = NULL, ...) {
+ iter <- length(internal$iter_list)
+
+ X <- internal$iter_list[[iter]]$X
+
# Extract the used approach(es)
approach <- internal$parameters$approach
@@ -57,9 +93,9 @@ prepare_data <- function(internal, index_features = NULL, ...) {
# Check if the user provided one or several approaches.
if (length(approach) > 1) {
- # Picks the relevant approach from the internal$objects$X table which list the unique approach of the batch
+ # Picks the relevant approach from the X table, which lists the unique approach of the batch
# matches by index_features
- class(this_class) <- internal$objects$X[id_combination == index_features[1], approach]
+ class(this_class) <- X[id_coalition == index_features[1], approach]
} else {
# Only one approach for all coalitions sizes
class(this_class) <- approach
diff --git a/R/approach_categorical.R b/R/approach_categorical.R
index f29ea07f2..42bdc158b 100644
--- a/R/approach_categorical.R
+++ b/R/approach_categorical.R
@@ -96,8 +96,11 @@ prepare_data.categorical <- function(internal, index_features = NULL, ...) {
joint_probability_dt <- internal$parameters$categorical.joint_prob_dt
- X <- internal$objects$X
- S <- internal$objects$S
+ iter <- length(internal$iter_list)
+
+ X <- internal$iter_list[[iter]]$X
+ S <- internal$iter_list[[iter]]$S
+
if (is.null(index_features)) { # 2,3
features <- X$features # list of [1], [2], [2, 3]
@@ -106,9 +109,9 @@ prepare_data.categorical <- function(internal, index_features = NULL, ...) {
}
feature_names <- internal$parameters$feature_names
- # 3 id columns: id, id_combination, and id_all
+ # 3 id columns: id, id_coalition, and id_all
# id: for each x_explain observation
- # id_combination: the rows of the S matrix
+ # id_coalition: the rows of the S matrix
# id_all: identifies the unique combinations of feature values from
# the training data (not necessarily the ones in the explain data)
@@ -118,9 +121,9 @@ prepare_data.categorical <- function(internal, index_features = NULL, ...) {
S_dt <- data.table::data.table(S)
S_dt[S_dt == 0] <- NA
- S_dt[, id_combination := seq_len(nrow(S_dt))]
+ S_dt[, id_coalition := seq_len(nrow(S_dt))]
- data.table::setnames(S_dt, c(feature_conditioned, "id_combination"))
+ data.table::setnames(S_dt, c(feature_conditioned, "id_coalition"))
# (1) Compute marginal probabilities
@@ -153,13 +156,13 @@ prepare_data.categorical <- function(internal, index_features = NULL, ...) {
cond_dt <- j_S_all_feat[marg_dt, on = feature_conditioned]
cond_dt[, cond_prob := joint_prob / marg_prob]
- cond_dt[id_combination == 1, marg_prob := 0]
- cond_dt[id_combination == 1, cond_prob := 1]
+ cond_dt[id_coalition == 1, marg_prob := 0]
+ cond_dt[id_coalition == 1, cond_prob := 1]
# check marginal probabilities
cond_dt_unique <- unique(cond_dt, by = feature_conditioned)
- check <- cond_dt_unique[id_combination != 1][, .(sum_prob = sum(marg_prob)),
- by = "id_combination"
+ check <- cond_dt_unique[id_coalition != 1][, .(sum_prob = sum(marg_prob)),
+ by = "id_coalition"
][["sum_prob"]]
if (!all(round(check) == 1)) {
print("Warning - not all marginal probabilities sum to 1. There could be a problem
@@ -167,7 +170,7 @@ prepare_data.categorical <- function(internal, index_features = NULL, ...) {
}
# make x_explain
- data.table::setkeyv(cond_dt, c("id_combination", "id_all"))
+ data.table::setkeyv(cond_dt, c("id_coalition", "id_all"))
x_explain_with_id <- data.table::copy(x_explain)[, id := .I]
dt_just_explain <- cond_dt[x_explain_with_id, on = feature_names]
@@ -178,8 +181,8 @@ prepare_data.categorical <- function(internal, index_features = NULL, ...) {
dt <- cond_dt[dt_explain_just_conditioned, on = feature_conditioned, allow.cartesian = TRUE]
# check conditional probabilities
- check <- dt[id_combination != 1][, .(sum_prob = sum(cond_prob)),
- by = c("id_combination", "id")
+ check <- dt[id_coalition != 1][, .(sum_prob = sum(cond_prob)),
+ by = c("id_coalition", "id")
][["sum_prob"]]
if (!all(round(check) == 1)) {
print("Warning - not all conditional probabilities sum to 1. There could be a problem
@@ -187,13 +190,13 @@ prepare_data.categorical <- function(internal, index_features = NULL, ...) {
}
setnames(dt, "cond_prob", "w")
- data.table::setkeyv(dt, c("id_combination", "id"))
+ data.table::setkeyv(dt, c("id_coalition", "id"))
# here we merge so that we only return the combinations found in our actual explain data
# this merge does not change the number of rows in dt
- # dt <- merge(dt, x$X[, .(id_combination, n_features)], by = "id_combination")
+ # dt <- merge(dt, x$X[, .(id_coalition, n_features)], by = "id_coalition")
# dt[n_features %in% c(0, ncol(x_explain)), w := 1.0]
- dt[id_combination %in% c(1, 2^ncol(x_explain)), w := 1.0]
- ret_col <- c("id_combination", "id", feature_names, "w")
- return(dt[id_combination %in% index_features, mget(ret_col)])
+ dt[id_coalition %in% c(1, 2^ncol(x_explain)), w := 1.0]
+ ret_col <- c("id_coalition", "id", feature_names, "w")
+ return(dt[id_coalition %in% index_features, mget(ret_col)])
}
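
The weight `w` computed above is the conditional probability of the unconditioned feature values given the conditioned ones. A toy numeric check (hypothetical probabilities):

    joint_prob <- 0.06  # estimated P(x1 = a, x2 = b)
    marg_prob <- 0.20   # estimated P(x2 = b), the conditioned-on part
    joint_prob / marg_prob  # 0.3 = P(x1 = a | x2 = b), used as the MC weight w
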
diff --git a/R/approach_copula.R b/R/approach_copula.R
index 4e7f5e914..8056d9f07 100644
--- a/R/approach_copula.R
+++ b/R/approach_copula.R
@@ -47,24 +47,27 @@ setup_approach.copula <- function(internal, ...) {
#' @author Lars Henry Berge Olsen
prepare_data.copula <- function(internal, index_features, ...) {
# Extract used variables
- S <- internal$objects$S[index_features, , drop = FALSE]
feature_names <- internal$parameters$feature_names
n_explain <- internal$parameters$n_explain
- n_samples <- internal$parameters$n_samples
+ n_MC_samples <- internal$parameters$n_MC_samples
n_features <- internal$parameters$n_features
- n_combinations_now <- length(index_features)
+ n_coalitions_now <- length(index_features)
x_train_mat <- as.matrix(internal$data$x_train)
x_explain_mat <- as.matrix(internal$data$x_explain)
copula.mu <- internal$parameters$copula.mu
copula.cov_mat <- internal$parameters$copula.cov_mat
copula.x_explain_gaussian_mat <- as.matrix(internal$data$copula.x_explain_gaussian)
+ iter <- length(internal$iter_list)
+
+ S <- internal$iter_list[[iter]]$S[index_features, , drop = FALSE]
+
# Generate the MC samples from N(0, 1)
- MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features)
+ MC_samples_mat <- matrix(rnorm(n_MC_samples * n_features), nrow = n_MC_samples, ncol = n_features)
# Use C++ to convert the MC samples to N(mu_{Sbar|S}, Sigma_{Sbar|S}), for all coalitions and explicands,
# and then transforming them back to the original scale using the inverse Gaussian transform in C++.
- # The object `dt` is a 3D array of dimension (n_samples, n_explain * n_coalitions, n_features).
+ # The object `dt` is a 3D array of dimension (n_MC_samples, n_explain * n_coalitions, n_features).
dt <- prepare_data_copula_cpp(
MC_samples_mat = MC_samples_mat,
x_explain_mat = x_explain_mat,
@@ -75,17 +78,17 @@ prepare_data.copula <- function(internal, index_features, ...) {
cov_mat = copula.cov_mat
)
- # Reshape `dt` to a 2D array of dimension (n_samples * n_explain * n_coalitions, n_features).
- dim(dt) <- c(n_combinations_now * n_explain * n_samples, n_features)
+ # Reshape `dt` to a 2D array of dimension (n_MC_samples * n_explain * n_coalitions, n_features).
+ dim(dt) <- c(n_coalitions_now * n_explain * n_MC_samples, n_features)
# Convert to a data.table and add extra identification columns
dt <- data.table::as.data.table(dt)
data.table::setnames(dt, feature_names)
- dt[, id_combination := rep(seq_len(nrow(S)), each = n_samples * n_explain)]
- dt[, id := rep(seq(n_explain), each = n_samples, times = nrow(S))]
- dt[, w := 1 / n_samples]
- dt[, id_combination := index_features[id_combination]]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ dt[, id_coalition := rep(seq_len(nrow(S)), each = n_MC_samples * n_explain)]
+ dt[, id := rep(seq(n_explain), each = n_MC_samples, times = nrow(S))]
+ dt[, w := 1 / n_MC_samples]
+ dt[, id_coalition := index_features[id_coalition]]
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
return(dt)
}
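
A small check of the ID layout created above (toy sizes; entries cycle fastest over MC samples, then explicands, then coalitions):

    n_MC_samples <- 2; n_explain <- 2; S_rows <- 2
    id_coalition <- rep(seq_len(S_rows), each = n_MC_samples * n_explain)
    id <- rep(seq(n_explain), each = n_MC_samples, times = S_rows)
    rbind(id_coalition, id)
    # id_coalition: 1 1 1 1 2 2 2 2
    # id:           1 1 2 2 1 1 2 2
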
diff --git a/R/approach_ctree.R b/R/approach_ctree.R
index 3c73c0d5a..86e8b5e97 100644
--- a/R/approach_ctree.R
+++ b/R/approach_ctree.R
@@ -12,13 +12,13 @@
#' Determines the minimum sum of weights in a terminal node required for a split
#'
#' @param ctree.sample Boolean. (default = TRUE)
-#' If TRUE, then the method always samples `n_samples` observations from the leaf nodes (with replacement).
-#' If FALSE and the number of observations in the leaf node is less than `n_samples`,
+#' If TRUE, then the method always samples `n_MC_samples` observations from the leaf nodes (with replacement).
+#' If FALSE and the number of observations in the leaf node is less than `n_MC_samples`,
#' the method will take all observations in the leaf.
-#' If FALSE and the number of observations in the leaf node is more than `n_samples`,
-#' the method will sample `n_samples` observations (with replacement).
+#' If FALSE and the number of observations in the leaf node is more than `n_MC_samples`,
+#' the method will sample `n_MC_samples` observations (with replacement).
#' This means that there will always be sampling in the leaf unless
-#' `sample` = FALSE AND the number of obs in the node is less than `n_samples`.
+#' `sample` = FALSE AND the number of obs in the node is less than `n_MC_samples`.
#'
#' @inheritParams default_doc_explain
#'
@@ -46,7 +46,7 @@ prepare_data.ctree <- function(internal, index_features = NULL, ...) {
x_train <- internal$data$x_train
x_explain <- internal$data$x_explain
n_explain <- internal$parameters$n_explain
- n_samples <- internal$parameters$n_samples
+ n_MC_samples <- internal$parameters$n_MC_samples
n_features <- internal$parameters$n_features
ctree.mincriterion <- internal$parameters$ctree.mincriterion
ctree.minsplit <- internal$parameters$ctree.minsplit
@@ -54,7 +54,9 @@ prepare_data.ctree <- function(internal, index_features = NULL, ...) {
ctree.sample <- internal$parameters$ctree.sample
labels <- internal$objects$feature_specs$labels
- X <- internal$objects$X
+ iter <- length(internal$iter_list)
+
+ X <- internal$iter_list[[iter]]$X
dt_l <- list()
@@ -81,24 +83,24 @@ prepare_data.ctree <- function(internal, index_features = NULL, ...) {
l <- lapply(
X = all_trees,
FUN = sample_ctree,
- n_samples = n_samples,
+ n_MC_samples = n_MC_samples,
x_explain = x_explain[i, , drop = FALSE],
x_train = x_train,
n_features = n_features,
sample = ctree.sample
)
- dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination")
- dt_l[[i]][, w := 1 / n_samples]
+ dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_coalition")
+ dt_l[[i]][, w := 1 / n_MC_samples]
dt_l[[i]][, id := i]
- if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]]
+ if (!is.null(index_features)) dt_l[[i]][, id_coalition := index_features[id_coalition]]
}
dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE)
- dt[id_combination %in% c(1, 2^n_features), w := 1.0]
+ dt[id_coalition %in% c(1, 2^n_features), w := 1.0]
# only return unique dt
- dt2 <- dt[, sum(w), by = c("id_combination", labels, "id")]
+ dt2 <- dt[, sum(w), by = c("id_coalition", labels, "id")]
setnames(dt2, "V1", "w")
return(dt2)
@@ -121,7 +123,7 @@ prepare_data.ctree <- function(internal, index_features = NULL, ...) {
#' @param minbucket Numeric scalar. (default = 7)
#' Determines the minimum sum of weights in a terminal node required for a split
#'
-#' @param use_partykit String. In some semi-rare cases `partyk::ctree` runs into an error related to the LINPACK
+#' @param use_partykit String. In some semi-rare cases `party::ctree` runs into an error related to the LINPACK
#' used by R. To get around this problem, one may fall back to using the newer (but slower) `partykit::ctree`
#' function, which is a reimplementation of the same method. Setting this parameter to `"on_error"` (default)
#' falls back to `partykit::ctree`, if `party::ctree` fails. Other options are `"never"`, which always
@@ -202,7 +204,7 @@ create_ctree <- function(given_ind,
#' @param tree List. Contains tree which is an object of type ctree built from the party package.
#' Also contains given_ind, the features to condition upon.
#'
-#' @param n_samples Numeric. Indicates how many samples to use for MCMC.
+#' @param n_MC_samples Numeric. Indicates how many samples to use for MCMC.
#'
#' @param x_explain Matrix, data.frame or data.table with the features of the observation whose
#' predictions ought to be explained (test data). Dimension `1 x p` or `p x 1`.
@@ -213,15 +215,15 @@ create_ctree <- function(given_ind,
#'
#' @param sample Boolean. True indicates that the method samples from the terminal node
#' of the tree whereas False indicates that the method takes all the observations if it is
-#' less than n_samples.
+#' less than n_MC_samples.
#'
-#' @return data.table with `n_samples` (conditional) Gaussian samples
+#' @return data.table with `n_MC_samples` (conditional) Gaussian samples
#'
#' @keywords internal
#'
#' @author Annabelle Redelmeier
sample_ctree <- function(tree,
- n_samples,
+ n_MC_samples,
x_explain,
x_train,
n_features,
@@ -263,12 +265,12 @@ sample_ctree <- function(tree,
rowno <- seq_len(nrow(x_train))
- use_all_obs <- !sample & (length(rowno[fit.nodes == pred.nodes]) <= n_samples)
+ use_all_obs <- !sample & (length(rowno[fit.nodes == pred.nodes]) <= n_MC_samples)
if (use_all_obs) {
newrowno <- rowno[fit.nodes == pred.nodes]
} else {
- newrowno <- sample(rowno[fit.nodes == pred.nodes], n_samples,
+ newrowno <- sample(rowno[fit.nodes == pred.nodes], n_MC_samples,
replace = TRUE
)
}
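
A toy illustration of the leaf-sampling rule above (the node assignments are made up):

    fit.nodes <- c(2, 2, 3, 2, 3)  # terminal node of each training row
    pred.nodes <- 2                # terminal node of the explicand
    n_MC_samples <- 4; sample <- FALSE
    rowno <- seq_along(fit.nodes)
    use_all_obs <- !sample & (length(rowno[fit.nodes == pred.nodes]) <= n_MC_samples)
    if (use_all_obs) rowno[fit.nodes == pred.nodes] else
      sample(rowno[fit.nodes == pred.nodes], n_MC_samples, replace = TRUE)
    # with sample = FALSE and only 3 rows in the node, all 3 are returned unsampled
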
diff --git a/R/approach_empirical.R b/R/approach_empirical.R
index 00f182807..cbf6a7c75 100644
--- a/R/approach_empirical.R
+++ b/R/approach_empirical.R
@@ -12,7 +12,7 @@
#' `eta` is the \eqn{\eta} parameter in equation (15) of Aas et al (2021).
#'
#' @param empirical.fixed_sigma Positive numeric scalar. (default = 0.1)
-#' Represents the kernel bandwidth in the distance computation used when conditioning on all different combinations.
+#' Represents the kernel bandwidth in the distance computation used when conditioning on all different coalitions.
#' Only used when `empirical.type = "fixed_sigma"`
#'
#' @param empirical.n_samples_aicc Positive integer. (default = 1000)
@@ -116,14 +116,17 @@ prepare_data.empirical <- function(internal, index_features = NULL, ...) {
x_explain <- internal$data$x_explain
empirical.cov_mat <- internal$parameters$empirical.cov_mat
- X <- internal$objects$X
- S <- internal$objects$S
+
+ iter <- length(internal$iter_list)
+
+ X <- internal$iter_list[[iter]]$X
+ S <- internal$iter_list[[iter]]$S
n_explain <- internal$parameters$n_explain
empirical.type <- internal$parameters$empirical.type
empirical.eta <- internal$parameters$empirical.eta
empirical.fixed_sigma <- internal$parameters$empirical.fixed_sigma
- n_samples <- internal$parameters$n_samples
+ n_MC_samples <- internal$parameters$n_MC_samples
model <- internal$tmp$model
predict_model <- internal$tmp$predict_model
@@ -165,11 +168,11 @@ prepare_data.empirical <- function(internal, index_features = NULL, ...) {
x_train = as.matrix(x_train),
x_explain = as.matrix(x_explain[i, , drop = FALSE]),
empirical.eta = empirical.eta,
- n_samples = n_samples
+ n_MC_samples = n_MC_samples
)
dt_l[[i]][, id := i]
- if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]]
+ if (!is.null(index_features)) dt_l[[i]][, id_coalition := index_features[id_coalition]]
}
} else {
h_optim_mat <- matrix(NA, ncol = n_col, nrow = no_empirical)
@@ -214,11 +217,11 @@ prepare_data.empirical <- function(internal, index_features = NULL, ...) {
x_train = as.matrix(x_train),
x_explain = as.matrix(x_explain[i, , drop = FALSE]),
empirical.eta = empirical.eta,
- n_samples = n_samples
+ n_MC_samples = n_MC_samples
)
dt_l[[i]][, id := i]
- if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]]
+ if (!is.null(index_features)) dt_l[[i]][, id_coalition := index_features[id_coalition]]
}
}
@@ -235,9 +238,9 @@ prepare_data.empirical <- function(internal, index_features = NULL, ...) {
#' Generate permutations of training data using test observations
#'
#' @param W_kernel Numeric matrix. Contains all nonscaled weights between training and test
-#' observations for all feature combinations. The dimension equals `n_train x m`.
-#' @param S Integer matrix of dimension `n_combinations x m`, where `n_combinations`
-#' and `m` equals the total number of sampled/non-sampled feature combinations and
+#' observations for all coalitions. The dimension equals `n_train x m`.
+#' @param S Integer matrix of dimension `n_coalitions x m`, where `n_coalitions`
+#' and `m` equals the total number of sampled/non-sampled coalitions and
#' the total number of unique features, respectively. Note that `m = ncol(x_train)`.
#' @param x_train Numeric matrix
#' @param x_explain Numeric matrix
@@ -249,15 +252,15 @@ prepare_data.empirical <- function(internal, index_features = NULL, ...) {
#' @keywords internal
#'
#' @author Nikolai Sellereite
-observation_impute <- function(W_kernel, S, x_train, x_explain, empirical.eta = .7, n_samples = 1e3) {
+observation_impute <- function(W_kernel, S, x_train, x_explain, empirical.eta = .7, n_MC_samples = 1e3) {
# Check input
stopifnot(is.matrix(W_kernel) & is.matrix(S))
stopifnot(nrow(W_kernel) == nrow(x_train))
stopifnot(ncol(W_kernel) == nrow(S))
stopifnot(all(S %in% c(0, 1)))
- index_s <- index_x_train <- id_combination <- weight <- w <- wcum <- NULL # due to NSE notes in R CMD check
+ index_s <- index_x_train <- id_coalition <- weight <- w <- wcum <- NULL # due to NSE notes in R CMD check
- # Find weights for all combinations and training data
+ # Find weights for all coalitions and training data
dt <- data.table::as.data.table(W_kernel)
nms_vec <- seq_len(ncol(dt))
names(nms_vec) <- colnames(dt)
@@ -265,11 +268,11 @@ observation_impute <- function(W_kernel, S, x_train, x_explain, empirical.eta =
dt_melt <- data.table::melt(
dt,
id.vars = "index_x_train",
- variable.name = "id_combination",
+ variable.name = "id_coalition",
value.name = "weight",
variable.factor = FALSE
)
- dt_melt[, index_s := nms_vec[id_combination]]
+ dt_melt[, index_s := nms_vec[id_coalition]]
# Remove training data with small weight
knms <- c("index_s", "weight")
@@ -279,7 +282,7 @@ observation_impute <- function(W_kernel, S, x_train, x_explain, empirical.eta =
dt_melt[, wcum := cumsum(weight), by = "index_s"]
dt_melt <- dt_melt[wcum > 1 - empirical.eta][, wcum := NULL]
}
- dt_melt <- dt_melt[, tail(.SD, n_samples), by = "index_s"]
+ dt_melt <- dt_melt[, tail(.SD, n_MC_samples), by = "index_s"]
# Generate data used for prediction
dt_p <- observation_impute_cpp(
@@ -293,7 +296,7 @@ observation_impute <- function(W_kernel, S, x_train, x_explain, empirical.eta =
# Add keys
dt_p <- data.table::as.data.table(dt_p)
data.table::setnames(dt_p, colnames(x_train))
- dt_p[, id_combination := dt_melt[["index_s"]]]
+ dt_p[, id_coalition := dt_melt[["index_s"]]]
dt_p[, w := dt_melt[["weight"]]]
return(dt_p)
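
The `empirical.eta` trimming above keeps, per coalition, the fewest largest-weight training observations whose weights sum to at least `eta`. A toy example (hypothetical weights):

    weight <- sort(c(0.50, 0.30, 0.10, 0.06, 0.04))  # ascending, as after sorting by weight
    empirical.eta <- 0.9
    wcum <- cumsum(weight)
    weight[wcum > 1 - empirical.eta]  # keeps 0.10, 0.30, 0.50 (sum 0.9 >= eta)
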
@@ -362,19 +365,22 @@ compute_AICc_each_k <- function(internal, model, predict_model, index_features)
n_train <- internal$parameters$n_train
n_explain <- internal$parameters$n_explain
empirical.n_samples_aicc <- internal$parameters$empirical.n_samples_aicc
- n_combinations <- internal$parameters$n_combinations
- n_features <- internal$parameters$n_features
+ n_shapley_values <- internal$parameters$n_shapley_values
labels <- internal$objects$feature_specs$labels
empirical.start_aicc <- internal$parameters$empirical.start_aicc
empirical.eval_max_aicc <- internal$parameters$empirical.eval_max_aicc
- X <- internal$objects$X
- S <- internal$objects$S
+ iter <- length(internal$iter_list)
+
+ n_coalitions <- internal$iter_list[[iter]]$n_coalitions
+ X <- internal$iter_list[[iter]]$X
+ S <- internal$iter_list[[iter]]$S
+
stopifnot(
data.table::is.data.table(X),
- !is.null(X[["id_combination"]]),
- !is.null(X[["n_features"]])
+ !is.null(X[["id_coalition"]]),
+ !is.null(X[["coalition_size"]])
)
optimsamp <- sample_combinations(
@@ -386,7 +392,7 @@ compute_AICc_each_k <- function(internal, model, predict_model, index_features)
empirical.n_samples_aicc <- nrow(optimsamp)
nloops <- n_explain # No of observations in test data
- h_optim_mat <- matrix(NA, ncol = n_features, nrow = n_combinations)
+ h_optim_mat <- matrix(NA, ncol = n_shapley_values, nrow = n_coalitions)
if (is.null(index_features)) {
index_features <- X[, .I]
@@ -394,10 +400,10 @@ compute_AICc_each_k <- function(internal, model, predict_model, index_features)
# Optimization is done only once for all distributions which conditions on
# exactly k variables
- these_k <- unique(X[, n_features[index_features]])
+ these_k <- unique(X[, coalition_size[index_features]])
for (i in these_k) {
- these_cond <- X[index_features][n_features == i, id_combination]
+ these_cond <- X[index_features][coalition_size == i, id_coalition]
cutters <- seq_len(empirical.n_samples_aicc)
no_cond <- length(these_cond)
cond_samp <- cut(
@@ -477,14 +483,16 @@ compute_AICc_full <- function(internal, model, predict_model, index_features) {
n_train <- internal$parameters$n_train
n_explain <- internal$parameters$n_explain
empirical.n_samples_aicc <- internal$parameters$empirical.n_samples_aicc
- n_combinations <- internal$parameters$n_combinations
- n_features <- internal$parameters$n_features
+ n_shapley_values <- internal$parameters$n_shapley_values
labels <- internal$objects$feature_specs$labels
empirical.start_aicc <- internal$parameters$empirical.start_aicc
empirical.eval_max_aicc <- internal$parameters$empirical.eval_max_aicc
- X <- internal$objects$X
- S <- internal$objects$S
+ iter <- length(internal$iter_list)
+
+ n_coalitions <- internal$iter_list[[iter]]$n_coalitions
+ X <- internal$iter_list[[iter]]$X
+ S <- internal$iter_list[[iter]]$S
ntest <- n_explain
@@ -500,7 +508,7 @@ compute_AICc_full <- function(internal, model, predict_model, index_features) {
)
nloops <- n_explain # No of observations in test data
- h_optim_mat <- matrix(NA, ncol = n_features, nrow = n_combinations)
+ h_optim_mat <- matrix(NA, ncol = n_shapley_values, nrow = n_coalitions)
if (is.null(index_features)) {
index_features <- X[, .I]
diff --git a/R/approach_gaussian.R b/R/approach_gaussian.R
index 23dd34d98..9c816bc9d 100644
--- a/R/approach_gaussian.R
+++ b/R/approach_gaussian.R
@@ -51,21 +51,24 @@ setup_approach.gaussian <- function(internal,
#' @author Lars Henry Berge Olsen
prepare_data.gaussian <- function(internal, index_features, ...) {
# Extract used variables
- S <- internal$objects$S[index_features, , drop = FALSE]
feature_names <- internal$parameters$feature_names
n_explain <- internal$parameters$n_explain
n_features <- internal$parameters$n_features
- n_samples <- internal$parameters$n_samples
- n_combinations_now <- length(index_features)
+ n_MC_samples <- internal$parameters$n_MC_samples
+ n_coalitions_now <- length(index_features)
x_explain_mat <- as.matrix(internal$data$x_explain)
mu <- internal$parameters$gaussian.mu
cov_mat <- internal$parameters$gaussian.cov_mat
+ iter <- length(internal$iter_list)
+
+ S <- internal$iter_list[[iter]]$S[index_features, , drop = FALSE]
+
# Generate the MC samples from N(0, 1)
- MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features)
+ MC_samples_mat <- matrix(rnorm(n_MC_samples * n_features), nrow = n_MC_samples, ncol = n_features)
# Use Cpp to convert the MC samples to N(mu_{Sbar|S}, Sigma_{Sbar|S}) for all coalitions and explicands.
- # The object `dt` is a 3D array of dimension (n_samples, n_explain * n_coalitions, n_features).
+ # The object `dt` is a 3D array of dimension (n_MC_samples, n_explain * n_coalitions, n_features).
dt <- prepare_data_gaussian_cpp(
MC_samples_mat = MC_samples_mat,
x_explain_mat = x_explain_mat,
@@ -74,17 +77,17 @@ prepare_data.gaussian <- function(internal, index_features, ...) {
cov_mat = cov_mat
)
- # Reshape `dt` to a 2D array of dimension (n_samples * n_explain * n_coalitions, n_features).
- dim(dt) <- c(n_combinations_now * n_explain * n_samples, n_features)
+ # Reshape `dt` to a 2D array of dimension (n_MC_samples * n_explain * n_coalitions, n_features).
+ dim(dt) <- c(n_coalitions_now * n_explain * n_MC_samples, n_features)
# Convert to a data.table and add extra identification columns
dt <- data.table::as.data.table(dt)
data.table::setnames(dt, feature_names)
- dt[, id_combination := rep(seq_len(nrow(S)), each = n_samples * n_explain)]
- dt[, id := rep(seq(n_explain), each = n_samples, times = nrow(S))]
- dt[, w := 1 / n_samples]
- dt[, id_combination := index_features[id_combination]]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ dt[, id_coalition := rep(seq_len(nrow(S)), each = n_MC_samples * n_explain)]
+ dt[, id := rep(seq(n_explain), each = n_MC_samples, times = nrow(S))]
+ dt[, w := 1 / n_MC_samples]
+ dt[, id_coalition := index_features[id_coalition]]
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
return(dt)
}
diff --git a/R/approach_independence.R b/R/approach_independence.R
index ba45b7e4b..7effc7f4e 100644
--- a/R/approach_independence.R
+++ b/R/approach_independence.R
@@ -20,19 +20,21 @@ prepare_data.independence <- function(internal, index_features = NULL, ...) {
# Extract relevant parameters
feature_specs <- internal$objects$feature_specs
- n_samples <- internal$parameters$n_samples
+ n_MC_samples <- internal$parameters$n_MC_samples
n_train <- internal$parameters$n_train
n_explain <- internal$parameters$n_explain
- X <- internal$objects$X
- S <- internal$objects$S
+ iter <- length(internal$iter_list)
+
+ X <- internal$iter_list[[iter]]$X
+ S <- internal$iter_list[[iter]]$S
if (is.null(index_features)) {
- # Use all feature combinations/coalitions (only applies if a single approach is used)
+ # Use all coalitions (only applies if a single approach is used)
index_features <- X[, .I]
}
- # Extract the relevant feature combinations/coalitions
+ # Extract the relevant coalitions
# Set `drop = FALSE` to ensure that `S0` is a matrix.
S0 <- S[index_features, , drop = FALSE]
@@ -65,10 +67,10 @@ prepare_data.independence <- function(internal, index_features = NULL, ...) {
x_explain0_mat <- as.matrix(x_explain0)
# Get coalition indices.
- # We repeat each coalition index `min(n_samples, n_train)` times. We use `min`
- # as we cannot sample `n_samples` unique indices if `n_train` is less than `n_samples`.
- index_s <- rep(seq_len(nrow(S0)), each = min(n_samples, n_train))
- w0 <- 1 / min(n_samples, n_train) # The inverse of the number of samples being used in practice
+ # We repeat each coalition index `min(n_MC_samples, n_train)` times. We use `min`
+ # as we cannot sample `n_MC_samples` unique indices if `n_train` is less than `n_MC_samples`.
+ index_s <- rep(seq_len(nrow(S0)), each = min(n_MC_samples, n_train))
+ w0 <- 1 / min(n_MC_samples, n_train) # The inverse of the number of samples being used in practice
# Create a list to store the MC samples, where the ith entry is associated with the ith explicand
dt_l <- list()
@@ -80,7 +82,7 @@ prepare_data.independence <- function(internal, index_features = NULL, ...) {
# Sample the indices of the training observations we are going to splice the explicand with
# and replicate these indices by the number of coalitions.
- index_xtrain <- c(replicate(nrow(S0), sample(x = seq(n_train), size = min(n_samples, n_train), replace = FALSE)))
+ index_xtrain <- c(replicate(nrow(S0), sample(x = seq(n_train), size = min(n_MC_samples, n_train), replace = FALSE)))
# Generate data used for prediction. This splices the explicand with
# the other sampled training observations for all relevant coalitions.
@@ -95,7 +97,7 @@ prepare_data.independence <- function(internal, index_features = NULL, ...) {
# Add keys
dt_l[[i]] <- data.table::as.data.table(dt_p)
data.table::setnames(dt_l[[i]], feature_specs$labels)
- dt_l[[i]][, id_combination := index_features[index_s]]
+ dt_l[[i]][, id_coalition := index_features[index_s]]
dt_l[[i]][, w := w0]
dt_l[[i]][, id := i]
}
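
A toy illustration of the index construction above (the sizes are made up):

    n_train <- 5; n_MC_samples <- 3; n_S0 <- 2  # n_S0 = number of coalitions in the batch
    index_s <- rep(seq_len(n_S0), each = min(n_MC_samples, n_train))  # 1 1 1 2 2 2
    index_xtrain <- c(replicate(n_S0, sample(seq(n_train), size = min(n_MC_samples, n_train))))
    w0 <- 1 / min(n_MC_samples, n_train)  # equal weight per drawn training row
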
diff --git a/R/approach_regression_separate.R b/R/approach_regression_separate.R
index 7104db548..d7dece122 100644
--- a/R/approach_regression_separate.R
+++ b/R/approach_regression_separate.R
@@ -11,8 +11,8 @@
#' The data.frame must contain the possible hyperparameter value combinations to try.
#' The column names must match the names of the tuneable parameters specified in `regression.model`.
#' If `regression.tune_values` is a function, then it should take one argument `x` which is the training data
-#' for the current combination/coalition and returns a data.frame/data.table/tibble with the properties described above.
-#' Using a function allows the hyperparameter values to change based on the size of the combination. See the regression
+#' for the current coalition and returns a data.frame/data.table/tibble with the properties described above.
+#' Using a function allows the hyperparameter values to change based on the size of the coalition. See the regression
#' vignette for several examples.
#' Note, to make it easier to call `explain()` from Python, the `regression.tune_values` can also be a string
#' containing an R function. For example,
@@ -42,8 +42,6 @@ setup_approach.regression_separate <- function(internal,
regression.check_namespaces()
# Small printout to the user
- if (internal$parameters$verbose == 2) message("Starting 'setup_approach.regression_separate'.")
- if (internal$parameters$verbose == 2) regression.separate_time_mess() # TODO: maybe remove
# Add the default parameter values for the non-user specified parameters for the separate regression approach
defaults <-
@@ -54,7 +52,6 @@ setup_approach.regression_separate <- function(internal,
internal <- regression.check_parameters(internal = internal)
# Small printout to the user
- if (internal$parameters$verbose == 2) message("Done with 'setup_approach.regression_separate'.")
return(internal) # Return the updated internal list
}
@@ -67,38 +64,42 @@ prepare_data.regression_separate <- function(internal, index_features = NULL, ..
# Load `workflows`, needed when parallelized as we call predict with a workflow object. Checked installed above.
requireNamespace("workflows", quietly = TRUE)
+ iter <- length(internal$iter_list)
+
+ X <- internal$iter_list[[iter]]$X
+ verbose <- internal$parameters$verbose
+
# Get the features in the batch
- features <- internal$objects$X$features[index_features]
+ features <- X$features[index_features]
- # Small printout to the user about which batch that are currently worked on
- if (internal$parameters$verbose == 2) regression.prep_message_batch(internal, index_features)
- # Initialize empty data table with specific column names and id_combination (transformed to integer later). The data
+ # Initialize empty data table with specific column names and id_coalition (transformed to integer later). The data
# table will contain the contribution function values for the coalitions given by `index_features` and all explicands.
- dt_res_column_names <- c("id_combination", paste0("p_hat1_", seq_len(internal$parameters$n_explain)))
+ dt_res_column_names <- c("id_coalition", paste0("p_hat1_", seq_len(internal$parameters$n_explain)))
dt_res <- data.table(matrix(ncol = length(dt_res_column_names), nrow = 0, dimnames = list(NULL, dt_res_column_names)))
# Iterate over the coalitions provided by index_features.
# Note that index_features will never be NULL and never contain the empty or grand coalitions.
for (comb_idx in seq_along(features)) {
- # Get the column indices of the features in current coalition/combination
+ # Get the column indices of the features in current coalition
current_comb <- features[[comb_idx]]
# Extract the current training (and add y_hat as response) and explain data
current_x_train <- internal$data$x_train[, ..current_comb][, "y_hat" := internal$data$x_train_y_hat]
current_x_explain <- internal$data$x_explain[, ..current_comb]
+
# Fit the current separate regression model to the current training data
- if (internal$parameters$verbose == 2) regression.prep_message_comb(internal, index_features, comb_idx)
regression.current_fit <- regression.train_model(
x = current_x_train,
seed = internal$parameters$seed,
- verbose = internal$parameters$verbose,
+ verbose = verbose,
regression.model = internal$parameters$regression.model,
regression.tune = internal$parameters$regression.tune,
regression.tune_values = internal$parameters$regression.tune_values,
regression.vfold_cv_para = internal$parameters$regression.vfold_cv_para,
- regression.recipe_func = internal$parameters$regression.recipe_func
+ regression.recipe_func = internal$parameters$regression.recipe_func,
+ current_comb = current_comb
)
# Compute the predicted response for the explicands, i.e., the v(S, x_i) for all explicands x_i.
@@ -108,9 +109,9 @@ prepare_data.regression_separate <- function(internal, index_features = NULL, ..
dt_res <- rbind(dt_res, data.table(index_features[comb_idx], matrix(pred_explicand, nrow = 1)), use.names = FALSE)
}
- # Set id_combination to be the key
- dt_res[, id_combination := as.integer(id_combination)]
- data.table::setkey(dt_res, id_combination)
+ # Set id_coalition to be the key
+ dt_res[, id_coalition := as.integer(id_coalition)]
+ data.table::setkey(dt_res, id_coalition)
# Return the estimated contribution function values
return(dt_res)
@@ -139,14 +140,15 @@ prepare_data.regression_separate <- function(internal, index_features = NULL, ..
#' @keywords internal
regression.train_model <- function(x,
seed = 1,
- verbose = 0,
+ verbose = NULL,
regression.model = parsnip::linear_reg(),
regression.tune = FALSE,
regression.tune_values = NULL,
regression.vfold_cv_para = NULL,
regression.recipe_func = NULL,
regression.response_var = "y_hat",
- regression.surrogate_n_comb = NULL) {
+ regression.surrogate_n_comb = NULL,
+ current_comb = NULL) {
# Create a recipe to the augmented training data
regression.recipe <- recipes::recipe(as.formula(paste(regression.response_var, "~ .")), data = x)
@@ -203,9 +205,14 @@ regression.train_model <- function(x,
grid = regression.grid,
metrics = yardstick::metric_set(yardstick::rmse)
)
-
# Small printout to the user
- if (verbose == 2) regression.cv_message(regression.results = regression.results, regression.grid = regression.grid)
+ if ("vS_details" %in% verbose) {
+ regression.cv_message(
+ regression.results = regression.results,
+ regression.grid = regression.grid,
+ current_comb = current_comb
+ )
+ }
# Set seed for reproducibility. Without this we get different results based on if we run in parallel or sequential
set.seed(seed)
@@ -320,6 +327,11 @@ regression.get_tune <- function(regression.model, regression.tune_values, x_trai
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.check_parameters <- function(internal) {
+ iter <- length(internal$iter_list)
+
+ n_coalitions <- internal$iter_list[[iter]]$n_coalitions
+
+
# Convert the objects to R-objects if they are strings
if (is.character(internal$parameters$regression.model)) {
internal$parameters$regression.model <- regression.get_string_to_R(internal$parameters$regression.model)
@@ -343,7 +355,7 @@ regression.check_parameters <- function(internal) {
# Check that `regression.check_sur_n_comb` is a valid value (only applicable for surrogate regression)
regression.check_sur_n_comb(
regression.surrogate_n_comb = internal$parameters$regression.surrogate_n_comb,
- used_n_combinations = internal$parameters$used_n_combinations
+ n_coalitions = n_coalitions
)
# Check and get if we are to tune the hyperparameters of the regression model
@@ -431,43 +443,6 @@ regression.check_namespaces <- function() {
}
}
-# Message functions ====================================================================================================
-#' Produce time message for separate regression
-#' @author Lars Henry Berge Olsen
-#' @keywords internal
-regression.separate_time_mess <- function() {
- message(paste(
- "When using `approach = 'regression_separate'` the `explanation$timing$timing_secs` object \n",
- "can be missleading as `setup_computation` does not contain the training times of the \n",
- "regression models as they are trained on the fly in `compute_vS`. This is to reduce memory \n",
- "usage and to improve efficency.\n"
- )) # TODO: should we add the time somewhere else?
-}
-
-#' Produce message about which batch prepare_data is working on
-#' @inheritParams default_doc
-#' @inheritParams default_doc_explain
-#' @author Lars Henry Berge Olsen
-#' @keywords internal
-regression.prep_message_batch <- function(internal, index_features) {
- message(paste0(
- "Working on batch ", internal$objects$X[id_combination == index_features[1]]$batch, " of ",
- internal$parameters$n_batches, " in `prepare_data.", internal$parameters$approach, "()`."
- ))
-}
-
-#' Produce message about which combination prepare_data is working on
-#' @inheritParams default_doc
-#' @inheritParams default_doc_explain
-#' @param comb_idx Integer. The index of the combination in a specific batch.
-#' @author Lars Henry Berge Olsen
-#' @keywords internal
-regression.prep_message_comb <- function(internal, index_features, comb_idx) {
- message(paste0(
- "Working on combination with id ", internal$objects$X$id_combination[index_features[comb_idx]],
- " of ", internal$parameters$used_n_combinations, "."
- ))
-}
#' Produce message about which batch prepare_data is working on
#'
@@ -477,7 +452,7 @@ regression.prep_message_comb <- function(internal, index_features, comb_idx) {
#'
#' @author Lars Henry Berge Olsen
#' @keywords internal
-regression.cv_message <- function(regression.results, regression.grid, n_cv = 10) {
+regression.cv_message <- function(regression.results, regression.grid, n_cv = 10, current_comb) {
# Get the feature names and add evaluation metric rmse
feature_names <- names(regression.grid)
feature_names_rmse <- c(feature_names, "rmse", "rmse_std_err")
@@ -494,8 +469,16 @@ regression.cv_message <- function(regression.results, regression.grid, n_cv = 10
regression.grid_best$rmse_std <- round(best_results$std_err, 2)
width <- sapply(regression.grid_best, function(x) max(nchar(as.character(unique(x)))))
- # Message title of the results
- message(paste0("Results of the ", best_results$n[1], "-fold cross validation (top ", n_cv, " best configurations):"))
+ # Separate regression adds the coalition v(S) to the message, while surrogate regression instead prints the extra-info header
+ if (!is.null(current_comb)) {
+ this_vS <- paste0("for v(", paste0(current_comb, collapse = " "), ") ")
+ } else {
+ cli::cli_h2("Extra info about the tuning of the regression model")
+ this_vS <- ""
+ }
+
+ msg0 <- paste0("Top ", n_cv, " best configs ", this_vS, "(using ", best_results$n[1], "-fold CV)")
+ msg <- NULL
# Iterate over the n_cv best results and print out the hyper parameter values and the rmse and rmse_std_err
for (row_idx in seq_len(nrow(best_results))) {
@@ -509,8 +492,13 @@ regression.cv_message <- function(regression.results, regression.grid, n_cv = 10
seq_along(feature_values_rmse),
function(x) format(as.character(feature_values_rmse[x]), width = width[x], justify = "left")
)
- message(paste0("#", row_idx, ": ", paste(paste(feature_names_rmse, "=", values_fixed_len), collapse = " "), ""))
+ msg <- c(msg, paste0(
+ "#", row_idx, ": ", paste(paste(feature_names_rmse, "=", values_fixed_len), collapse = " "),
+ "\n"
+ ))
}
-
- message("") # Empty message to get a blank line
+ cli::cli({
+ cli::cli_h3(msg0)
+ for (i in seq_along(msg)) cli::cli_text(msg[i])
+ })
}
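
For reference, a hypothetical function-valued `regression.tune_values`, where the hyperparameter grid depends on the coalition's training data `x` (names and values are illustrative only):

    regression.tune_values <- function(x) {
      # fewer candidate depths for smaller coalitions
      data.frame(tree_depth = seq_len(max(1, ncol(x) - 1)))
    }
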
diff --git a/R/approach_regression_surrogate.R b/R/approach_regression_surrogate.R
index a61890694..845daaa23 100644
--- a/R/approach_regression_surrogate.R
+++ b/R/approach_regression_surrogate.R
@@ -3,13 +3,17 @@
#'
#' @inheritParams default_doc_explain
#' @inheritParams setup_approach.regression_separate
-#' @param regression.surrogate_n_comb Integer (default is `internal$parameters$used_n_combinations`) specifying the
-#' number of unique combinations/coalitions to apply to each training observation. Maximum allowed value is
-#' "`internal$parameters$used_n_combinations` - 2". By default, we use all coalitions, but this can take a lot of memory
-#' in larger dimensions. Note that by "all", we mean all coalitions chosen by `shapr` to be used. This will be all
-#' \eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if `shapr` is in the exact mode. If the
-#' user sets a lower value than `internal$parameters$used_n_combinations`, then we sample this amount of unique
-#' coalitions separately for each training observations. That is, on average, all coalitions should be equally trained.
+#' @param regression.surrogate_n_comb Integer.
+#' (default is `internal$iter_list[[length(internal$iter_list)]]$n_coalitions`) specifying the
+#' number of unique coalitions to apply to each training observation. Maximum allowed value is
+#' "`internal$iter_list[[length(internal$iter_list)]]$n_coalitions` - 2".
+#' By default, we use all coalitions, but this can take a lot of memory in larger dimensions.
+#' Note that by "all", we mean all coalitions chosen by `shapr` to be used.
+#' This will be all \eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if `shapr` is in
+#' the exact mode.
+#' If the user sets a lower value than `internal$iter_list[[length(internal$iter_list)]]$n_coalitions`,
+#' then we sample this number of unique coalitions separately for each training observation.
+#' That is, on average, all coalitions should be equally trained.
#'
#' @export
#' @author Lars Henry Berge Olsen
@@ -19,13 +23,14 @@ setup_approach.regression_surrogate <- function(internal,
regression.vfold_cv_para = NULL,
regression.recipe_func = NULL,
regression.surrogate_n_comb =
- internal$parameters$used_n_combinations - 2,
+ internal$iter_list[[length(internal$iter_list)]]$n_coalitions - 2,
...) {
+ verbose <- internal$parameters$verbose
+
# Check that required libraries are installed
regression.check_namespaces()
- # Small printout to the user
- if (internal$parameters$verbose == 2) message("Starting 'setup_approach.regression_surrogate'.")
+
# Add the default parameter values for the non-user specified parameters for the separate regression approach
defaults <- mget(c(
@@ -43,11 +48,10 @@ setup_approach.regression_surrogate <- function(internal,
)
# Fit the surrogate regression model and store it in the internal list
- if (internal$parameters$verbose == 2) message("Start training the surrogate model.")
internal$objects$regression.surrogate_model <- regression.train_model(
x = x_train_augmented,
seed = internal$parameters$seed,
- verbose = internal$parameters$verbose,
+ verbose = verbose,
regression.model = internal$parameters$regression.model,
regression.tune = internal$parameters$regression.tune,
regression.tune_values = internal$parameters$regression.tune_values,
@@ -56,8 +60,6 @@ setup_approach.regression_surrogate <- function(internal,
regression.surrogate_n_comb = regression.surrogate_n_comb + 1 # Add 1 as augment_include_grand = TRUE above
)
- # Small printout to the user
- if (internal$parameters$verbose == 2) message("Done with 'setup_approach.regression_surrogate'.")
return(internal) # Return the updated internal list
}
@@ -70,8 +72,6 @@ prepare_data.regression_surrogate <- function(internal, index_features = NULL, .
# Load `workflows`, needed when parallelized as we call predict with a workflow object. Checked installed above.
requireNamespace("workflows", quietly = TRUE)
- # Small printout to the user about which batch that are currently worked on
- if (internal$parameters$verbose == 2) regression.prep_message_batch(internal, index_features)
# Augment the explicand data
x_explain_aug <- regression.surrogate_aug_data(internal, x = internal$data$x_explain, index_features = index_features)
@@ -81,8 +81,8 @@ prepare_data.regression_surrogate <- function(internal, index_features = NULL, .
# Insert the predicted contribution functions values into a data table of the correct setup
dt_res <- data.table(as.integer(index_features), matrix(pred_explicand, nrow = length(index_features)))
- data.table::setnames(dt_res, c("id_combination", paste0("p_hat1_", seq_len(internal$parameters$n_explain))))
- data.table::setkey(dt_res, id_combination) # Set id_combination to be the key
+ data.table::setnames(dt_res, c("id_coalition", paste0("p_hat1_", seq_len(internal$parameters$n_explain))))
+ data.table::setkey(dt_res, id_coalition) # Set id_coalition to be the key
return(dt_res)
}
@@ -95,21 +95,21 @@ prepare_data.regression_surrogate <- function(internal, index_features = NULL, .
#' @param y_hat Vector of numerics (optional) containing the predicted responses for the observations in `x`.
#' @param index_features Array of integers (optional) containing which coalitions to consider. Must be provided if
#' `x` is the explicands.
-#' @param augment_add_id_comb Logical (default is `FALSE`). If `TRUE`, an additional column is adding containing
+#' @param augment_add_id_coal Logical (default is `FALSE`). If `TRUE`, an additional column is added containing
#' which coalition was applied.
#' @param augment_include_grand Logical (default is `FALSE`). If `TRUE`, then the grand coalition is included.
#' If `index_features` are provided, then `augment_include_grand` has no effect. Note that if we sample the
-#' combinations then the grand coalition is equally likely to be samples as the other coalitions (or weighted if
+#' coalitions, then the grand coalition is equally likely to be sampled as the other coalitions (or weighted if
#' `augment_comb_prob` is provided).
#' @param augment_masks_as_factor Logical (default is `FALSE`). If `TRUE`, then the binary masks are converted
#' to factors. If `FALSE`, then the binary masks are numerics.
#' @param augment_comb_prob Array of numerics (default is `NULL`). The length of the array must match the number of
-#' combinations being considered, where each entry specifies the probability of sampling the corresponding coalition.
+#' coalitions being considered, where each entry specifies the probability of sampling the corresponding coalition.
#' This is useful if we want to generate more training data for some specific coalitions. One possible choice would be
-#' `augment_comb_prob = if (use_Shapley_weights) internal$objects$X$shapley_weight[2:actual_n_combinations] else NULL`.
+#' `augment_comb_prob = if (use_Shapley_weights) internal$objects$X$shapley_weight[2:actual_n_coalitions] else NULL`.
#' @param augment_weights String (optional). Specifying which type of weights to add to the observations.
#' If `NULL` (default), then no weights are added. If `"Shapley"`, then the Shapley weights for the different
-#' combinations are added to corresponding observations where the coalitions was applied. If `uniform`, then
+#' coalitions are added to the corresponding observations where the coalition was applied. If `"uniform"`, then
#' all observations get an equal weight of one.
#'
#' @return A data.table containing the augmented data.
@@ -121,25 +121,28 @@ regression.surrogate_aug_data <- function(internal,
index_features = NULL,
augment_masks_as_factor = FALSE,
augment_include_grand = FALSE,
- augment_add_id_comb = FALSE,
+ augment_add_id_coal = FALSE,
augment_comb_prob = NULL,
augment_weights = NULL) {
+ iter <- length(internal$iter_list)
+
# Get some of the parameters
- S <- internal$objects$S
- actual_n_combinations <- internal$parameters$used_n_combinations - 2 # Remove empty and grand coalitions
+ X <- internal$iter_list[[iter]]$X
+ S <- internal$iter_list[[iter]]$S
+ actual_n_coalitions <- internal$iter_list[[iter]]$n_coalitions - 2 # Remove empty and grand coalitions
regression.surrogate_n_comb <- internal$parameters$regression.surrogate_n_comb
if (!is.null(index_features)) regression.surrogate_n_comb <- length(index_features) # Applicable from prep_data()
if (augment_include_grand) {
- actual_n_combinations <- actual_n_combinations + 1 # Add 1 to include the grand comb
+ actual_n_coalitions <- actual_n_coalitions + 1 # Add 1 to include the grand coalition
regression.surrogate_n_comb <- regression.surrogate_n_comb + 1
}
- if (regression.surrogate_n_comb > actual_n_combinations) regression.surrogate_n_comb <- actual_n_combinations
+ if (regression.surrogate_n_comb > actual_n_coalitions) regression.surrogate_n_comb <- actual_n_coalitions
# Small checks
if (!is.null(augment_weights)) augment_weights <- match.arg(augment_weights, c("Shapley", "uniform"))
- if (!is.null(augment_comb_prob) && length(augment_comb_prob) != actual_n_combinations) {
- stop(paste("`augment_comb_prob` must be of length", actual_n_combinations, "."))
+ if (!is.null(augment_comb_prob) && length(augment_comb_prob) != actual_n_coalitions) {
+ stop(paste("`augment_comb_prob` must be of length", actual_n_coalitions, "."))
}
if (!is.null(augment_weights) && augment_include_grand && augment_weights == "Shapley") {
@@ -164,11 +167,11 @@ regression.surrogate_aug_data <- function(internal,
# Check if we are to augment the training data or the explicands
if (is.null(index_features)) {
# Training: get matrix (n_obs x regression.surrogate_n_comb) containing the indices of the active coalitions
- if (regression.surrogate_n_comb >= actual_n_combinations) { # Start from two to exclude the empty set
- comb_active_idx <- matrix(rep(seq(2, actual_n_combinations + 1), times = n_obs), ncol = n_obs)
+ if (regression.surrogate_n_comb >= actual_n_coalitions) { # Start from two to exclude the empty set
+ comb_active_idx <- matrix(rep(seq(2, actual_n_coalitions + 1), times = n_obs), ncol = n_obs)
} else {
comb_active_idx <- sapply(seq(n_obs), function(x) { # Add 1 as we want to exclude the empty set
- sample.int(n = actual_n_combinations, size = regression.surrogate_n_comb, prob = augment_comb_prob) + 1
+ sample.int(n = actual_n_coalitions, size = regression.surrogate_n_comb, prob = augment_comb_prob) + 1
})
}
} else {
@@ -178,8 +181,8 @@ regression.surrogate_aug_data <- function(internal,
# Extract the active coalitions for each explicand. The number of rows are n_obs * n_comb_per_explicands,
# where the first n_comb_per_explicands rows are connected to the first explicand and so on. Set the column names.
- id_comb <- as.vector(comb_active_idx)
- comb_active <- S[id_comb, , drop = FALSE]
+ id_coal <- as.vector(comb_active_idx)
+ comb_active <- S[id_coal, , drop = FALSE]
colnames(comb_active) <- names(feature_classes)
# Repeat the feature values as many times as there are active coalitions
@@ -209,11 +212,11 @@ regression.surrogate_aug_data <- function(internal,
# Add either uniform weights or Shapley kernel weights
if (!is.null(augment_weights)) {
- x_augmented[, "weight" := if (augment_weights == "Shapley") internal$objects$X$shapley_weight[id_comb] else 1]
+ x_augmented[, "weight" := if (augment_weights == "Shapley") X$shapley_weight[id_coal] else 1]
}
- # Add the id_comb as a factor
- if (augment_add_id_comb) x_augmented[, "id_comb" := factor(id_comb)]
+ # Add the id_coal as a factor
+ if (augment_add_id_coal) x_augmented[, "id_coal" := factor(id_coal)]
# Add repeated responses if provided
if (!is.null(y_hat)) x_augmented[, "y_hat" := rep(y_hat, each = regression.surrogate_n_comb)]
@@ -229,16 +232,16 @@ regression.surrogate_aug_data <- function(internal,
#' Check that `regression.surrogate_n_comb` is either NULL or a valid integer.
#'
#' @inheritParams setup_approach.regression_surrogate
-#' @param used_n_combinations Integer. The number of used combinations (including the empty and grand coalitions).
+#' @param n_coalitions Integer. The number of used coalitions (including the empty and grand coalition).
#'
#' @author Lars Henry Berge Olsen
#' @keywords internal
-regression.check_sur_n_comb <- function(regression.surrogate_n_comb, used_n_combinations) {
+regression.check_sur_n_comb <- function(regression.surrogate_n_comb, n_coalitions) {
if (!is.null(regression.surrogate_n_comb)) {
- if (regression.surrogate_n_comb < 1 || used_n_combinations - 2 < regression.surrogate_n_comb) {
+ if (regression.surrogate_n_comb < 1 || n_coalitions - 2 < regression.surrogate_n_comb) {
stop(paste0(
"`regression.surrogate_n_comb` (", regression.surrogate_n_comb, ") must be a positive integer less than or ",
- "equal to `used_n_combinations` minus two (", used_n_combinations - 2, ")."
+ "equal to `n_coalitions` minus two (", n_coalitions - 2, ")."
))
}
}
diff --git a/R/approach_timeseries.R b/R/approach_timeseries.R
index 09f9fc113..1ccf5fa35 100644
--- a/R/approach_timeseries.R
+++ b/R/approach_timeseries.R
@@ -39,7 +39,7 @@ setup_approach.timeseries <- function(internal,
#' @export
#' @keywords internal
prepare_data.timeseries <- function(internal, index_features = NULL, ...) {
- id <- id_combination <- w <- NULL
+ id <- id_coalition <- w <- NULL
x_train <- internal$data$x_train
x_explain <- internal$data$x_explain
@@ -48,8 +48,10 @@ prepare_data.timeseries <- function(internal, index_features = NULL, ...) {
timeseries.upper_bound <- internal$parameters$timeseries.bounds[1]
timeseries.lower_bound <- internal$parameters$timeseries.bounds[2]
- X <- internal$objects$X
- S <- internal$objects$S
+ iter <- length(internal$iter_list)
+
+ X <- internal$iter_list[[iter]]$X
+ S <- internal$iter_list[[iter]]$S
if (is.null(index_features)) {
features <- X$features
@@ -134,12 +136,12 @@ prepare_data.timeseries <- function(internal, index_features = NULL, ...) {
names(tmp[[j]]) <- names(tmp[[1]])
}
- dt_l[[i]] <- rbindlist(tmp, idcol = "id_combination")
- # dt_l[[i]][, w := 1 / .N, by = id_combination] # IS THIS NECESSARY?
+ dt_l[[i]] <- rbindlist(tmp, idcol = "id_coalition")
+ # dt_l[[i]][, w := 1 / .N, by = id_coalition] # IS THIS NECESSARY?
dt_l[[i]][, id := i]
}
dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE)
- ret_col <- c("id_combination", "id", feature_names, "w")
- return(dt[id_combination %in% index_features, mget(ret_col)])
+ ret_col <- c("id_coalition", "id", feature_names, "w")
+ return(dt[id_coalition %in% index_features, mget(ret_col)])
}
diff --git a/R/approach_vaeac.R b/R/approach_vaeac.R
index 4ba03ba20..0c7a812a5 100644
--- a/R/approach_vaeac.R
+++ b/R/approach_vaeac.R
@@ -31,6 +31,8 @@ setup_approach.vaeac <- function(internal, # add default values for vaeac here.
vaeac.epochs = 100,
vaeac.extra_parameters = list(),
...) {
+ verbose <- internal$parameters$verbose
+
# Check that torch is installed
if (!requireNamespace("torch", quietly = TRUE)) {
stop("`torch` is not installed. Please run `install.packages('torch')`.")
@@ -38,12 +40,12 @@ setup_approach.vaeac <- function(internal, # add default values for vaeac here.
if (!torch::torch_is_installed()) stop("`torch` is not properly installed. Please run `torch::install_torch()`.")
# Extract the objects we will use later
- S <- internal$objects$S
- X <- internal$objects$X
+ iter <- length(internal$iter_list)
+
+ X <- internal$iter_list[[iter]]$X
+ S <- internal$iter_list[[iter]]$S
parameters <- internal$parameters
- # Small printout to user
- if (parameters$verbose == 2) message("Setting up the `vaeac` approach.")
# Check if we are doing a combination of approaches
combined_approaches <- length(parameters$approach) > 1
@@ -76,18 +78,18 @@ setup_approach.vaeac <- function(internal, # add default values for vaeac here.
# Check if vaeac is to be applied on a subset of coalitions.
if (!parameters$exact || parameters$is_groupwise || combined_approaches) {
# We have either:
- # 1) sampled `n_combinations` different subsets of coalitions (i.e., not exact),
+ # 1) sampled `n_coalitions` different subsets of coalitions (i.e., not exact),
# 2) using the coalitions which respects the groups in group Shapley values, and/or
# 3) using a combination of approaches where vaeac is only used on a subset of the coalitions.
# Here, objects$S contains the coalitions while objects$X contains the information about the approach.
# Extract the the coalitions / masks which are estimated using vaeac as a matrix
parameters$vaeac.extra_parameters$vaeac.mask_gen_coalitions <-
- S[X[approach == "vaeac"]$id_combination, , drop = FALSE]
+ S[X[approach == "vaeac"]$id_coalition, , drop = FALSE]
# Extract the weights for the corresponding coalitions / masks.
parameters$vaeac.extra_parameters$vaeac.mask_gen_coalitions_prob <-
- X$shapley_weight[X[approach == "vaeac"]$id_combination]
+ X$shapley_weight[X[approach == "vaeac"]$id_coalition]
# Normalize the weights/probabilities such that they sum to one.
parameters$vaeac.extra_parameters$vaeac.mask_gen_coalitions_prob <-
@@ -101,8 +103,8 @@ setup_approach.vaeac <- function(internal, # add default values for vaeac here.
# Check if user provided a pre-trained vaeac model, otherwise, we train one from scratch.
if (is.null(parameters$vaeac.extra_parameters$vaeac.pretrained_vaeac_model)) {
# We train a vaeac model with the parameters in `parameters`, as user did not provide pre-trained vaeac model
- if (parameters$verbose == 2) {
- message(paste0(
+ if ("vS_details" %in% verbose) {
+ cli::cli_text(paste0(
"Training the `vaeac` model with the provided parameters from scratch on ",
 ifelse(parameters$vaeac.extra_parameters$vaeac.cuda, "GPU", "CPU"), "."
))
@@ -137,7 +139,7 @@ setup_approach.vaeac <- function(internal, # add default values for vaeac here.
# The pre-trained vaeac model is either:
# 1. The explanation$internal$parameters$vaeac list of type "vaeac" from an earlier call to explain().
# 2. A string containing the path to where the "vaeac" model is stored on disk.
- if (parameters$verbose == 2) message("Loading the provided `vaeac` model.")
+ if ("vS_details" %in% verbose) cli::cli_text("Loading the provided `vaeac` model.")
# Boolean representing that a pre-trained vaeac model was provided
parameters$vaeac.extra_parameters$vaeac.pretrained_vaeac_model_provided <- TRUE
@@ -146,8 +148,8 @@ setup_approach.vaeac <- function(internal, # add default values for vaeac here.
parameters <- vaeac_update_pretrained_model(parameters = parameters)
# Small printout informing about the location of the model
- if (parameters$verbose == 2) {
- message(paste0(
+ if ("vS_details" %in% verbose) {
+ cli::cli_text(paste0(
"The `vaeac` model runs/is trained on ", ifelse(parameters$vaeac$parameters$cuda, "GPU", "CPU"), "."
))
}
@@ -172,8 +174,18 @@ setup_approach.vaeac <- function(internal, # add default values for vaeac here.
# Update/overwrite the parameters list in the internal list.
internal$parameters <- parameters
- # Small printout to user
- if (parameters$verbose == 2) message("Done with setting up the `vaeac` approach.\n")
+ if ("vS_details" %in% verbose) {
+ folder_to_save_model <- parameters$vaeac$parameters$folder_to_save_model
+ vaeac_save_file_names <- parameters$vaeac$parameters$vaeac_save_file_names
+
+ cli::cli_alert_info(c(
+ "The trained `vaeac` models are saved to folder {.path {folder_to_save_model}} at\n",
+ "{.path {vaeac_save_file_names[1]}}\n",
+ "{.path {vaeac_save_file_names[2]}}\n",
+ "{.path {vaeac_save_file_names[3]}}"
+ ))
+ }
+
# Return the updated internal list.
return(internal)
@@ -185,23 +197,25 @@ setup_approach.vaeac <- function(internal, # add default values for vaeac here.
#' @export
#' @author Lars Henry Berge Olsen
prepare_data.vaeac <- function(internal, index_features = NULL, ...) {
+ iter <- length(internal$iter_list)
+
+ n_coalitions <- internal$iter_list[[iter]]$n_coalitions
+ S <- internal$iter_list[[iter]]$S
+
+ # If not provided, then set `index_features` to all non-trivial coalitions
- if (is.null(index_features)) index_features <- seq(2, internal$parameters$n_combinations - 1)
+ if (is.null(index_features)) index_features <- seq(2, n_coalitions - 1)
# Extract objects we are going to need later
- S <- internal$objects$S
seed <- internal$parameters$seed
verbose <- internal$parameters$verbose
x_explain <- internal$data$x_explain
n_explain <- internal$parameters$n_explain
- n_samples <- internal$parameters$n_samples
+ n_MC_samples <- internal$parameters$n_MC_samples
vaeac.model <- internal$parameters$vaeac.model
vaeac.sampler <- internal$parameters$vaeac.sampler
vaeac.checkpoint <- internal$parameters$vaeac.checkpoint
vaeac.batch_size_sampling <- internal$parameters$vaeac.extra_parameters$vaeac.batch_size_sampling
- # Small printout to the user about which batch we are working on
- if (verbose == 2) vaeac_prep_message_batch(internal = internal, index_features = index_features)
# Apply all coalitions to all explicands to get a data table where `vaeac` will impute the `NaN` values
x_explain_extended <- vaeac_get_x_explain_extended(x_explain = x_explain, S = S, index_features = index_features)
@@ -215,7 +229,7 @@ prepare_data.vaeac <- function(internal, index_features = NULL, ...) {
x_explain_with_MC_samples_dt <- vaeac_impute_missing_entries(
x_explain_with_NaNs = x_explain_extended,
n_explain = n_explain,
- n_samples = n_samples,
+ n_MC_samples = n_MC_samples,
vaeac_model = vaeac.model,
checkpoint = vaeac.checkpoint,
sampler = vaeac.sampler,
@@ -314,8 +328,8 @@ prepare_data.vaeac <- function(internal, index_features = NULL, ...) {
#' `mask_gen_coalitions` is specified.
#' @param mask_gen_coalitions Matrix (default is `NULL`). Matrix containing the coalitions that the
#' `vaeac` model will be trained on, see [shapr::specified_masks_mask_generator()]. This parameter is used internally
-#' in `shapr` when we only consider a subset of coalitions/combinations, i.e., when
-#' `n_combinations` \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
+#' in `shapr` when we only consider a subset of coalitions, i.e., when
+#' `n_coalitions` \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
#' when `group` is specified in [shapr::explain()].
#' @param mask_gen_coalitions_prob Numeric array (default is `NULL`). Array of length equal to the height
#' of `mask_gen_coalitions` containing the probabilities of sampling the corresponding coalitions in
@@ -334,8 +348,6 @@ prepare_data.vaeac <- function(internal, index_features = NULL, ...) {
#' Abalone data set), it can be advantageous to \eqn{\log} transform the data to unbounded form before using `vaeac`.
#' If `TRUE`, then [shapr::vaeac_postprocess_data()] will take the \eqn{\exp} of the results to get back to strictly
#' positive values when using the `vaeac` model to impute missing values/generate the Monte Carlo samples.
-#' @param verbose Boolean. An integer specifying the level of verbosity. Use `0` (default) for no verbosity,
-#' `1` for low verbose, and `2` for high verbose.
#' @param seed Positive integer (default is `1`). Seed for reproducibility. Specifies the seed before any randomness
#' based code is being run.
#' @param which_vaeac_model String (default is `best`). The name of the `vaeac` model (snapshots from different
@@ -344,6 +356,7 @@ prepare_data.vaeac <- function(internal, index_features = NULL, ...) {
#' Note that additional choices are available if `vaeac.save_every_nth_epoch` is provided. For example, if
#' `vaeac.save_every_nth_epoch = 5`, then `vaeac.which_vaeac_model` can also take the values `"epoch_5"`, `"epoch_10"`,
#' `"epoch_15"`, and so on.
+#' @inheritParams explain
#' @param ... List of extra parameters, currently not used.
#'
#' @return A list containing the training/validation errors and paths to where the vaeac models are saved on the disk.
@@ -472,14 +485,14 @@ vaeac_train_model <- function(x_train,
# Add the number of trainable parameters in the vaeac model to the state list
if (initialization_idx == 1) {
state_list$n_trainable_parameters <- vaeac_model$n_train_param
- if (verbose == 2) {
- message(paste0("The vaeac model contains ", vaeac_model$n_train_param[1, 1], " trainable parameters."))
+ if ("vS_details" %in% verbose) {
+ cli::cli_text(paste0("The vaeac model contains ", vaeac_model$n_train_param[1, 1], " trainable parameters."))
}
}
# Print which initialization vaeac the function is working on
- if (verbose == 2) {
- message(paste0("Initializing vaeac number ", initialization_idx, " of ", n_vaeacs_initialize, "."))
+ if ("vS_details" %in% verbose) {
+ cli::cli_text(paste0("Initializing vaeac model number ", initialization_idx, " of ", n_vaeacs_initialize, "."))
}
# Create the ADAM optimizer
@@ -515,8 +528,8 @@ vaeac_train_model <- function(x_train,
# Check if we are printing detailed debug information
# Small printout to the user stating which initiated vaeac model was the best.
- if (verbose == 2) {
- message(paste0(
+ if ("vS_details" %in% verbose) {
+ cli::cli_text(paste0(
"Best vaeac inititalization was number ", vaeac_model_best_list$initialization_idx, " (of ", n_vaeacs_initialize,
") with a training VLB = ", round(as.numeric(vaeac_model_best_list$train_vlb[-1]$cpu()), 3),
" after ", epochs_initiation_phase, " epochs. Continue to train this inititalization."
@@ -705,20 +718,17 @@ vaeac_train_model_auxiliary <- function(vaeac_model,
# Save if current vaeac model has the lowest validation IWAE error
if ((max(val_iwae) <= val_iwae_now)$item() || is.null(best_epoch)) {
best_epoch <- epoch
- if (verbose == 2) message("Saving `best` vaeac model at epoch ", epoch, ".")
vaeac_save_state(state_list = state_list, file_name = vaeac_save_file_names[1])
}
# Save if current vaeac model has the lowest running validation IWAE error
if ((max(val_iwae_running) <= val_iwae_running_now)$item() || is.null(best_epoch_running)) {
best_epoch_running <- epoch
- if (verbose == 2) message("Saving `best_running` vaeac model at epoch ", epoch, ".")
vaeac_save_state(state_list = state_list, file_name = vaeac_save_file_names[2])
}
# Save if we are in an n'th epoch and are to save every n'th epoch
if (is.numeric(save_every_nth_epoch) && epoch %% save_every_nth_epoch == 0) {
- if (verbose == 2) message("Saving `nth_epoch` vaeac model at epoch ", epoch, ".")
vaeac_save_state(state_list = state_list, file_name = vaeac_save_file_names[3 + epoch %/% save_every_nth_epoch])
}
}
@@ -742,8 +752,8 @@ vaeac_train_model_auxiliary <- function(vaeac_model,
# Check if we are to apply early stopping, i.e., no improvement in the IWAE for `epochs_early_stopping` epochs.
if (is.numeric(epochs_early_stopping)) {
if (epoch - best_epoch >= epochs_early_stopping) {
- if (verbose == 2) {
- message(paste0(
+ if ("vS_details" %in% verbose) {
+ cli::cli_text(paste0(
"No IWAE improvment in ", epochs_early_stopping, " epochs. Apply early stopping at epoch ",
epoch, "."
))
@@ -771,11 +781,10 @@ vaeac_train_model_auxiliary <- function(vaeac_model,
)
} else {
# Save the vaeac model at the last epoch
- if (verbose == 2) message("Saving `last` vaeac model at epoch ", epoch, ".")
last_state <- vaeac_save_state(state_list = state_list, file_name = vaeac_save_file_names[3], return_state = TRUE)
# Summary printout
- if (verbose == 2) vaeac_print_train_summary(best_epoch, best_epoch_running, last_state)
+ if ("vS_details" %in% verbose) vaeac_print_train_summary(best_epoch, best_epoch_running, last_state)
# Create a return list
return_list <- list(
@@ -825,14 +834,14 @@ vaeac_train_model_continue <- function(explanation,
lr_new = NULL,
x_train = NULL,
save_data = FALSE,
- verbose = 0,
+ verbose = NULL,
seed = 1) {
# Check the input
if (!"shapr" %in% class(explanation)) stop("`explanation` must be a list of class `shapr`.")
if (!"vaeac" %in% explanation$internal$parameters$approach) stop("`vaeac` is not an approach in `explanation`.")
if (!is.null(lr_new)) vaeac_check_positive_numerics(list(lr_new = lr_new))
if (!is.null(x_train) && !data.table::is.data.table(x_train)) stop("`x_train` must be a `data.table` object.")
- vaeac_check_verbose(verbose)
+ check_verbose(verbose)
vaeac_check_positive_integers(list(epochs_new = epochs_new, seed = seed))
vaeac_check_logicals(list(save_data = save_data))
@@ -998,25 +1007,26 @@ vaeac_train_model_continue <- function(explanation,
#'
#' @inheritParams vaeac_train_model
#' @param x_explain_with_NaNs A 2D matrix, where the missing entries to impute are represented by `NaN`.
-#' @param n_samples Integer. The number of imputed versions we create for each row in `x_explain_with_NaNs`.
+#' @param n_MC_samples Integer. The number of imputed versions we create for each row in `x_explain_with_NaNs`.
#' @param index_features Optional integer vector. Used internally in shapr package to index the coalitions.
#' @param n_explain Positive integer. The number of explicands.
#' @param vaeac_model An initialized `vaeac` model that we are going to use to generate the MC samples.
#' @param checkpoint List containing the parameters of the `vaeac` model.
#' @param sampler A sampler object used to sample the MC samples.
#'
-#' @return A data.table where the missing values (`NaN`) in `x_explain_with_NaNs` have been imputed `n_samples` times.
+#' @return A data.table where the missing values (`NaN`) in `x_explain_with_NaNs` have been imputed `n_MC_samples`
+#' times.
#' The data table will contain extra id columns if `index_features` and `n_explain` are provided.
#'
#' @keywords internal
#' @author Lars Henry Berge Olsen
vaeac_impute_missing_entries <- function(x_explain_with_NaNs,
- n_samples,
+ n_MC_samples,
vaeac_model,
checkpoint,
sampler,
batch_size,
- verbose = 0,
+ verbose = NULL,
seed = NULL,
n_explain = NULL,
index_features = NULL) {
@@ -1031,8 +1041,6 @@ vaeac_impute_missing_entries <- function(x_explain_with_NaNs,
torch::torch_manual_seed(seed)
}
- if (verbose == 2) message("Preprocessing the explicands.")
-
# Preprocess `x_explain_with_NaNs`. Turn factor names into numerics 1,2,...,K, (vaeac only accepts numerics) and keep
 track of the mapping of names. Optionally log-transform the continuous features. Finally, normalize the data
# using the training means and standard deviations. I.e., we assume that the new data follow the same distribution as
@@ -1051,11 +1059,9 @@ vaeac_impute_missing_entries <- function(x_explain_with_NaNs,
# Create a data loader that load/iterate over the data set in chronological order.
dataloader <- torch::dataloader(dataset = dataset, batch_size = batch_size, shuffle = FALSE)
- if (verbose == 2) message("Generating the MC samples.")
-
# Create an auxiliary list of lists to store the imputed values combined with the original values. The structure is
# [[i'th MC sample]][[b'th batch]], where the entries are tensors of dimension batch_size x n_features.
- results <- lapply(seq(n_samples), function(k) list())
+ results <- lapply(seq(n_MC_samples), function(k) list())
# Generate the conditional Monte Carlo samples for the observation `x_explain_with_NaNs`, one batch at the time.
coro::loop(for (batch in dataloader) {
@@ -1079,10 +1085,14 @@ vaeac_impute_missing_entries <- function(x_explain_with_NaNs,
# Do not need to keep track of the gradients, as we are not fitting the model.
torch::with_no_grad({
# Compute the distribution parameters for the generative models inferred by the masked encoder and decoder.
- # This is a tensor of shape [batch_size, n_samples, n_generative_parameters]. Note that, for only continuous
+ # This is a tensor of shape [batch_size, n_MC_samples, n_generative_parameters]. Note that, for only continuous
# features we have that n_generative_parameters = 2*n_features, but for categorical data the number depends
# on the number of categories.
- samples_params <- vaeac_model$generate_samples_params(batch = batch_extended, mask = mask_extended, K = n_samples)
+ samples_params <- vaeac_model$generate_samples_params(
+ batch = batch_extended,
+ mask = mask_extended,
+ K = n_MC_samples
+ )
# Remove the parameters belonging to added instances in batch_extended.
samples_params <- samples_params[1:batch$shape[1], , ]
@@ -1094,7 +1104,7 @@ vaeac_impute_missing_entries <- function(x_explain_with_NaNs,
batch_zeroed_nans[mask] <- 0
# Iterate over the number of imputations and generate the imputed samples
- for (i in seq(n_samples)) {
+ for (i in seq(n_MC_samples)) {
# Extract the i'th inferred generative parameters for the whole batch.
# sample_params is a tensor of shape [batch_size, n_generative_parameters].
sample_params <- samples_params[, i, ]
@@ -1110,24 +1120,26 @@ vaeac_impute_missing_entries <- function(x_explain_with_NaNs,
# Make a deep copy and add it to correct location in the results list.
results[[i]] <- append(results[[i]], sample$clone()$detach()$cpu())
- } # End of iterating over the n_samples
+ } # End of iterating over the n_MC_samples
}) # End of iterating over the batches. Done imputing.
- if (verbose == 2) message("Postprocessing the Monte Carlo samples.")
-
- # Order the MC samples into a tensor of shape [nrow(x_explain_with_NaNs), n_samples, n_features]. The lapply function
+ # Order the MC samples into a tensor of shape [nrow(x_explain_with_NaNs), n_MC_samples, n_features].
+ # The lapply function
# creates a list of tensors of shape [nrow(x_explain_with_NaNs), 1, n_features] by concatenating the batches for the
# i'th MC sample to a tensor of shape [nrow(x_explain_with_NaNs), n_features] and then add unsqueeze to add a new
# singleton dimension as the second dimension to get the shape [nrow(x_explain_with_NaNs), 1, n_features]. Then
- # outside of the lapply function, we concatenate the n_samples torch elements to form a final torch result of shape
- # [nrow(x_explain_with_NaNs), n_samples, n_features].
- result <- torch::torch_cat(lapply(seq(n_samples), function(i) torch::torch_cat(results[[i]])$unsqueeze(2)), dim = 2)
+ # outside of the lapply function, we concatenate the n_MC_samples torch elements to form a final torch result of shape
+ # [nrow(x_explain_with_NaNs), n_MC_samples, n_features].
+ result <- torch::torch_cat(lapply(
+ seq(n_MC_samples),
+ function(i) torch::torch_cat(results[[i]])$unsqueeze(2)
+ ), dim = 2)
# Get back to the original distribution by undoing the normalization by multiplying with the std and adding the mean
result <- result * checkpoint$norm_std + checkpoint$norm_mean
- # Convert from a tensor of shape [nrow(x_explain_with_NaNs), n_samples, n_features]
- # to a matrix of shape [(nrow(x_explain_with_NaNs) * n_samples), n_features].
+ # Convert from a tensor of shape [nrow(x_explain_with_NaNs), n_MC_samples, n_features]
+ # to a matrix of shape [(nrow(x_explain_with_NaNs) * n_MC_samples), n_features].
result <- data.table::as.data.table(as.matrix(result$view(c(
result$shape[1] * result$shape[2],
result$shape[3]
@@ -1138,15 +1150,15 @@ vaeac_impute_missing_entries <- function(x_explain_with_NaNs,
# If user provide `index_features`, then we add columns needed for shapr computations
if (!is.null(index_features)) {
- # Add id, id_combination and weights (uniform for the `vaeac` approach) to the result.
- result[, c("id", "id_combination", "w") := list(
- rep(x = seq(n_explain), each = length(index_features) * n_samples),
- rep(x = index_features, each = n_samples, times = n_explain),
- 1 / n_samples
+ # Add id, id_coalition and weights (uniform for the `vaeac` approach) to the result.
+ result[, c("id", "id_coalition", "w") := list(
+ rep(x = seq(n_explain), each = length(index_features) * n_MC_samples),
+ rep(x = index_features, each = n_MC_samples, times = n_explain),
+ 1 / n_MC_samples
)]
# Set the key in the data table
- data.table::setkeyv(result, c("id", "id_combination"))
+ data.table::setkeyv(result, c("id", "id_coalition"))
}
return(result)
@@ -1364,19 +1376,6 @@ vaeac_check_mask_gen <- function(mask_gen_coalitions, mask_gen_coalitions_prob,
}
}
-#' Function that checks the verbose parameter
-#'
-#' @inheritParams vaeac_train_model
-#'
-#' @return The function does not return anything.
-#'
-#' @keywords internal
-#' @author Lars Henry Berge Olsen
-vaeac_check_verbose <- function(verbose) {
- if (!is.numeric(verbose) || !(verbose %in% c(0, 1, 2))) {
- stop("`vaeac.verbose` must be either `0` (no verbosity), `1` (low verbosity), or `2` (high verbosity).")
- }
-}
#' Function that checks that the save folder exists and for a valid file name
#'
@@ -1529,7 +1528,7 @@ vaeac_check_parameters <- function(x_train,
seed,
...) {
# Check verbose parameter
- vaeac_check_verbose(verbose = verbose)
+ check_verbose(verbose = verbose)
# Check that the activation function is valid torch::nn_module object
vaeac_check_activation_func(activation_function = activation_function)
@@ -1655,9 +1654,10 @@ vaeac_check_parameters <- function(x_train,
#' during the training of the vaeac model. Used in [torch::dataloader()].
#' @param vaeac.batch_size_sampling Positive integer (default is `NULL`) The number of samples to include in
#' each batch when generating the Monte Carlo samples. If `NULL`, then the function generates the Monte Carlo samples
-#' for the provided coalitions/combinations and all explicands sent to [shapr::explain()] at the time.
-#' The number of coalitions are determined by `n_batches` in [shapr::explain()]. We recommend to tweak `n_batches`
-#' rather than `vaeac.batch_size_sampling`. Larger batch sizes are often much faster provided sufficient memory.
+#' for the provided coalitions and all explicands sent to [shapr::explain()] at once.
+#' The number of coalitions is determined by the `n_batches` used by [shapr::explain()]. We recommend tweaking
+#' `adaptive_arguments$max_batch_size` and `adaptive_arguments$min_n_batches`
+#' rather than `vaeac.batch_size_sampling`. Larger batch sizes are often much faster provided sufficient memory.
#' @param vaeac.running_avg_n_values Positive integer (default is `5`). The number of previous IWAE values to include
#' when we compute the running means of the IWAE criterion.
#' @param vaeac.skip_conn_layer Logical (default is `TRUE`). If `TRUE`, we apply identity skip connections in each
@@ -1682,8 +1682,8 @@ vaeac_check_parameters <- function(x_train,
#' `vaeac.mask_gen_coalitions` is specified.
#' @param vaeac.mask_gen_coalitions Matrix (default is `NULL`). Matrix containing the coalitions that the
#' `vaeac` model will be trained on, see [shapr::specified_masks_mask_generator()]. This parameter is used internally
-#' in `shapr` when we only consider a subset of coalitions/combinations, i.e., when
-#' `n_combinations` \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
+#' in `shapr` when we only consider a subset of coalitions, i.e., when
+#' `n_coalitions` \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
#' when `group` is specified in [shapr::explain()].
#' @param vaeac.mask_gen_coalitions_prob Numeric array (default is `NULL`). Array of length equal to the height
#' of `vaeac.mask_gen_coalitions` containing the probabilities of sampling the corresponding coalitions in
@@ -1817,8 +1817,8 @@ vaeac_get_mask_generator_name <- function(mask_gen_coalitions,
mask_generator_name <- "specified_masks_mask_generator"
# Small printout
- if (verbose == 2) {
- message(paste0("Using 'specified_masks_mask_generator' with '", nrow(mask_gen_coalitions), "' coalitions."))
+ if ("vS_details" %in% verbose) {
+ cli::cli_text(paste0("Using 'specified_masks_mask_generator' with '", nrow(mask_gen_coalitions), "' coalitions."))
}
} else if (length(masking_ratio) == 1) {
# We are going to use 'mcar_mask_generator' as masking_ratio is a singleton.
@@ -1826,15 +1826,21 @@ vaeac_get_mask_generator_name <- function(mask_gen_coalitions,
mask_generator_name <- "mcar_mask_generator"
# Small printout
- if (verbose == 2) message(paste0("Using 'mcar_mask_generator' with 'masking_ratio = ", masking_ratio, "'."))
+ if ("vS_details" %in% verbose) {
+ cli::cli_text(paste0(
+ "Using 'mcar_mask_generator' with 'masking_ratio = ",
+ masking_ratio,
+ "'."
+ ))
+ }
} else if (length(masking_ratio) > 1) {
# We are going to use 'specified_prob_mask_generator' as masking_ratio is a vector (of same length as ncol(x_train).
# I.e., masking_ratio[5] specifies the probability of masking 5 features
mask_generator_name <- "specified_prob_mask_generator"
# We have an array of masking ratios. Then we are using the specified_prob_mask_generator.
- if (verbose == 2) {
- message(paste0(
+ if ("vS_details" %in% verbose) {
+ cli::cli_text(paste0(
"Using 'specified_prob_mask_generator' mask generator with 'masking_ratio = [",
paste(masking_ratio, collapse = ", "), "]'."
))
@@ -2104,10 +2110,12 @@ vaeac_get_data_objects <- function(x_train,
# Ensure a valid batch size
if (batch_size > length(train_indices)) {
- message(paste0(
- "Decrease `batch_size` (", batch_size, ") to largest allowed value (", length(train_indices), "), ",
- "i.e., the number of training observations."
- ))
+ if ("vS_details" %in% verbose) {
+ cli::cli_text(paste0(
+ "Decrease `batch_size` (", batch_size, ") to largest allowed value (", length(train_indices), "), ",
+ "i.e., the number of training observations."
+ ))
+ }
batch_size <- length(train_indices)
}
@@ -2429,17 +2437,31 @@ Last epoch: %d. \tVLB = %.3f \tIWAE = %.3f \tIWAE_running = %.3f\n",
last_state$val_iwae[-1]$cpu(),
last_state$val_iwae_running[-1]$cpu()
))
-}
-#' Produce message about which batch prepare_data is working on
-#' @inheritParams default_doc
-#' @inheritParams default_doc_explain
-#' @author Lars Henry Berge Olsen
-#' @keywords internal
-vaeac_prep_message_batch <- function(internal, index_features) {
- id_batch <- internal$objects$X[id_combination == index_features[1]]$batch
- n_batches <- internal$parameters$n_batches
- message(paste0("Generating Monte Carlo samples using `vaeac` for batch ", id_batch, " of ", n_batches, "."))
}
# Plot functions =======================================================================================================
@@ -2501,7 +2523,7 @@ vaeac_prep_message_batch <- function(internal, index_features) {
#' x_train = x_train,
#' approach = approach,
#' prediction_zero = p0,
-#' n_samples = 1, # As we are only interested in the training of the vaeac
+#' n_MC_samples = 1, # As we are only interested in the training of the vaeac
#' vaeac.epochs = 10, # Should be higher in applications.
#' vaeac.n_vaeacs_initialize = 1,
#' vaeac.width = 16,
@@ -2515,7 +2537,7 @@ vaeac_prep_message_batch <- function(internal, index_features) {
#' x_train = x_train,
#' approach = approach,
#' prediction_zero = p0,
-#' n_samples = 1, # As we are only interested in the training of the vaeac
+#' n_MC_samples = 1, # As we are only interested in the training of the vaeac
#' vaeac.epochs = 10, # Should be higher in applications.
#' vaeac.width = 16,
#' vaeac.depth = 2,
@@ -2736,7 +2758,7 @@ vaeac_plot_eval_crit <- function(explanation_list,
#' x_train = x_train,
#' approach = "vaeac",
#' prediction_zero = mean(y_train),
-#' n_samples = 1,
+#' n_MC_samples = 1,
#' vaeac.epochs = 10,
#' vaeac.n_vaeacs_initialize = 1
#' )
@@ -2815,7 +2837,7 @@ vaeac_plot_imputed_ggpairs <- function(
checkpoint <- torch::torch_load(vaeac_model_path)
# Get the number of observations in the x_true and features
- n_samples <- if (is.null(x_true)) 500 else nrow(x_true)
+ n_MC_samples <- if (is.null(x_true)) 500 else nrow(x_true)
n_features <- checkpoint$n_features
# Checking for valid dimension
@@ -2830,12 +2852,12 @@ vaeac_plot_imputed_ggpairs <- function(
# Impute the missing entries using the vaeac approach. Here we generate x from p(x), so no conditioning.
imputed_values <- vaeac_impute_missing_entries(
- x_explain_with_NaNs = matrix(NaN, n_samples, checkpoint$n_features),
- n_samples = 1,
+ x_explain_with_NaNs = matrix(NaN, n_MC_samples, checkpoint$n_features),
+ n_MC_samples = 1,
vaeac_model = vaeac_model,
checkpoint = checkpoint,
sampler = explanation$internal$parameters$vaeac.sampler,
- batch_size = n_samples,
+ batch_size = n_MC_samples,
verbose = explanation$internal$parameters$verbose,
seed = explanation$internal$parameters$seed
)
@@ -2847,7 +2869,7 @@ vaeac_plot_imputed_ggpairs <- function(
# Add type variable representing if they are imputed samples or from `x_true`
combined_data$type <-
- factor(rep(c("True", "Imputed"), times = c(ifelse(is.null(nrow(x_true)), 0, nrow(x_true)), n_samples)))
+ factor(rep(c("True", "Imputed"), times = c(ifelse(is.null(nrow(x_true)), 0, nrow(x_true)), n_MC_samples)))
# Create the ggpairs figure and potentially add title based on the description of the used vaeac model
figure <- GGally::ggpairs(
diff --git a/R/approach_vaeac_torch_modules.R b/R/approach_vaeac_torch_modules.R
index e353327ab..da0118f94 100644
--- a/R/approach_vaeac_torch_modules.R
+++ b/R/approach_vaeac_torch_modules.R
@@ -1525,7 +1525,7 @@ gauss_cat_sampler_most_likely <- function(one_hot_max_sizes, min_sigma = 1e-4, m
distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params
col_sample <- torch::torch_argmax(distr$probs, -1)[, NULL]$to(dtype = torch::torch_float()) # Most lik class
}
- sample <- append(sample, col_sample) # Add the vector of sampled values for the i´th feature to the sample list
+ sample <- append(sample, col_sample) # Add the vector of sampled values for the i-th feature to the sample list
}
return(torch::torch_cat(sample, -1)) # Create a 2D torch by column binding the vectors in the list
}
@@ -1587,7 +1587,7 @@ gauss_cat_sampler_random <- function(one_hot_max_sizes, min_sigma = 1e-4, min_pr
distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params
col_sample <- distr$sample()$unsqueeze(-1)$to(dtype = torch::torch_float()) # Sample class using class prob
}
- sample <- append(sample, col_sample) # Add the vector of sampled values for the i´th feature to the sample list
+ sample <- append(sample, col_sample) # Add the vector of sampled values for the i-th feature to the sample list
}
return(torch::torch_cat(sample, -1)) # Create a 2D torch by column binding the vectors in the list
}
@@ -1656,7 +1656,7 @@ gauss_cat_parameters <- function(one_hot_max_sizes, min_sigma = 1e-4, min_prob =
distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params
current_parameters <- distr$probs # Extract the current probabilities for each classs
}
- parameters <- append(parameters, current_parameters) # Add the i´th feature's parameters to the parameters list
+ parameters <- append(parameters, current_parameters) # Add the i-th feature's parameters to the parameters list
}
return(torch::torch_cat(parameters, -1)) # Create a 2D torch_tensor by column binding the tensors in the list
}
@@ -1821,7 +1821,7 @@ categorical_to_one_hot_layer <- function(one_hot_max_sizes, add_nans_map_for_col
# ONLY FOR CONTINUOUS FEATURES: out_cols now is a list of n_features tensors of shape n x size = n x 1 for
# continuous variables. We concatenate them to a matrix of dim n x 2*n_features (in cont case) for prior net, but
# for proposal net, it is n x 3*n_features, and they take the form
- # [batch1, is.nan1, batch2, is.nan2, …, batch12, is.nan12, mask1, mask2, …, mask12]
+ # [batch1, is.nan1, batch2, is.nan2, ..., batch12, is.nan12, mask1, mask2, ..., mask12]
return(out_cols)
}
)
diff --git a/R/check_convergence.R b/R/check_convergence.R
new file mode 100644
index 000000000..2f2ccc122
--- /dev/null
+++ b/R/check_convergence.R
@@ -0,0 +1,82 @@
+#' Checks convergence according to the convergence threshold
+#'
+#' @inheritParams default_doc_explain
+#'
+#' @export
+#' @keywords internal
+check_convergence <- function(internal) {
+ iter <- length(internal$iter_list)
+
+ convergence_tolerance <- internal$parameters$adaptive_arguments$convergence_tolerance
+ max_iter <- internal$parameters$adaptive_arguments$max_iter
+ max_n_coalitions <- internal$parameters$adaptive_arguments$max_n_coalitions
+ paired_shap_sampling <- internal$parameters$paired_shap_sampling
+ n_shapley_values <- internal$parameters$n_shapley_values
+
+ exact <- internal$iter_list[[iter]]$exact
+
+ dt_shapley_est <- internal$iter_list[[iter]]$dt_shapley_est
+ dt_shapley_sd <- internal$iter_list[[iter]]$dt_shapley_sd
+
+ n_sampled_coalitions <- internal$iter_list[[iter]]$n_coalitions - 2 # Subtract the zero and full predictions
+
+ max_sd <- dt_shapley_sd[, max(.SD), .SDcols = -1, by = .I]$V1 # Max per prediction
+ max_sd0 <- max_sd * sqrt(n_sampled_coalitions) # Scale the sd up by sqrt(n), as the sd of the estimates shrinks at this rate
+
+ dt_shapley_est0 <- copy(dt_shapley_est)
+
+ est_required_coals_per_ex_id <- est_required_coalitions <- est_remaining_coalitions <- overall_conv_measure <- NA
+
+ if (isTRUE(exact)) {
+ converged_exact <- TRUE
+ converged_sd <- FALSE
+ } else {
+ converged_exact <- FALSE
+ if (!is.null(convergence_tolerance)) {
+ dt_shapley_est0[, maxval := max(.SD), .SDcols = -c(1, 2), by = .I]
+ dt_shapley_est0[, minval := min(.SD), .SDcols = -c(1, 2), by = .I]
+ dt_shapley_est0[, max_sd0 := max_sd0]
+ dt_shapley_est0[, req_samples := (max_sd0 / ((maxval - minval) * convergence_tolerance))^2]
+ dt_shapley_est0[, conv_measure := max_sd0 / ((maxval - minval) * sqrt(n_sampled_coalitions))]
+ dt_shapley_est0[, req_samples := pmin(req_samples, 2^n_shapley_values - 2)] # pmin to cap each explicand separately
+
+ est_required_coalitions <- ceiling(dt_shapley_est0[, median(req_samples)]) # TODO: Consider other ways to do this
+ if (isTRUE(paired_shap_sampling)) {
+ est_required_coalitions <- ceiling(est_required_coalitions * 0.5) * 2
+ }
+ est_remaining_coalitions <- max(0, est_required_coalitions - (n_sampled_coalitions + 2))
+
+ overall_conv_measure <- dt_shapley_est0[, median(conv_measure)] # TODO: Consider other ways to do this
+
+ converged_sd <- (est_remaining_coalitions == 0)
+
+ est_required_coals_per_ex_id <- dt_shapley_est0[, req_samples]
+ names(est_required_coals_per_ex_id) <- paste0(
+ "req_samples_explain_id_",
+ seq_along(est_required_coals_per_ex_id)
+ )
+ } else {
+ converged_sd <- FALSE
+ }
+ }
+
+ converged_max_n_coalitions <- (n_sampled_coalitions + 2 >= max_n_coalitions)
+
+ converged_max_iter <- (iter >= max_iter)
+
+ converged <- converged_exact || converged_sd || converged_max_iter || converged_max_n_coalitions
+
+ internal$iter_list[[iter]]$converged <- converged
+ internal$iter_list[[iter]]$converged_exact <- converged_exact
+ internal$iter_list[[iter]]$converged_sd <- converged_sd
+ internal$iter_list[[iter]]$converged_max_iter <- converged_max_iter
+ internal$iter_list[[iter]]$converged_max_n_coalitions <- converged_max_n_coalitions
+ internal$iter_list[[iter]]$est_required_coalitions <- est_required_coalitions
+ internal$iter_list[[iter]]$est_remaining_coalitions <- est_remaining_coalitions
+ internal$iter_list[[iter]]$est_required_coals_per_ex_id <- as.list(est_required_coals_per_ex_id)
+ internal$iter_list[[iter]]$overall_conv_measure <- overall_conv_measure
+
+ internal$timing_list$check_convergence <- Sys.time()
+
+ return(internal)
+}
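+
+# Worked example of the stopping rule above (illustrative numbers only, not part of the API):
+# with max_sd = 0.05, n_sampled_coalitions = 30, a Shapley value range (maxval - minval) of 2,
+# and convergence_tolerance = 0.02, we get max_sd0 = 0.05 * sqrt(30) ~ 0.274 and
+# req_samples = (0.274 / (2 * 0.02))^2 ~ 47, so est_remaining_coalitions = max(0, 47 - (30 + 2)) = 15
+# and the procedure continues (with paired_shap_sampling, 47 is first rounded up to 48).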
diff --git a/R/cli.R b/R/cli.R
new file mode 100644
index 000000000..ba4d70c57
--- /dev/null
+++ b/R/cli.R
@@ -0,0 +1,71 @@
+cli_startup <- function(internal, model, verbose) {
+ init_time <- internal$timing_list$init_time
+
+ is_groupwise <- internal$parameters$is_groupwise
+ approach <- internal$parameters$approach
+ adaptive <- internal$parameters$adaptive
+ n_shapley_values <- internal$parameters$n_shapley_values
+ n_explain <- internal$parameters$n_explain
+ saving_path <- internal$parameters$adaptive_arguments$saving_path
+
+ feat_group_txt <- ifelse(is_groupwise, "group-wise", "feature-wise")
+ adaptive_txt <- ifelse(adaptive, "adaptive", "non-adaptive")
+
+ testing <- internal$parameters$testing
+
+
+ line_vec <- "Model class: {.cls {class(model)}}"
+ line_vec <- c(line_vec, "Approach: {.emph {approach}}")
+ line_vec <- c(line_vec, "Adaptive estimation: {.emph {adaptive}}")
+ line_vec <- c(line_vec, "Number of {.emph {feat_group_txt}} Shapley values: {n_shapley_values}")
+ line_vec <- c(line_vec, "Number of observations to explain: {n_explain}")
+ if (isFALSE(testing)) {
+ line_vec <- c(line_vec, "Computations (temporary) saved at: {.path {saving_path}}")
+ }
+
+
+ if ("basic" %in% verbose) {
+ if (isFALSE(testing)) {
+ cli::cli_h1("Starting {.fn shapr::explain} at {round(init_time)}")
+ }
+ cli::cli_ul(line_vec)
+ }
+
+ if ("vS_details" %in% verbose) {
+ if (any(c("regression_surrogate", "regression_separate") %in% approach)) {
+ reg_desc <- paste0(capture.output(internal$parameters$regression.model), collapse = "\n")
+ cli::cli_h3("Additional details about the regression model")
+ cli::cli_text(reg_desc)
+ }
+ }
+
+ if ("basic" %in% verbose) {
+ if (isTRUE(adaptive)) {
+ msg <- "Adaptive computation started"
+ } else {
+ msg <- "Main computation started"
+ }
+ cli::cli_h2(cli::col_blue(msg))
+ }
+}
+
+
+cli_iter <- function(verbose, internal, iter) {
+ adaptive <- internal$parameters$adaptive
+
+ if (!is.null(verbose) && isTRUE(adaptive)) {
+ cli::cli_h1("Iteration {iter}")
+ }
+
+ if ("basic" %in% verbose) {
+ new_coal <- internal$iter_list[[iter]]$new_n_coalitions
+ tot_coal <- internal$iter_list[[iter]]$n_coalitions
+ all_coal <- 2^internal$parameters$n_shapley_values
+
+ extra_msg <- ifelse(adaptive, ", {new_coal} new", "")
+
+ msg <- paste0("Using {tot_coal} of {all_coal} coalitions", extra_msg, ". ")
+
+ cli::cli_alert_info(msg)
+ }
+}
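+
+# Note on the cli markup above: the inner braces glue-interpolate R objects, while the
+# .cls, .emph, .path and .fn prefixes apply cli's inline styling. For example,
+# "Model class: {.cls {class(model)}}" first evaluates class(model) and then renders
+# the result styled as a class name.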
diff --git a/R/compute_estimates.R b/R/compute_estimates.R
new file mode 100644
index 000000000..b0a16db88
--- /dev/null
+++ b/R/compute_estimates.R
@@ -0,0 +1,369 @@
+#' Computes the Shapley values and their standard deviation given the `v(S)`
+#'
+#' @inheritParams default_doc_explain
+#' @param vS_list List.
+#' Output from [compute_vS()].
+#'
+#' @export
+#' @keywords internal
+compute_estimates <- function(internal, vS_list) {
+ verbose <- internal$parameters$verbose
+ cli_id <- internal$parameters$cli_id
+
+ internal$timing_list$compute_vS <- Sys.time()
+
+
+ iter <- length(internal$iter_list)
+ compute_sd <- internal$iter_list[[iter]]$compute_sd
+
+ n_boot_samps <- internal$parameters$adaptive_arguments$n_boot_samps
+
+ processed_vS_list <- postprocess_vS_list(
+ vS_list = vS_list,
+ internal = internal
+ )
+
+ internal$timing_list$postprocess_vS <- Sys.time()
+
+
+ if ("progress" %in% verbose) {
+ cli::cli_progress_step("Computing Shapley value estimates")
+ }
+
+ # Compute the Shapley values
+ dt_shapley_est <- compute_shapley_new(internal, processed_vS_list$dt_vS)
+
+ internal$timing_list$compute_shapley <- Sys.time()
+
+ if (compute_sd) {
+ if ("progress" %in% verbose) {
+ cli::cli_progress_step("Boostrapping Shapley value sds")
+ }
+
+ dt_shapley_sd <- bootstrap_shapley_new(internal, n_boot_samps = n_boot_samps, processed_vS_list$dt_vS)
+
+ internal$timing_list$compute_bootstrap <- Sys.time()
+ } else {
+ dt_shapley_sd <- dt_shapley_est * 0
+ }
+
+
+
+ # Adding explain_id to the output dt
+ dt_shapley_est[, explain_id := .I]
+ setcolorder(dt_shapley_est, "explain_id")
+ dt_shapley_sd[, explain_id := .I]
+ setcolorder(dt_shapley_sd, "explain_id")
+
+
+ internal$iter_list[[iter]]$dt_shapley_est <- dt_shapley_est
+ internal$iter_list[[iter]]$dt_shapley_sd <- dt_shapley_sd
+ internal$iter_list[[iter]]$vS_list <- vS_list
+ internal$iter_list[[iter]]$dt_vS <- processed_vS_list$dt_vS
+
+ # internal$timing$shapley_computation <- Sys.time()
+
+ # Clearing out the tmp list with model and predict_model (only added for AICc-types of empirical approach)
+ internal$output <- processed_vS_list
+
+ if ("basic" %in% verbose) {
+ cli::cli_progress_done()
+ }
+
+ return(internal)
+}
+
+#' @keywords internal
+postprocess_vS_list <- function(vS_list, internal) {
+ keep_samp_for_vS <- internal$parameters$keep_samp_for_vS
+ prediction_zero <- internal$parameters$prediction_zero
+ n_explain <- internal$parameters$n_explain
+
+ # Appending the zero-prediction to the list
+ dt_vS0 <- as.data.table(rbind(c(1, rep(prediction_zero, n_explain))))
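+ # (id_coalition = 1 is the empty coalition, so its v(S) is simply prediction_zero
+ # repeated for every explicand.)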
+
+ # Extracting/merging the data tables from the batch running
+ # TODO: Need a memory- and speed-optimized way to transform the output from dt_vS_list to two different lists,
+ # i.e., without copying the data more than once. For now, run_batch has been modified such that
+ # there is only one copy if keep_samp_for_vS = FALSE, and two if keep_samp_for_vS = TRUE.
+ # This might be OK since the latter is rarely used.
+ if (keep_samp_for_vS) {
+ names(dt_vS0) <- names(vS_list[[1]][[1]])
+
+ vS_list[[length(vS_list) + 1]] <- list(dt_vS0, NULL)
+
+ dt_vS <- rbindlist(lapply(vS_list, `[[`, 1))
+
+ dt_samp_for_vS <- rbindlist(lapply(vS_list, `[[`, 2), use.names = TRUE)
+
+ data.table::setorder(dt_samp_for_vS, id_coalition)
+ } else {
+ names(dt_vS0) <- names(vS_list[[1]])
+
+ vS_list[[length(vS_list) + 1]] <- dt_vS0
+
+ dt_vS <- rbindlist(vS_list)
+ dt_samp_for_vS <- NULL
+ }
+
+ data.table::setorder(dt_vS, id_coalition)
+
+ dt_vS <- unique(dt_vS, by = "id_coalition") # To remove duplicated full pred row in the iterative procedure
+
+ output <- list(
+ dt_vS = dt_vS,
+ dt_samp_for_vS = dt_samp_for_vS
+ )
+ return(output)
+}
+
+
+#' Compute shapley values
+#' @param dt_vS The contribution matrix.
+#'
+#' @inheritParams default_doc
+#'
+#' @return A `data.table` with Shapley values for each test observation.
+#' @export
+#' @keywords internal
+compute_shapley_new <- function(internal, dt_vS) {
+ is_groupwise <- internal$parameters$is_groupwise
+ type <- internal$parameters$type
+
+ iter <- length(internal$iter_list)
+
+ W <- internal$iter_list[[iter]]$W
+
+ shap_names <- internal$parameters$shap_names
+
+ # If multiple horizons with explain_forecast are used, we only distribute value to those used at each horizon
+ if (type == "forecast") {
+ id_coalition_mapper_dt <- internal$objects$id_coalition_mapper_dt
+ horizon <- internal$parameters$horizon
+ cols_per_horizon <- internal$objects$cols_per_horizon
+ W_list <- internal$objects$W_list
+
+ kshap_list <- list()
+ for (i in seq_len(horizon)) {
+ W0 <- W_list[[i]]
+
+ dt_vS0 <- merge(dt_vS, id_coalition_mapper_dt[horizon == i], by = "id_coalition", all.y = TRUE)
+ data.table::setorder(dt_vS0, horizon_id_coalition)
+ these_vS0_cols <- grep(paste0("p_hat", i, "_"), names(dt_vS0))
+
+ kshap0 <- t(W0 %*% as.matrix(dt_vS0[, these_vS0_cols, with = FALSE]))
+ kshap_list[[i]] <- data.table::as.data.table(kshap0)
+
+ if (!is_groupwise) {
+ names(kshap_list[[i]]) <- c("none", cols_per_horizon[[i]])
+ } else {
+ names(kshap_list[[i]]) <- c("none", shap_names)
+ }
+ }
+
+ dt_kshap <- cbind(internal$parameters$output_labels, rbindlist(kshap_list, fill = TRUE))
+ } else {
+ kshap <- t(W %*% as.matrix(dt_vS[, -"id_coalition"]))
+ dt_kshap <- data.table::as.data.table(kshap)
+ colnames(dt_kshap) <- c("none", shap_names)
+ }
+
+ return(dt_kshap)
+}
+
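+# Dimensional sketch of the non-forecast branch above (assuming weight_matrix() returns an
+# (n_shapley_values + 1) x n_coalitions matrix W): dt_vS without its id_coalition column is
+# n_coalitions x n_explain, so W %*% as.matrix(...) is (n_shapley_values + 1) x n_explain, and
+# the transpose gives one row per explicand holding the `none` column plus one Shapley value
+# per feature/group.
+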
+bootstrap_shapley <- function(internal, dt_vS, n_boot_samps = 100, seed = 123) {
+ iter <- length(internal$iter_list)
+
+ X <- internal$iter_list[[iter]]$X
+
+ set.seed(seed)
+
+ X_org <- copy(X)
+ n_explain <- internal$parameters$n_explain
+ n_features <- internal$parameters$n_features
+ shap_names <- internal$parameters$shap_names
+ paired_shap_sampling <- internal$parameters$paired_shap_sampling
+ shapley_reweight <- internal$parameters$shapley_reweighting
+
+ boot_sd_array <- array(NA, dim = c(n_explain, n_features + 1, n_boot_samps))
+
+ X_keep <- X_org[c(1, .N), .(id_coalition, features, n_features, N, shapley_weight)]
+ X_samp <- X_org[-c(1, .N), .(id_coalition, features, n_features, N, shapley_weight, sample_freq)]
+ X_samp[, features_tmp := sapply(features, paste, collapse = " ")]
+
+ n_coalitions_boot <- X_samp[, sum(sample_freq)]
+
+ for (i in seq_len(n_boot_samps)) {
+ if (paired_shap_sampling) {
+ # Sample with replacement
+ X_boot00 <- X_samp[
+ sample.int(
+ n = .N,
+ size = ceiling(n_coalitions_boot / 2),
+ replace = TRUE,
+ prob = sample_freq
+ ),
+ .(id_coalition, features, n_features, N)
+ ]
+
+ X_boot00[, features_tmp := sapply(features, paste, collapse = " ")]
+      # Not sure why the next two lines must be done in two steps, but it does not work otherwise
+ boot_features_dup <- lapply(X_boot00$features, function(x) seq(n_features)[-x])
+ X_boot00[, features_dup := boot_features_dup]
+ X_boot00[, features_dup_tmp := sapply(features_dup, paste, collapse = " ")]
+
+ # Extract the paired coalitions from X_samp
+ X_boot00_paired <- merge(X_boot00[, .(features_dup_tmp)],
+ X_samp[, .(id_coalition, features, n_features, N, features_tmp)],
+ by.x = "features_dup_tmp", by.y = "features_tmp"
+ )
+ X_boot0 <- rbind(
+ X_boot00[, .(id_coalition, features, n_features, N)],
+ X_boot00_paired[, .(id_coalition, features, n_features, N)]
+ )
+ } else {
+ X_boot0 <- X_samp[
+ sample.int(
+ n = .N,
+ size = n_coalitions_boot,
+ replace = TRUE,
+ prob = sample_freq
+ ),
+ .(id_coalition, features, n_features, N)
+ ]
+ }
+
+
+ X_boot0[, shapley_weight := .N / n_coalitions_boot, by = "id_coalition"]
+ X_boot0 <- unique(X_boot0, by = "id_coalition")
+
+ X_boot <- rbind(X_keep, X_boot0)
+ data.table::setorder(X_boot, id_coalition)
+
+ shapley_reweighting(X_boot, reweight = shapley_reweight) # reweights the shapley weights by reference
+
+ W_boot <- shapr::weight_matrix(
+ X = X_boot,
+ normalize_W_weights = TRUE,
+ is_groupwise = FALSE
+ )
+
+ kshap_boot <- t(W_boot %*% as.matrix(dt_vS[id_coalition %in% X_boot[, id_coalition], -"id_coalition"]))
+
+ boot_sd_array[, , i] <- copy(kshap_boot)
+ }
+
+ std_dev_mat <- apply(boot_sd_array, c(1, 2), sd)
+
+ dt_kshap_boot_sd <- data.table::as.data.table(std_dev_mat)
+ colnames(dt_kshap_boot_sd) <- c("none", shap_names)
+
+ return(dt_kshap_boot_sd)
+}
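+
+# The idea in a nutshell (sketch only, with a hypothetical helper): each bootstrap
+# replicate resamples the non-deterministic coalitions with probabilities proportional
+# to sample_freq, rebuilds the kernelSHAP weight matrix, and re-estimates the Shapley
+# values; the standard deviation over the replicates then quantifies the uncertainty
+# stemming from the coalition sampling:
+#   boot <- replicate(100, shapley_from_resampled_coalitions()) # hypothetical helper
+#   apply(boot, c(1, 2), sd) # explicand x (none + features) matrix of standard deviations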
+
+bootstrap_shapley_new <- function(internal, dt_vS, n_boot_samps = 100, seed = 123) {
+ iter <- length(internal$iter_list)
+
+ X <- internal$iter_list[[iter]]$X
+
+ set.seed(seed)
+
+ is_groupwise <- internal$parameters$is_groupwise
+
+ n_explain <- internal$parameters$n_explain
+ paired_shap_sampling <- internal$parameters$paired_shap_sampling
+ shapley_reweight <- internal$parameters$shapley_reweighting
+ shap_names <- internal$parameters$shap_names
+ n_shapley_values <- internal$parameters$n_shapley_values
+
+
+ X_org <- copy(X)
+
+ boot_sd_array <- array(NA, dim = c(n_explain, n_shapley_values + 1, n_boot_samps))
+
+ X_keep <- X_org[c(1, .N), .(id_coalition, coalitions, coalition_size, N)]
+ X_samp <- X_org[-c(1, .N), .(id_coalition, coalitions, coalition_size, N, shapley_weight, sample_freq)]
+ X_samp[, coalitions_tmp := sapply(coalitions, paste, collapse = " ")]
+
+ n_coalitions_boot <- X_samp[, sum(sample_freq)]
+
+ if (paired_shap_sampling) {
+ # Sample with replacement
+ X_boot00 <- X_samp[
+ sample.int(
+ n = .N,
+ size = ceiling(n_coalitions_boot * n_boot_samps / 2),
+ replace = TRUE,
+ prob = sample_freq
+ ),
+ .(id_coalition, coalitions, coalition_size, N, sample_freq)
+ ]
+
+ X_boot00[, boot_id := rep(seq(n_boot_samps), times = n_coalitions_boot / 2)]
+
+
+ X_boot00_paired <- copy(X_boot00[, .(coalitions, boot_id)])
+ X_boot00_paired[, coalitions := lapply(coalitions, function(x) seq(n_shapley_values)[-x])]
+ X_boot00_paired[, coalitions_tmp := sapply(coalitions, paste, collapse = " ")]
+
+ # Extract the paired coalitions from X_samp
+ X_boot00_paired <- merge(X_boot00_paired,
+ X_samp[, .(id_coalition, coalition_size, N, shapley_weight, coalitions_tmp)],
+ by = "coalitions_tmp"
+ )
+ X_boot0 <- rbind(
+ X_boot00[, .(boot_id, id_coalition, coalitions, coalition_size, N)],
+ X_boot00_paired[, .(boot_id, id_coalition, coalitions, coalition_size, N)]
+ )
+
+ X_boot <- rbind(X_keep[rep(1:2, each = n_boot_samps), ][, boot_id := rep(seq(n_boot_samps), times = 2)], X_boot0)
+ setkey(X_boot, boot_id, id_coalition)
+ X_boot[, sample_freq := .N / n_coalitions_boot, by = .(id_coalition, boot_id)]
+ X_boot <- unique(X_boot, by = c("id_coalition", "boot_id"))
+ X_boot[, shapley_weight := sample_freq]
+ X_boot[coalition_size %in% c(0, n_shapley_values), shapley_weight := X_org[1, shapley_weight]]
+ } else {
+ X_boot0 <- X_samp[
+ sample.int(
+ n = .N,
+ size = n_coalitions_boot * n_boot_samps,
+ replace = TRUE,
+ prob = sample_freq
+ ),
+ .(id_coalition, coalitions, coalition_size, N)
+ ]
+ X_boot <- rbind(X_keep[rep(1:2, each = n_boot_samps), ], X_boot0)
+ X_boot[, boot_id := rep(seq(n_boot_samps), times = n_coalitions_boot + 2)]
+
+ setkey(X_boot, boot_id, id_coalition)
+ X_boot[, sample_freq := .N / n_coalitions_boot, by = .(id_coalition, boot_id)]
+ X_boot <- unique(X_boot, by = c("id_coalition", "boot_id"))
+ X_boot[, shapley_weight := sample_freq]
+ X_boot[coalition_size %in% c(0, n_shapley_values), shapley_weight := X_org[1, shapley_weight]]
+ }
+
+ for (i in seq_len(n_boot_samps)) {
+ this_X <- X_boot[boot_id == i] # This is highly inefficient, but the best way to deal with the reweighting for now
+ shapley_reweighting(this_X, reweight = shapley_reweight)
+
+ W_boot <- weight_matrix(
+ X = this_X,
+ normalize_W_weights = TRUE
+ )
+
+ kshap_boot <- t(W_boot %*% as.matrix(dt_vS[id_coalition %in% X_boot[
+ boot_id == i,
+ id_coalition
+ ], -"id_coalition"]))
+
+ boot_sd_array[, , i] <- copy(kshap_boot)
+ }
+
+ std_dev_mat <- apply(boot_sd_array, c(1, 2), sd)
+
+ dt_kshap_boot_sd <- data.table::as.data.table(std_dev_mat)
+ colnames(dt_kshap_boot_sd) <- c("none", shap_names)
+
+ return(dt_kshap_boot_sd)
+}
diff --git a/R/compute_vS.R b/R/compute_vS.R
index 1c6deb190..e235810bf 100644
--- a/R/compute_vS.R
+++ b/R/compute_vS.R
@@ -1,23 +1,49 @@
#' Computes `v(S)` for all feature subsets `S`.
#'
+#' @inheritParams default_doc_explain
#' @inheritParams default_doc
-#' @inheritParams explain
#'
#' @param method Character
#' Indicates whether the lapply method (default) or loop method should be used.
+#' This is only used for testing purposes.
#'
#' @export
+#' @keywords internal
compute_vS <- function(internal, model, predict_model, method = "future") {
- S_batch <- internal$objects$S_batch
+ iter <- length(internal$iter_list)
+
+ S_batch <- internal$iter_list[[iter]]$S_batch
+
+ verbose <- internal$parameters$verbose
+ approach <- internal$parameters$approach
+
+ # verbose
+ if ("progress" %in% verbose) {
+ cli::cli_progress_step("Computing vS")
+ }
+ if ("vS_details" %in% verbose) {
+ if ("regression_separate" %in% approach) {
+ tuning <- internal$parameters$regression.tune
+ if (isTRUE(tuning)) {
+ cli::cli_h2("Extra info about the tuning of the regression model")
+ }
+ }
+ }
+
if (method == "future") {
- ret <- future_compute_vS_batch(S_batch = S_batch, internal = internal, model = model, predict_model = predict_model)
+ vS_list <- future_compute_vS_batch(
+ S_batch = S_batch,
+ internal = internal,
+ model = model,
+ predict_model = predict_model
+ )
} else {
# Doing the same as above without future, i.e. without progress bar or parallelization
- ret <- list()
+ vS_list <- list()
for (i in seq_along(S_batch)) {
S <- S_batch[[i]]
- ret[[i]] <- batch_compute_vS(
+ vS_list[[i]] <- batch_compute_vS(
S = S,
internal = internal,
model = model,
@@ -26,7 +52,40 @@ compute_vS <- function(internal, model, predict_model, method = "future") {
}
}
- return(ret)
+ #### Adds v_S output above to any vS_list already computed ####
+ ### Need to map the old id_coalitions to the new numbers for this merging to work out
+ if (iter > 1) {
+ prev_coalition_map <- internal$iter_list[[iter - 1]]$coalition_map
+ prev_vS_list <- internal$iter_list[[iter - 1]]$vS_list
+
+ current_coalition_map <- internal$iter_list[[iter]]$coalition_map
+
+
+ # Creates a mapper from the last id_coalition to the new id_coalition numbering
+ id_coalitions_mapper <- merge(prev_coalition_map,
+ current_coalition_map,
+ by = "coalitions_str",
+ suffixes = c("", "_new")
+ )
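+    # Illustration with hypothetical ids: if coalition "1 3" had id_coalition 4 in the
+    # previous iteration and id_coalition 7 in the current one, the mapper row
+    # (coalitions_str = "1 3", id_coalition = 4, id_coalition_new = 7) is used to relabel
+    # the previously computed v(S) rows before concatenating the old and new vS_list entries.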
+ prev_vS_list_new <- list()
+
+    # Applies the mapper to update the prev_vS_list to the new id_coalition numbering
+ for (k in seq_along(prev_vS_list)) {
+ prev_vS_list_new[[k]] <- merge(prev_vS_list[[k]],
+ id_coalitions_mapper[, .(id_coalition, id_coalition_new)],
+ by = "id_coalition"
+ )
+ prev_vS_list_new[[k]][, id_coalition := id_coalition_new]
+ prev_vS_list_new[[k]][, id_coalition_new := NULL]
+ }
+
+ # Merge the new vS_list with the old vS_list
+ vS_list <- c(prev_vS_list_new, vS_list)
+ }
+
+
+
+ return(vS_list)
}
future_compute_vS_batch <- function(S_batch, internal, model, predict_model) {
@@ -70,25 +129,29 @@ batch_compute_vS <- function(S, internal, model, predict_model, p = NULL) {
#' @keywords internal
#' @author Lars Henry Berge Olsen
batch_prepare_vS_regression <- function(S, internal) {
- max_id_comb <- internal$parameters$n_combinations
+ iter <- length(internal$iter_list)
+
+ X <- internal$iter_list[[iter]]$X
+
+ max_id_coal <- X[, .N]
x_explain_y_hat <- internal$data$x_explain_y_hat
# Compute the contribution functions differently based on whether the grand coalition is in S or not
- if (!(max_id_comb %in% S)) {
+ if (!(max_id_coal %in% S)) {
dt <- prepare_data(internal, index_features = S)
} else {
# Remove the grand coalition. NULL is for the special case for when the batch only includes the grand coalition.
- dt <- if (length(S) > 1) prepare_data(internal, index_features = S[S != max_id_comb]) else NULL
+ dt <- if (length(S) > 1) prepare_data(internal, index_features = S[S != max_id_coal]) else NULL
# Add the results for the grand coalition (Need to add names in case the batch only contains the grand coalition)
- dt <- rbind(dt, data.table(as.integer(max_id_comb), matrix(x_explain_y_hat, nrow = 1)), use.names = FALSE)
+ dt <- rbind(dt, data.table(as.integer(max_id_coal), matrix(x_explain_y_hat, nrow = 1)), use.names = FALSE)
# Need to add column names if batch S only contains the grand coalition
- if (length(S) == 1) setnames(dt, c("id_combination", paste0("p_hat1_", seq_len(internal$parameters$n_explain))))
+ if (length(S) == 1) setnames(dt, c("id_coalition", paste0("p_hat1_", seq_len(internal$parameters$n_explain))))
}
- # Set id_combination to be the key
- setkey(dt, id_combination)
+ # Set id_coalition to be the key
+ setkey(dt, id_coalition)
return(dt)
}
@@ -133,26 +196,30 @@ batch_prepare_vS_MC <- function(S, internal, model, predict_model) {
#' @keywords internal
batch_prepare_vS_MC_auxiliary <- function(S, internal) {
- max_id_combination <- internal$parameters$n_combinations
+ iter <- length(internal$iter_list)
+
+ X <- internal$iter_list[[iter]]$X
+
+ max_id_coalition <- X[, .N]
x_explain <- internal$data$x_explain
n_explain <- internal$parameters$n_explain
# TODO: Check what is the fastest approach to deal with the last observation.
- # Not doing this for the largest id combination (should check if this is faster or slower, actually)
+ # Not doing this for the largest id_coalition (should check if this is faster or slower, actually)
# An alternative would be to delete rows from the dt which is provided by prepare_data.
- if (!(max_id_combination %in% S)) {
+ if (!(max_id_coalition %in% S)) {
# TODO: Need to handle the need for model for the AIC-versions here (skip for Python)
dt <- prepare_data(internal, index_features = S)
} else {
if (length(S) > 1) {
- S <- S[S != max_id_combination]
+ S <- S[S != max_id_coalition]
dt <- prepare_data(internal, index_features = S)
} else {
dt <- NULL # Special case for when the batch only includes the largest id
}
- dt_max <- data.table(id_combination = max_id_combination, x_explain, w = 1, id = seq_len(n_explain))
+ dt_max <- data.table(id_coalition = max_id_coalition, x_explain, w = 1, id = seq_len(n_explain))
dt <- rbind(dt, dt_max)
- setkey(dt, id, id_combination)
+ setkey(dt, id, id_coalition)
}
return(dt)
}
@@ -193,13 +260,48 @@ compute_preds <- function(
compute_MCint <- function(dt, pred_cols = "p_hat") {
# Calculate contributions
- dt_res <- dt[, lapply(.SD, function(x) sum(((x) * w) / sum(w))), .(id, id_combination), .SDcols = pred_cols]
- data.table::setkeyv(dt_res, c("id", "id_combination"))
- dt_mat <- data.table::dcast(dt_res, id_combination ~ id, value.var = pred_cols)
+ dt_res <- dt[, lapply(.SD, function(x) sum(((x) * w) / sum(w))), .(id, id_coalition), .SDcols = pred_cols]
+ data.table::setkeyv(dt_res, c("id", "id_coalition"))
+ dt_mat <- data.table::dcast(dt_res, id_coalition ~ id, value.var = pred_cols)
if (length(pred_cols) == 1) {
names(dt_mat)[-1] <- paste0(pred_cols, "_", names(dt_mat)[-1])
}
- # dt_mat[, id_combination := NULL]
+ # dt_mat[, id_coalition := NULL]
return(dt_mat)
}
+
+
+#' Computes `v(S)` for all feature subsets `S`.
+#'
+#' @inheritParams default_doc
+#' @inheritParams explain
+#'
+#' @param method Character
+#' Indicates whether the lapply method (default) or loop method should be used.
+#'
+#' @export
+compute_vS_forecast <- function(internal, model, predict_model, method = "future") {
+  # Old function, temporarily used only for forecast
+ S_batch <- internal$objects$S_batch
+
+
+
+ if (method == "future") {
+ ret <- future_compute_vS_batch(S_batch = S_batch, internal = internal, model = model, predict_model = predict_model)
+ } else {
+    # Doing the same as above without future, i.e. without progress bar or parallelization
+ ret <- list()
+ for (i in seq_along(S_batch)) {
+ S <- S_batch[[i]]
+ ret[[i]] <- batch_compute_vS(
+ S = S,
+ internal = internal,
+ model = model,
+ predict_model = predict_model
+ )
+ }
+ }
+
+ return(ret)
+}
diff --git a/R/documentation.R b/R/documentation.R
index 5fb5d0af6..8dbacf7af 100644
--- a/R/documentation.R
+++ b/R/documentation.R
@@ -2,7 +2,8 @@
#'
#' @param internal List.
#' Holds all parameters, data, functions and computed objects used within [explain()]
-#' The list contains one or more of the elements `parameters`, `data`, `objects`, `output`.
+#' The list contains one or more of the elements `parameters`, `data`, `objects`, `iter_list`, `timing_list`,
+#' `main_timing_list`, `output`, `iter_timing_list` and `iter_results`.
#'
#' @param model Objects.
#' The model object that ought to be explained.
@@ -30,10 +31,11 @@ default_doc <- function(internal, model, predict_model, output_size, extra, ...)
#' Exported documentation helper function.
#'
-#' @param internal Not used.
+#' @param internal List.
+#' Not used directly, but passed through from [explain()].
#'
-#' @param index_features Positive integer vector. Specifies the indices of combinations to
-#' apply to the present method. `NULL` means all combinations. Only used internally.
+#' @param index_features Positive integer vector. Specifies the coalitions (by `id_coalition`) to
+#' apply to the present method. `NULL` means all coalitions. Only used internally.
#'
#' @keywords internal
default_doc_explain <- function(internal, index_features) {
@@ -46,7 +48,7 @@ default_doc_explain <- function(internal, index_features) {
#' @description
#' This helper function displays the specific arguments applicable to the different
#' approaches. Note that when calling [shapr::explain()] from Python, the parameters
-#' are renamed from the form `approach.parameter_name` to `approach_parameter_name`.
+#' are renamed from the form `approach.parameter_name` to `approach_parameter_name`.
#' That is, an underscore has replaced the dot as the dot is reserved in Python.
#'
#' @inheritDotParams setup_approach.independence -internal
diff --git a/R/explain.R b/R/explain.R
index 3e1e10c97..7a8d23540 100644
--- a/R/explain.R
+++ b/R/explain.R
@@ -27,11 +27,13 @@
#' Typically we set this value equal to the mean of the response variable in our training data, but other choices
#' such as the mean of the predictions in the training data are also reasonable.
#'
-#' @param n_combinations Integer.
-#' If `group = NULL`, `n_combinations` represents the number of unique feature combinations to sample.
-#' If `group != NULL`, `n_combinations` represents the number of unique group combinations to sample.
-#' If `n_combinations = NULL`, the exact method is used and all combinations are considered.
-#' The maximum number of combinations equals `2^m`, where `m` is the number of features.
+#' @param max_n_coalitions Integer.
+#' The upper limit on the number of unique feature/group coalitions to use in the adaptive procedure
+#' (if `adaptive = TRUE`).
+#' If `adaptive = FALSE` it represents the number of feature/group coalitions to use directly.
+#' The quantity refers to the number of unique feature coalitions if `group = NULL`,
+#' and group coalitions if `group != NULL`.
+#' `max_n_coalitions = NULL` corresponds to `max_n_coalitions = 2^n_features`.
#'
#' @param group List.
#' If `NULL` regular feature wise Shapley values are computed.
@@ -39,39 +41,34 @@
#' the number of groups. The list element contains character vectors with the features included
#' in each of the different groups.
#'
-#' @param n_samples Positive integer.
-#' Indicating the maximum number of samples to use in the
-#' Monte Carlo integration for every conditional expectation. See also details.
-#'
-#' @param n_batches Positive integer (or NULL).
-#' Specifies how many batches the total number of feature combinations should be split into when calculating the
-#' contribution function for each test observation.
-#' The default value is NULL which uses a reasonable trade-off between RAM allocation and computation speed,
-#' which depends on `approach` and `n_combinations`.
-#' For models with many features, increasing the number of batches reduces the RAM allocation significantly.
-#' This typically comes with a small increase in computation time.
+#' @param n_MC_samples Positive integer.
+#' Indicating the maximum number of samples to use in the Monte Carlo integration for every conditional expectation.
+#' For `approach="ctree"`, `n_MC_samples` corresponds to the number of samples
+#' from the leaf node (see an exception related to the `ctree.sample` argument [shapr::setup_approach.ctree()]).
+#' For `approach="empirical"`, `n_MC_samples` is the \eqn{K} parameter in equations (14-15) of
+#' Aas et al. (2021), i.e. the maximum number of observations (with largest weights) that is used, see also the
+#' `empirical.eta` argument [shapr::setup_approach.empirical()].
#'
#' @param seed Positive integer.
#' Specifies the seed before any randomness based code is being run.
-#' If `NULL` the seed will be inherited from the calling environment.
+#' If `NULL`, no seed is set by the function.
#'
#' @param keep_samp_for_vS Logical.
-#' Indicates whether the samples used in the Monte Carlo estimation of v_S should be returned
-#' (in `internal$output`)
+#' Indicates whether the samples used in the Monte Carlo estimation of v_S should be returned (in `internal$output`).
+#' Not used for `approach="regression_separate"` or `approach="regression_surrogate"`.
#'
#' @param predict_model Function.
#' The prediction function used when `model` is not natively supported.
-#' (Run [get_supported_models()] for a list of natively supported
-#' models.)
+#' (Run [get_supported_models()] for a list of natively supported models.)
#' The function must have two arguments, `model` and `newdata` which specify, respectively, the model
-#' and a data.frame/data.table to compute predictions for. The function must give the prediction as a numeric vector.
+#' and a data.frame/data.table to compute predictions for.
+#' The function must give the prediction as a numeric vector.
#' `NULL` (the default) uses functions specified internally.
#' Can also be used to override the default function for natively supported model classes.
#'
#' @param get_model_specs Function.
#' An optional function for checking model/data consistency when `model` is not natively supported.
-#' (Run [get_supported_models()] for a list of natively supported
-#' models.)
+#' (Run [get_supported_models()] for a list of natively supported models.)
#' The function takes `model` as argument and provides a list with 3 elements:
#' \describe{
#' \item{labels}{Character vector with the names of each feature.}
@@ -82,19 +79,65 @@
#' disabled for unsupported model classes.
#' Can also be used to override the default function for natively supported model classes.
#'
-#' @param MSEv_uniform_comb_weights Logical. If `TRUE` (default), then the function weights the combinations
-#' uniformly when computing the MSEv criterion. If `FALSE`, then the function use the Shapley kernel weights to
-#' weight the combinations when computing the MSEv criterion. Note that the Shapley kernel weights are replaced by the
-#' sampling frequency when not all combinations are considered.
-#'
-#' @param timing Logical.
-#' Whether the timing of the different parts of the `explain()` should saved in the model object.
-#'
-#' @param verbose An integer specifying the level of verbosity. If `0`, `shapr` will stay silent.
-#' If `1`, it will print information about performance. If `2`, some additional information will be printed out.
-#' Use `0` (default) for no verbosity, `1` for low verbose, and `2` for high verbose.
-#' TODO: Make this clearer when we end up fixing this and if they should force a progressr bar.
-#'
+#' @param MSEv_uniform_comb_weights Logical.
+#' If `TRUE` (default), then the function weights the coalitions uniformly when computing the MSEv criterion.
+#' If `FALSE`, then the function uses the Shapley kernel weights to weight the coalitions when computing the MSEv
+#' criterion.
+#' Note that the Shapley kernel weights are replaced by the sampling frequency when not all coalitions are considered.
+#'
+#' @param verbose String vector or NULL.
+#' Specifies the verbosity (printout detail level) through one or more of strings `"basic"`, `"progress"`,
+#' `"convergence"`, `"shapley"` and `"vS_details"`.
+#' `"basic"` (default) displays basic information about the computation which is being performed.
+#' `"progress` displays information about where in the calculation process the function currently is.
+#' #' `"convergence"` displays information on how close to convergence the Shapley value estimates are
+#' (only when `adaptive = TRUE`) .
+#' `"shapley"` displays intermediate Shapley value estimates and standard deviations (only when `adaptive = TRUE`)
+#' + the final estimates.
+#' `"vS_details"` displays information about the v_S estimates.
+#' This is most relevant for `approach %in% c("regression_separate", "regression_surrogate", "vaeac"`).
+#' `NULL` means no printout.
+#' Note that any combination of four strings can be used.
+#' E.g. `verbose = c("basic", "vS_details")` will display basic information + details about the vS estimation process.
+#'
+#' @param paired_shap_sampling Logical.
+#' If `TRUE` (default), paired versions of all sampled coalitions are also included in the computation.
+#' That is, if there are 5 features and e.g. coalition (1,3,5) is sampled, then coalition (2,4) is also used for
+#' computing the Shapley values. This is done to reduce the variance of the Shapley value estimates.
+#'
+#' @param adaptive Logical or NULL.
+#' If `NULL` (default), the argument is set to `TRUE` if there are more than 5 features/groups, and `FALSE` otherwise.
+#' If eventually `TRUE`, the Shapley values are estimated adaptively in an iterative manner.
+#' This provides sufficiently accurate Shapley value estimates faster.
+#' First an initial number of coalitions is sampled, then bootstrapping is used to estimate the variance of the
+#' Shapley values.
+#' A convergence criterion is used to determine if the variances of the Shapley values are sufficiently small.
+#' If the variances are too high, we estimate the number of required samples to reach convergence, and thereby add
+#' more coalitions.
+#' The process is repeated until the variances are below the threshold.
+#' Specifics related to the adaptive process and convergence criterion are set through `adaptive_arguments`.
+#'
+#' @param adaptive_arguments Named list.
+#' Specifies the arguments for the adaptive procedure.
+#' See [shapr::get_adaptive_arguments_default()] for description of the arguments and their default values.
+#' @param shapley_reweighting String.
+#' How to reweight the sampling frequency weights in the kernelSHAP solution after sampling, with the aim of reducing
+#' the randomness and thereby the variance of the Shapley value estimates.
+#' One of `'none'`, `'on_coal_size'`, `'on_N'`, `'on_all'`, `'on_all_cond'` (default).
+#' `'none'` means no reweighting, i.e. the sampling frequency weights are used as is.
+#' `'on_coal_size'` means the sampling frequencies are averaged over all coalitions of the same size.
+#' `'on_N'` means the sampling frequencies are averaged over all coalitions with the same original sampling
+#' probabilities.
+#' `'on_all'` means the original sampling probabilities are used for all coalitions.
+#' `'on_all_cond'` means the original sampling probabilities are used for all coalitions, while adjusting for the
+#' probability that they are sampled at least once.
+#' This method is preferred as it has performed the best in simulation studies.
+#'
+#' @param prev_shapr_object `shapr` object or string.
+#' If an object of class `shapr` is provided, or a string with a path to where intermediate results are stored,
+#' then the function will use the previous object to continue the computation.
+#' This is useful if the computation is interrupted or you want higher accuracy than already obtained, and therefore
+#' want to continue the adaptive estimation. See the vignette for examples.
#' @param ... Further arguments passed to specific approaches
#'
#' @inheritDotParams setup_approach.empirical
@@ -108,57 +151,43 @@
#' @inheritDotParams setup_approach.regression_surrogate
#' @inheritDotParams setup_approach.timeseries
#'
-#' @details The most important thing to notice is that `shapr` has implemented eight different
-#' Monte Carlo-based approaches for estimating the conditional distributions of the data, namely `"empirical"`,
-#' `"gaussian"`, `"copula"`, `"ctree"`, `"vaeac"`, `"categorical"`, `"timeseries"`, and `"independence"`.
-#' `shapr` has also implemented two regression-based approaches `"regression_separate"` and `"regression_surrogate"`,
-#' and see the separate vignette on the regression-based approaches for more information.
-#' In addition, the user also has the option of combining the different Monte Carlo-based approaches.
-#' E.g., if you're in a situation where you have trained a model that consists of 10 features,
-#' and you'd like to use the `"gaussian"` approach when you condition on a single feature,
-#' the `"empirical"` approach if you condition on 2-5 features, and `"copula"` version
-#' if you condition on more than 5 features this can be done by simply passing
-#' `approach = c("gaussian", rep("empirical", 4), rep("copula", 4))`. If
-#' `"approach[i]" = "gaussian"` means that you'd like to use the `"gaussian"` approach
-#' when conditioning on `i` features. Conditioning on all features needs no approach as that is given
-#' by the complete prediction itself, and should thus not be part of the vector.
-#'
-#' For `approach="ctree"`, `n_samples` corresponds to the number of samples
-#' from the leaf node (see an exception related to the `sample` argument).
-#' For `approach="empirical"`, `n_samples` is the \eqn{K} parameter in equations (14-15) of
-#' Aas et al. (2021), i.e. the maximum number of observations (with largest weights) that is used, see also the
-#' `empirical.eta` argument.
-#'
+#' @details The `shapr` package implements kernelSHAP estimation of dependence-aware Shapley values with
+#' eight different Monte Carlo-based approaches for estimating the conditional distributions of the data, namely
+#' `"empirical"`, `"gaussian"`, `"copula"`, `"ctree"`, `"vaeac"`, `"categorical"`, `"timeseries"`, and `"independence"`.
+#' `shapr` has also implemented two regression-based approaches `"regression_separate"` and `"regression_surrogate"`.
+#' It is also possible to combine the different approaches, see the vignettes for more information.
+#'
+#' The package allows for parallelized computation with progress updates through the tightly connected
+#' [future::future] and [progressr::progressr] packages. See the examples below.
+#' For adaptive estimation (`adaptive=TRUE`), intermediate results may also be printed to the console
+#' (according to the `verbose` argument).
+#' Moreover, the intermediate results are written to disk.
+#' This, combined with batch computation of the v(S) values, enables fast and accurate estimation of the Shapley
+#' values in a memory friendly manner.
#'
#' @return Object of class `c("shapr", "list")`. Contains the following items:
#' \describe{
-#' \item{shapley_values}{data.table with the estimated Shapley values}
-#' \item{internal}{List with the different parameters, data and functions used internally}
+#' \item{shapley_values}{data.table with the estimated Shapley values with explained observation in the rows and
+#' features along the columns.
+#' The column `none` is the prediction not attributed to any of the features (given by the argument `prediction_zero`)}
+#' \item{shapley_values_sd}{data.table with the standard deviation of the Shapley values reflecting the uncertainty.
+#' Note that this only reflects the coalition sampling part of the kernelSHAP procedure, and is therefore by
+#' definition 0 when all coalitions are used.
+#' Only present when `adaptive = TRUE` and `adaptive_arguments$compute_sd=TRUE`.}
+#' \item{internal}{List with the different parameters, data, functions and other output used internally.}
#' \item{pred_explain}{Numeric vector with the predictions for the explained observations}
-#' \item{MSEv}{List with the values of the MSEv evaluation criterion for the approach.}
+#' \item{MSEv}{List with the values of the MSEv evaluation criterion for the approach. See the
+#' \href{https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html#msev-evaluation-criterion
+#' }{MSEv evaluation section in the vignette for details}.}
+#' \item{timing}{List containing timing information for the different parts of the computation.
+#' `init_time` and `end_time` gives the time stamps for the start and end of the computation.
+#' `total_time_secs` gives the total time in seconds for the complete execution of `explain()`.
+#' `main_timing_secs` gives the time in seconds for the main computations.
+#' `iter_timing_secs` gives, for each iteration of the adaptive estimation, the time spent on the different parts
+#' of the adaptive estimation routine.}
#' }
#'
-#' `shapley_values` is a data.table where the number of rows equals
-#' the number of observations you'd like to explain, and the number of columns equals `m +1`,
-#' where `m` equals the total number of features in your model.
-#'
-#' If `shapley_values[i, j + 1] > 0` it indicates that the j-th feature increased the prediction for
-#' the i-th observation. Likewise, if `shapley_values[i, j + 1] < 0` it indicates that the j-th feature
-#' decreased the prediction for the i-th observation.
-#' The magnitude of the value is also important to notice. E.g. if `shapley_values[i, k + 1]` and
-#' `shapley_values[i, j + 1]` are greater than `0`, where `j != k`, and
-#' `shapley_values[i, k + 1]` > `shapley_values[i, j + 1]` this indicates that feature
-#' `j` and `k` both increased the value of the prediction, but that the effect of the k-th
-#' feature was larger than the j-th feature.
-#'
-#' The first column in `dt`, called `none`, is the prediction value not assigned to any of the features
-#' (\ifelse{html}{\eqn{\phi}\out{0}}{\eqn{\phi_0}}).
-#' It's equal for all observations and set by the user through the argument `prediction_zero`.
-#' The difference between the prediction and `none` is distributed among the other features.
-#' In theory this value should be the expected prediction without conditioning on any features.
-#' Typically we set this value equal to the mean of the response variable in our training data, but other choices
-#' such as the mean of the predictions in the training data are also reasonable.
-#'
#' @examples
#'
#' # Load example data
@@ -181,6 +210,18 @@
#' # Explain predictions
#' p <- mean(data_train[, y_var])
#'
+#' \dontrun{
+#' # (Optionally) enable parallelization via the future package
+#' if (requireNamespace("future", quietly = TRUE)) {
+#' future::plan("multisession", workers = 2)
+#' }
+#' }
+#'
+#' # (Optionally) enable progress updates within every iteration via the progressr package
+#' if (requireNamespace("progressr", quietly = TRUE)) {
+#' progressr::handlers(global = TRUE)
+#' }
+#'
#' # Empirical approach
#' explain1 <- explain(
#' model = model,
@@ -188,7 +229,7 @@
#' x_train = x_train,
#' approach = "empirical",
#' prediction_zero = p,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' # Gaussian approach
@@ -198,7 +239,7 @@
#' x_train = x_train,
#' approach = "gaussian",
#' prediction_zero = p,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' # Gaussian copula approach
@@ -208,7 +249,7 @@
#' x_train = x_train,
#' approach = "copula",
#' prediction_zero = p,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' # ctree approach
@@ -218,7 +259,7 @@
#' x_train = x_train,
#' approach = "ctree",
#' prediction_zero = p,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' # Combined approach
@@ -229,7 +270,7 @@
#' x_train = x_train,
#' approach = approach,
#' prediction_zero = p,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' # Print the Shapley values
@@ -251,7 +292,7 @@
#' group = group_list,
#' approach = "empirical",
#' prediction_zero = p,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#' print(explain_groups$shapley_values)
#'
@@ -279,6 +320,21 @@
#' regression.model = parsnip::linear_reg()
#' )
#'
+#' ## Adaptive estimation
+#' # For illustration purposes only. By default, adaptive estimation is not used in such low dimensions as here.
+#'
+#' # Gaussian approach
+#' explain_adaptive <- explain(
+#' model = model,
+#' x_explain = x_explain,
+#' x_train = x_train,
+#' approach = "gaussian",
+#' prediction_zero = p,
+#' n_MC_samples = 1e2,
+#' adaptive = TRUE,
+#' adaptive_arguments = list(initial_n_coalitions = 10)
+#' )
+#'
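+#' # Adaptive estimation with printout of intermediate estimates, continuing
+#' # from the previous computation (a usage sketch based on the objects above)
+#' explain_adaptive_cont <- explain(
+#'   model = model,
+#'   x_explain = x_explain,
+#'   x_train = x_train,
+#'   approach = "gaussian",
+#'   prediction_zero = p,
+#'   adaptive = TRUE,
+#'   verbose = c("basic", "convergence", "shapley"),
+#'   prev_shapr_object = explain_adaptive
+#' )
+#'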
#' @export
#'
#' @author Martin Jullum, Lars Henry Berge Olsen
@@ -290,24 +346,30 @@ explain <- function(model,
x_explain,
x_train,
approach,
+ paired_shap_sampling = TRUE,
prediction_zero,
- n_combinations = NULL,
+ max_n_coalitions = NULL,
+ adaptive = NULL,
group = NULL,
- n_samples = 1e3,
- n_batches = NULL,
+ n_MC_samples = 1e3,
seed = 1,
keep_samp_for_vS = FALSE,
predict_model = NULL,
get_model_specs = NULL,
MSEv_uniform_comb_weights = TRUE,
- timing = TRUE,
- verbose = 0,
+ verbose = "basic",
+ adaptive_arguments = list(),
+ shapley_reweighting = "on_all_cond",
+ prev_shapr_object = NULL,
...) { # ... is further arguments passed to specific approaches
- timing_list <- list(init_time = Sys.time())
- set.seed(seed)
+ init_time <- Sys.time()
+
+ if (!is.null(seed)) {
+ set.seed(seed)
+ }
# Gets and check feature specs from the model
feature_specs <- get_feature_specs(get_model_specs, model)
@@ -318,21 +380,24 @@ explain <- function(model,
x_train = x_train,
x_explain = x_explain,
approach = approach,
+ paired_shap_sampling = paired_shap_sampling,
prediction_zero = prediction_zero,
- n_combinations = n_combinations,
+ max_n_coalitions = max_n_coalitions,
group = group,
- n_samples = n_samples,
- n_batches = n_batches,
+ n_MC_samples = n_MC_samples,
seed = seed,
keep_samp_for_vS = keep_samp_for_vS,
feature_specs = feature_specs,
MSEv_uniform_comb_weights = MSEv_uniform_comb_weights,
- timing = timing,
verbose = verbose,
+ adaptive = adaptive,
+ adaptive_arguments = adaptive_arguments,
+ shapley_reweighting = shapley_reweighting,
+ init_time = init_time,
+ prev_shapr_object = prev_shapr_object,
...
)
- timing_list$setup <- Sys.time()
# Gets predict_model (if not passed to explain)
predict_model <- get_predict_model(predict_model = predict_model, model = model)
@@ -345,55 +410,104 @@ explain <- function(model,
internal = internal
)
- timing_list$test_prediction <- Sys.time()
+ internal$timing_list$test_prediction <- Sys.time()
+
+
+ internal <- additional_regression_setup(internal, model = model, predict_model = predict_model)
+ # Not called for approach %in% c("regression_surrogate","vaeac")
+ internal <- setup_approach(internal, model = model, predict_model = predict_model)
- # Add the predicted response of the training and explain data to the internal list for regression-based methods.
- # Use isTRUE as `regression` is not present (NULL) for non-regression methods (i.e., Monte Carlo-based methods).
- if (isTRUE(internal$parameters$regression)) {
- internal <- regression.get_y_hat(internal = internal, model = model, predict_model = predict_model)
+ internal$main_timing_list <- internal$timing_list
+
+ converged <- FALSE
+ iter <- length(internal$iter_list)
+
+ if (!is.null(seed)) {
+ set.seed(seed)
}
- # Sets up the Shapley (sampling) framework and prepares the
- # conditional expectation computation for the chosen approach
- # Note: model and predict_model are ONLY used by the AICc-methods of approach empirical to find optimal parameters
- internal <- setup_computation(internal, model, predict_model)
+ cli_startup(internal, model, verbose)
+
+
+ while (converged == FALSE) {
+ cli_iter(verbose, internal, iter)
+
+ internal$timing_list <- list(init = Sys.time())
+
+ # Setup the Shapley framework
+ internal <- shapley_setup(internal)
+
+ # Only actually called for approach %in% c("regression_surrogate","vaeac")
+ internal <- setup_approach(internal, model = model, predict_model = predict_model)
- timing_list$setup_computation <- Sys.time()
+ # Compute the vS
+ vS_list <- compute_vS(internal, model, predict_model)
- # Compute the v(S):
- # MC:
- # 1. Get the samples for the conditional distributions with the specified approach
- # 2. Predict with these samples
- # 3. Perform MC integration on these to estimate the conditional expectation (v(S))
- # Regression:
- # 1. Directly estimate the conditional expectation (v(S)) using the fitted regression model(s)
- vS_list <- compute_vS(internal, model, predict_model)
+    # Compute Shapley value estimates and bootstrapped standard deviations
+ internal <- compute_estimates(internal, vS_list)
- timing_list$compute_vS <- Sys.time()
+ # Check convergence based on estimates and standard deviations (and thresholds)
+ internal <- check_convergence(internal)
- # Compute Shapley values based on conditional expectations (v(S))
- # Organize function output
- output <- finalize_explanation(vS_list = vS_list, internal = internal)
+ # Save intermediate results
+ save_results(internal)
+
+ # Preparing parameters for next iteration (does not do anything if already converged)
+ internal <- prepare_next_iteration(internal)
+
+ # Printing iteration information
+ print_iter(internal)
+
+    ### Setting globals to simplify the loop
+ converged <- internal$iter_list[[iter]]$converged
+
+ internal$timing_list$postprocess_res <- Sys.time()
+
+ internal$iter_timing_list[[iter]] <- internal$timing_list
+
+ iter <- iter + 1
+ }
- timing_list$shapley_computation <- Sys.time()
+ internal$main_timing_list$main_computation <- Sys.time()
+
+
+ # Rerun after convergence to get the same output format as for the non-adaptive approach
+ output <- finalize_explanation(internal = internal)
+
+ internal$main_timing_list$finalize_explanation <- Sys.time()
+
+ output$timing <- compute_time(internal)
+
+
+ # Some cleanup when doing testing
+ testing <- internal$parameters$testing
+ if (isTRUE(testing)) {
+ output <- testing_cleanup(output)
+ }
- # Compute the elapsed time for the different steps
- if (timing == TRUE) output$timing <- compute_time(timing_list)
- # Temporary to avoid failing tests
- output <- remove_outputs_to_pass_tests(output)
return(output)
}
+#' Cleans out certain output arguments to allow perfect reproducibility of the output
+#'
+#' @inheritParams default_doc_explain
+#'
+#' @export
#' @keywords internal
-#' @author Lars Henry Berge Olsen
-remove_outputs_to_pass_tests <- function(output) {
- output$internal$objects$id_combination_mapper_dt <- NULL
- output$internal$objects$cols_per_horizon <- NULL
- output$internal$objects$W_list <- NULL
+#' @author Lars Henry Berge Olsen, Martin Jullum
+testing_cleanup <- function(output) {
+ # Removing the timing of different function calls
+ output$timing <- NULL
+ # Clearing out the timing lists as well
+ output$internal$main_timing_list <- NULL
+ output$internal$iter_timing_list <- NULL
+ output$internal$timing_list <- NULL
+
+  # Removing paths to non-reproducible vaeac model objects
if (isFALSE(output$internal$parameters$vaeac.extra_parameters$vaeac.save_model)) {
output$internal$parameters[c(
"vaeac", "vaeac.sampler", "vaeac.model", "vaeac.activation_function", "vaeac.checkpoint"
@@ -402,8 +516,15 @@ remove_outputs_to_pass_tests <- function(output) {
NULL
}
- # Remove the `regression` parameter from the output list when we are not doing regression
- if (isFALSE(output$internal$parameters$regression)) output$internal$parameters$regression <- NULL
+ # Removing the fit times for regression surrogate models
+ if ("regression_surrogate" %in% output$internal$parameters$approach) {
+    # Deletes the fit_times for approach = regression_surrogate, as these vary between runs and make tests fail.
+ output$internal$objects$regression.surrogate_model$pre$mold$blueprint$recipe$fit_times <- NULL
+ }
+
+ # Delete the saving_path
+ output$internal$parameters$adaptive_arguments$saving_path <- NULL
return(output)
}
diff --git a/R/explain_forecast.R b/R/explain_forecast.R
index f182e0c63..2621b816d 100644
--- a/R/explain_forecast.R
+++ b/R/explain_forecast.R
@@ -94,17 +94,15 @@ explain_forecast <- function(model,
horizon,
approach,
prediction_zero,
- n_combinations = NULL,
+ max_n_coalitions = NULL,
group_lags = TRUE,
group = NULL,
- n_samples = 1e3,
- n_batches = NULL,
+ n_MC_samples = 1e3,
seed = 1,
keep_samp_for_vS = FALSE,
predict_model = NULL,
get_model_specs = NULL,
- timing = TRUE,
- verbose = 0,
+ verbose = "basic",
...) { # ... is further arguments passed to specific approaches
timing_list <- list(
init_time = Sys.time()
@@ -128,9 +126,10 @@ explain_forecast <- function(model,
approach = approach,
prediction_zero = prediction_zero,
output_size = horizon,
- n_combinations = n_combinations,
- n_samples = n_samples,
- n_batches = n_batches,
+ max_n_coalitions = max_n_coalitions,
+ n_MC_samples = n_MC_samples,
+    # n_batches = n_batches, # TODO: This is not used anymore, and the code does not seem to use the adaptive
+    # version of it either. For now it is always set to 10 in the create_S_batches_forecast function.
seed = seed,
keep_samp_for_vS = keep_samp_for_vS,
feature_specs = feature_specs,
@@ -144,7 +143,6 @@ explain_forecast <- function(model,
explain_xreg_lags = explain_xreg_lags,
group_lags = group_lags,
group = group,
- timing = timing,
verbose = verbose,
...
)
@@ -177,49 +175,39 @@ explain_forecast <- function(model,
timing_list$setup_computation <- Sys.time()
+ ### Temporary solution for forecast
+ internal$iter_list[[1]]$X <- internal$objects$X
+ internal$iter_list[[1]]$S <- internal$objects$S
+
+
# Compute the v(S):
# Get the samples for the conditional distributions with the specified approach
# Predict with these samples
# Perform MC integration on these to estimate the conditional expectation (v(S))
- vS_list <- compute_vS(internal, model, predict_model, method = "regular")
+ vS_list <- compute_vS_forecast(internal, model, predict_model, method = "regular")
timing_list$compute_vS <- Sys.time()
# Compute Shapley values based on conditional expectations (v(S))
# Organize function output
- output <- finalize_explanation(
+ output <- finalize_explanation_forecast(
vS_list = vS_list,
internal = internal
)
- if (timing == TRUE) {
- output$timing <- compute_time(timing_list)
- }
-
- # Temporary to avoid failing tests
- output <- remove_outputs_pass_tests_fore(output)
+ output$timing <- compute_time(timing_list)
- return(output)
-}
-#' @keywords internal
-#' @author Lars Henry Berge Olsen
-remove_outputs_pass_tests_fore <- function(output) {
- # Temporary to avoid failing tests related to vaeac approach
- if (isFALSE(output$internal$parameters$vaeac.extra_parameters$vaeac.save_model)) {
- output$internal$parameters[c(
- "vaeac", "vaeac.sampler", "vaeac.model", "vaeac.activation_function", "vaeac.checkpoint"
- )] <- NULL
- output$internal$parameters$vaeac.extra_parameters[c("vaeac.folder_to_save_model", "vaeac.model_description")] <-
- NULL
+ # Some cleanup when doing testing
+ testing <- internal$parameters$testing
+ if (isTRUE(testing)) {
+ output <- testing_cleanup(output)
}
- # Remove the `regression` parameter from the output list when we are not doing regression
- if (isFALSE(output$internal$parameters$regression)) output$internal$parameters$regression <- NULL
-
return(output)
}
+
#' Set up data for explain_forecast
#'
#' @param y A matrix or numeric vector containing the endogenous variables for the model.
diff --git a/R/finalize_explanation.R b/R/finalize_explanation.R
index 00a074751..b98a7bde9 100644
--- a/R/finalize_explanation.R
+++ b/R/finalize_explanation.R
@@ -1,106 +1,94 @@
-#' Computes the Shapley values given `v(S)`
+#' Gathers the final output to create the explanation object
#'
-#' @inherit explain
-#' @inheritParams default_doc
-#' @param vS_list List
-#' Output from [compute_vS()]
+#' @inheritParams default_doc_explain
#'
#' @export
-finalize_explanation <- function(vS_list, internal) {
+finalize_explanation <- function(internal) {
MSEv_uniform_comb_weights <- internal$parameters$MSEv_uniform_comb_weights
+ output_size <- internal$parameters$output_size
+ dt_vS <- internal$output$dt_vS
- processed_vS_list <- postprocess_vS_list(
- vS_list = vS_list,
- internal = internal
- )
+  # Extracting iter (and deleting the last, temporarily empty element of iter_list)
+ iter <- length(internal$iter_list) - 1
+ internal$iter_list[[iter + 1]] <- NULL
- # Extract the predictions we are explaining
- p <- get_p(processed_vS_list$dt_vS, internal)
+ dt_shapley_est <- internal$iter_list[[iter]]$dt_shapley_est
+ dt_shapley_sd <- internal$iter_list[[iter]]$dt_shapley_sd
+
+ # Setting parameters and objects used in the end from the last iteration
+ internal$objects$X <- internal$iter_list[[iter]]$X
+ internal$objects$S <- internal$iter_list[[iter]]$S
+ internal$objects$W <- internal$iter_list[[iter]]$W
- # internal$timing$postprocessing <- Sys.time()
- # Compute the Shapley values
- dt_shapley <- compute_shapley_new(internal, processed_vS_list$dt_vS)
- # internal$timing$shapley_computation <- Sys.time()
# Clearing out the tmp list with model and predict_model (only added for AICc-types of empirical approach)
internal$tmp <- NULL
- internal$output <- processed_vS_list
- output <- list(
- shapley_values = dt_shapley,
- internal = internal,
- pred_explain = p
- )
- attr(output, "class") <- c("shapr", "list")
+
+ # Extract the predictions we are explaining
+ p <- get_p(dt_vS, internal)
+
# Compute the MSEv evaluation criterion if the output of the predictive model is a scalar.
# TODO: check if it makes sense for output_size > 1.
- if (internal$parameters$output_size == 1) {
- output$MSEv <- compute_MSEv_eval_crit(
+ if (output_size == 1) {
+ MSEv <- compute_MSEv_eval_crit(
internal = internal,
- dt_vS = processed_vS_list$dt_vS,
+ dt_vS = dt_vS,
MSEv_uniform_comb_weights = MSEv_uniform_comb_weights
)
+ } else {
+ MSEv <- NULL
}
- return(output)
-}
-
-
-#' @keywords internal
-postprocess_vS_list <- function(vS_list, internal) {
- id_combination <- NULL # due to NSE
-
- keep_samp_for_vS <- internal$parameters$keep_samp_for_vS
- prediction_zero <- internal$parameters$prediction_zero
- n_explain <- internal$parameters$n_explain
-
- # Appending the zero-prediction to the list
- dt_vS0 <- as.data.table(rbind(c(1, rep(prediction_zero, n_explain))))
-
- # Extracting/merging the data tables from the batch running
- # TODO: Need a memory and speed optimized way to transform the output form dt_vS_list to two different lists,
- # I.e. without copying the data more than once. For now I have modified run_batch such that it
- # if keep_samp_for_vS=FALSE
- # then there is only one copy, but there are two if keep_samp_for_vS=TRUE. This might be OK since the
- # latter is used rarely
- if (keep_samp_for_vS) {
- names(dt_vS0) <- names(vS_list[[1]][[1]])
+ # Extract iterative results in a simplified format
+ internal$iter_results <- get_iter_results(internal$iter_list)
- vS_list[[length(vS_list) + 1]] <- list(dt_vS0, NULL)
-
- dt_vS <- rbindlist(lapply(vS_list, `[[`, 1))
+ output <- list(
+ shapley_values = dt_shapley_est,
+ shapley_values_sd = dt_shapley_sd,
+ internal = internal,
+ pred_explain = p,
+ MSEv = MSEv
+ )
+ attr(output, "class") <- c("shapr", "list")
- dt_samp_for_vS <- rbindlist(lapply(vS_list, `[[`, 2), use.names = TRUE)
+ return(output)
+}
- data.table::setorder(dt_samp_for_vS, id_combination)
- } else {
- names(dt_vS0) <- names(vS_list[[1]])
+get_iter_results <- function(iter_list) {
+ ret <- list()
+ ret$dt_iter_shapley_est <- rbindlist(lapply(iter_list, `[[`, "dt_shapley_est"), idcol = "iter")
+ ret$dt_iter_shapley_sd <- rbindlist(lapply(iter_list, `[[`, "dt_shapley_sd"), idcol = "iter")
+ ret$iter_info_dt <- iter_list_to_dt(iter_list)
+ return(ret)
+}
- vS_list[[length(vS_list) + 1]] <- dt_vS0
+iter_list_to_dt <- function(iter_list, what = c(
+ "exact", "compute_sd", "reduction_factor", "n_coalitions", "n_batches",
+ "converged", "converged_exact", "converged_sd", "converged_max_iter",
+ "est_required_coalitions", "est_remaining_coalitions", "overall_conv_measure"
+ )) {
+ extracted <- lapply(iter_list, function(x) x[what])
+ ret <- do.call(rbindlist, list(l = lapply(extracted, as.data.table), fill = TRUE))
+ return(ret)
+}
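+
+# The resulting iter_info_dt has one row per iteration, with columns such as
+# n_coalitions, converged and overall_conv_measure (see the `what` default above).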
- dt_vS <- rbindlist(vS_list)
- dt_samp_for_vS <- NULL
- }
- data.table::setorder(dt_vS, id_combination)
- output <- list(
- dt_vS = dt_vS,
- dt_samp_for_vS = dt_samp_for_vS
- )
- return(output)
-}
#' @keywords internal
get_p <- function(dt_vS, internal) {
- id_combination <- NULL # due to NSE
+ id_coalition <- NULL # due to NSE
- max_id_combination <- internal$parameters$n_combinations
- p <- unlist(dt_vS[id_combination == max_id_combination, ][, id_combination := NULL])
+ iter <- length(internal$iter_list)
+ max_id_coalition <- internal$iter_list[[iter]]$n_coalitions
+
+
+ p <- unlist(dt_vS[id_coalition == max_id_coalition, ][, id_coalition := NULL])
if (internal$parameters$type == "forecast") {
names(p) <- apply(internal$parameters$output_labels, 1, function(x) paste0("explain_idx_", x[1], "_horizon_", x[2]))
@@ -109,89 +97,42 @@ get_p <- function(dt_vS, internal) {
return(p)
}
-#' Compute shapley values
-#' @param dt_vS The contribution matrix.
-#'
-#' @inheritParams default_doc
-#'
-#' @return A `data.table` with Shapley values for each test observation.
-#' @export
-#' @keywords internal
-compute_shapley_new <- function(internal, dt_vS) {
- is_groupwise <- internal$parameters$is_groupwise
- feature_names <- internal$parameters$feature_names
- W <- internal$objects$W
- type <- internal$parameters$type
-
- if (!is_groupwise) {
- shap_names <- feature_names
- } else {
- shap_names <- names(internal$parameters$group) # TODO: Add group_names (and feature_names) to internal earlier
- }
- # If multiple horizons with explain_forecast are used, we only distribute value to those used at each horizon
- if (type == "forecast") {
- id_combination_mapper_dt <- internal$objects$id_combination_mapper_dt
- horizon <- internal$parameters$horizon
- cols_per_horizon <- internal$objects$cols_per_horizon
- W_list <- internal$objects$W_list
- kshap_list <- list()
- for (i in seq_len(horizon)) {
- W0 <- W_list[[i]]
- dt_vS0 <- merge(dt_vS, id_combination_mapper_dt[horizon == i], by = "id_combination", all.y = TRUE)
- data.table::setorder(dt_vS0, horizon_id_combination)
- these_vS0_cols <- grep(paste0("p_hat", i, "_"), names(dt_vS0))
- kshap0 <- t(W0 %*% as.matrix(dt_vS0[, these_vS0_cols, with = FALSE]))
- kshap_list[[i]] <- data.table::as.data.table(kshap0)
- if (!is_groupwise) {
- names(kshap_list[[i]]) <- c("none", cols_per_horizon[[i]])
- } else {
- names(kshap_list[[i]]) <- c("none", shap_names)
- }
- }
- dt_kshap <- cbind(internal$parameters$output_labels, rbindlist(kshap_list, fill = TRUE))
- } else {
- kshap <- t(W %*% as.matrix(dt_vS[, -"id_combination"]))
- dt_kshap <- data.table::as.data.table(kshap)
- colnames(dt_kshap) <- c("none", shap_names)
- }
- return(dt_kshap)
-}
#' Mean Squared Error of the Contribution Function `v(S)`
#'
#' @inheritParams explain
#' @inheritParams default_doc
-#' @param dt_vS Data.table of dimension `n_combinations` times `n_explain + 1` containing the contribution function
-#' estimates. The first column is assumed to be named `id_combination` and containing the ids of the combinations.
-#' The last row is assumed to be the full combination, i.e., it contains the predicted responses for the observations
+#' @param dt_vS Data.table of dimension `n_coalitions` times `n_explain + 1` containing the contribution function
+#' estimates. The first column is assumed to be named `id_coalition` and containing the ids of the coalitions.
+#' The last row is assumed to be the full coalition, i.e., it contains the predicted responses for the observations
#' which are to be explained.
#' @param MSEv_skip_empty_full_comb Logical. If `TRUE` (default), we exclude the empty and grand
-#' combinations/coalitions when computing the MSEv evaluation criterion. This is reasonable as they are identical
+#' coalitions when computing the MSEv evaluation criterion. This is reasonable as they are identical
#' for all methods, i.e., their contribution function is independent of the used method as they are special cases not
+#' affected by the used method. If `FALSE`, we include the empty and grand coalitions. In this situation,
+#' effected by the used method. If `FALSE`, we include the empty and grand coalitions. In this situation,
#' we also recommend setting `MSEv_uniform_comb_weights = TRUE`, as otherwise the large weights for the empty and
-#' grand combinations/coalitions will outweigh all other combinations and make the MSEv criterion uninformative.
+#' grand coalitions will outweigh all other coalitions and make the MSEv criterion uninformative.
#'
#' @return
#' List containing:
#' \describe{
#' \item{`MSEv`}{A \code{\link[data.table]{data.table}} with the overall MSEv evaluation criterion averaged
-#' over both the combinations/coalitions and observations/explicands. The \code{\link[data.table]{data.table}}
-#' also contains the standard deviation of the MSEv values for each explicand (only averaged over the combinations)
+#' over both the coalitions and observations/explicands. The \code{\link[data.table]{data.table}}
+#' also contains the standard deviation of the MSEv values for each explicand (only averaged over the coalitions)
#' divided by the square root of the number of explicands.}
#' \item{`MSEv_explicand`}{A \code{\link[data.table]{data.table}} with the mean squared error for each
-#' explicand, i.e., only averaged over the combinations/coalitions.}
-#' \item{`MSEv_combination`}{A \code{\link[data.table]{data.table}} with the mean squared error for each
-#' combination/coalition, i.e., only averaged over the explicands/observations.
+#' explicand, i.e., only averaged over the coalitions.}
+#' \item{`MSEv_coalition`}{A \code{\link[data.table]{data.table}} with the mean squared error for each
+#' coalition, i.e., only averaged over the explicands/observations.
#' The \code{\link[data.table]{data.table}} also contains the standard deviation of the MSEv values for
-#' each combination divided by the square root of the number of explicands.}
+#' each coalition divided by the square root of the number of explicands.}
#' }
#'
#' @description Function that computes the Mean Squared Error (MSEv) of the contribution function
@@ -213,24 +154,28 @@ compute_MSEv_eval_crit <- function(internal,
dt_vS,
MSEv_uniform_comb_weights,
MSEv_skip_empty_full_comb = TRUE) {
+ iter <- length(internal$iter_list)
+ n_coalitions <- internal$iter_list[[iter]]$n_coalitions
+
n_explain <- internal$parameters$n_explain
- n_combinations <- internal$parameters$n_combinations
- id_combination_indices <- if (MSEv_skip_empty_full_comb) seq(2, n_combinations - 1) else seq(1, n_combinations)
- n_combinations_used <- length(id_combination_indices)
- features <- internal$objects$X$features[id_combination_indices]
+ id_coalition_indices <- if (MSEv_skip_empty_full_comb) seq(2, n_coalitions - 1) else seq(1, n_coalitions)
+ n_coalitions_used <- length(id_coalition_indices)
+
+ X <- internal$objects$X
+ coalitions <- X$coalitions[id_coalition_indices]
# Extract the predicted responses f(x)
- p <- unlist(dt_vS[id_combination == n_combinations, -"id_combination"])
+ p <- unlist(dt_vS[id_coalition == n_coalitions, -"id_coalition"])
# Create contribution matrix
- vS <- as.matrix(dt_vS[id_combination_indices, -"id_combination"])
+ vS <- as.matrix(dt_vS[id_coalition_indices, -"id_coalition"])
# Square the difference between the v(S) and f(x)
dt_squared_diff_original <- sweep(vS, 2, p)^2
# Get the weights
- averaging_weights <- if (MSEv_uniform_comb_weights) rep(1, n_combinations) else internal$objects$X$shapley_weight
- averaging_weights <- averaging_weights[id_combination_indices]
+ averaging_weights <- if (MSEv_uniform_comb_weights) rep(1, n_coalitions) else X$shapley_weight
+ averaging_weights <- averaging_weights[id_coalition_indices]
averaging_weights_scaled <- averaging_weights / sum(averaging_weights)
# Apply the `averaging_weights_scaled` to each column (i.e., each explicand)
@@ -241,8 +186,8 @@ compute_MSEv_eval_crit <- function(internal,
MSEv_explicand <- colSums(dt_squared_diff)
# The MSEv criterion for each coalition, i.e., only averaged over the explicands.
- MSEv_combination <- rowMeans(dt_squared_diff * n_combinations_used)
- MSEv_combination_sd <- apply(dt_squared_diff * n_combinations_used, 1, sd) / sqrt(n_explain)
+ MSEv_coalition <- rowMeans(dt_squared_diff * n_coalitions_used)
+ MSEv_coalition_sd <- apply(dt_squared_diff * n_coalitions_used, 1, sd) / sqrt(n_explain)
# The MSEv criterion averaged over both the coalitions and explicands.
MSEv <- mean(MSEv_explicand)
@@ -250,8 +195,8 @@ compute_MSEv_eval_crit <- function(internal,
# Set the name entries in the arrays
names(MSEv_explicand) <- paste0("id_", seq(n_explain))
- names(MSEv_combination) <- paste0("id_combination_", id_combination_indices)
- names(MSEv_combination_sd) <- paste0("id_combination_", id_combination_indices)
+ names(MSEv_coalition) <- paste0("id_coalition_", id_coalition_indices)
+ names(MSEv_coalition_sd) <- paste0("id_coalition_", id_coalition_indices)
# Convert the results to data.table
MSEv <- data.table(
@@ -262,16 +207,72 @@ compute_MSEv_eval_crit <- function(internal,
"id" = seq(n_explain),
"MSEv" = MSEv_explicand
)
- MSEv_combination <- data.table(
- "id_combination" = id_combination_indices,
- "features" = features,
- "MSEv" = MSEv_combination,
- "MSEv_sd" = MSEv_combination_sd
+ MSEv_coalition <- data.table(
+ "id_coalition" = id_coalition_indices,
+ "coalitions" = coalitions,
+ "MSEv" = MSEv_coalition,
+ "MSEv_sd" = MSEv_coalition_sd
)
return(list(
MSEv = MSEv,
MSEv_explicand = MSEv_explicand,
- MSEv_combination = MSEv_combination
+ MSEv_coalition = MSEv_coalition
))
}
+
+
+#' Computes the Shapley values given `v(S)`
+#'
+#' @inherit explain
+#' @inheritParams default_doc
+#' @param vS_list List
+#' Output from [compute_vS()]
+#'
+#' @export
+finalize_explanation_forecast <- function(vS_list, internal) { # Temporarily used for forecast only (the old function)
+ MSEv_uniform_comb_weights <- internal$parameters$MSEv_uniform_comb_weights
+
+ processed_vS_list <- postprocess_vS_list(
+ vS_list = vS_list,
+ internal = internal
+ )
+
+ # Extract the predictions we are explaining
+ p <- get_p(processed_vS_list$dt_vS, internal)
+
+ # internal$timing$postprocessing <- Sys.time()
+
+ # Compute the Shapley values
+ dt_shapley <- compute_shapley_new(internal, processed_vS_list$dt_vS)
+
+ # internal$timing$shapley_computation <- Sys.time()
+
+ # Clearing out the timing lists as they are added to the output separately
+ internal$main_timing_list <- internal$iter_timing_list <- internal$timing_list <- NULL
+
+
+ # Clearing out the tmp list with model and predict_model (only added for AICc-types of empirical approach)
+ internal$tmp <- NULL
+
+ internal$output <- processed_vS_list
+
+ output <- list(
+ shapley_values = dt_shapley,
+ internal = internal,
+ pred_explain = p
+ )
+ attr(output, "class") <- c("shapr", "list")
+
+ # Compute the MSEv evaluation criterion if the output of the predictive model is a scalar.
+ # TODO: check if it makes sense for output_size > 1.
+ if (internal$parameters$output_size == 1) {
+ output$MSEv <- compute_MSEv_eval_crit(
+ internal = internal,
+ dt_vS = processed_vS_list$dt_vS,
+ MSEv_uniform_comb_weights = MSEv_uniform_comb_weights
+ )
+ }
+
+ return(output)
+}
diff --git a/R/plot.R b/R/plot.R
index af555c5e6..0a4830bdf 100644
--- a/R/plot.R
+++ b/R/plot.R
@@ -98,7 +98,7 @@
#' x_train = x_train,
#' approach = "empirical",
#' prediction_zero = p,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' if (requireNamespace("ggplot2", quietly = TRUE)) {
@@ -148,7 +148,7 @@
#' x_train = x_train,
#' approach = "ctree",
#' prediction_zero = p,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' if (requireNamespace("ggplot2", quietly = TRUE)) {
@@ -186,7 +186,7 @@ plot.shapr <- function(x,
is_groupwise <- x$internal$parameters$is_groupwise
# melting Kshap
- shap_names <- colnames(x$shapley_values)[-1]
+ shap_names <- x$internal$parameters$shap_names
dt_shap <- round(data.table::copy(x$shapley_values), digits = digits)
dt_shap[, id := .I]
dt_shap_long <- data.table::melt(dt_shap, id.vars = "id", value.name = "phi")
@@ -787,8 +787,8 @@ make_waterfall_plot <- function(dt_plot,
#' Make plots to visualize and compare the MSEv evaluation criterion for a list of
#' [shapr::explain()] objects applied to the same data and model. The function creates
#' bar plots and line plots with points to illustrate the overall MSEv evaluation
-#' criterion, but also for each observation/explicand and combination by only averaging over
-#' the combinations and observations/explicands, respectively.
+#' criterion, but also for each observation/explicand and coalition by only averaging over
+#' the coalitions and observations/explicands, respectively.
#'
#' @inheritParams plot.shapr
#' @inheritParams default_doc
@@ -796,26 +796,26 @@ make_waterfall_plot <- function(dt_plot,
#' @param explanation_list A list of [shapr::explain()] objects applied to the same data and model.
#' If the entries in the list are named, then the function use these names. Otherwise, they default to
#' the approach names (with integer suffix for duplicates) for the explanation objects in `explanation_list`.
-#' @param id_combination Integer vector. Which of the combinations (coalitions) to plot.
-#' E.g. if you used `n_combinations = 16` in [explain()], you can generate a plot for the
-#' first 5 combinations and the 10th by setting `id_combination = c(1:5, 10)`.
+#' @param id_coalition Integer vector. Which of the coalitions to plot.
+#' E.g. if you used `n_coalitions = 16` in [explain()], you can generate a plot for the
+#' first 5 coalitions and the 10th by setting `id_coalition = c(1:5, 10)`.
#' @param CI_level Positive numeric between zero and one. Default is `0.95` if the number of observations to explain is
#' larger than 20, otherwise `CI_level = NULL`, which removes the confidence intervals. The level of the approximate
-#' confidence intervals for the overall MSEv and the MSEv_combination. The confidence intervals are based on that
+#' confidence intervals for the overall MSEv and the MSEv_coalition. The confidence intervals are based on that
#' the MSEv scores are means over the observations/explicands, and that means are approximately normal. Since the
#' standard deviations are estimated, we use the quantile t from the T distribution with N_explicands - 1 degrees of
#' freedom corresponding to the provided level. Here, N_explicands is the number of observations/explicands.
-#' MSEv ± t*SD(MSEv)/sqrt(N_explicands). Note that the `explain()` function already scales the standard deviation by
-#' sqrt(N_explicands), thus, the CI are MSEv ± t*MSEv_sd, where the values MSEv and MSEv_sd are extracted from the
+#' MSEv +/- t*SD(MSEv)/sqrt(N_explicands). Note that the `explain()` function already scales the standard deviation by
+#' sqrt(N_explicands); thus, the CIs are MSEv +/- t*MSEv_sd, where the values MSEv and MSEv_sd are extracted from the
#' MSEv data.tables in the objects in the `explanation_list`.
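+#' As a worked example, with `CI_level = 0.95` and 25 explicands, the intervals are
+#' `MSEv +/- qt(0.975, 25 - 1) * MSEv_sd`.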
#' @param geom_col_width Numeric. Bar width. By default, set to 90% of the [ggplot2::resolution()] of the data.
#' @param plot_type Character vector. The possible options are "overall" (default), "comb", and "explicand".
#' If `plot_type = "overall"`, then the plot (one bar plot) associated with the overall MSEv evaluation criterion
-#' for each method is created, i.e., when averaging over both the combinations/coalitions and observations/explicands.
+#' for each method is created, i.e., when averaging over both the coalitions and observations/explicands.
#' If `plot_type = "comb"`, then the plots (one line plot and one bar plot) associated with the MSEv evaluation
-#' criterion for each combination/coalition are created, i.e., when we only average over the observations/explicands.
+#' criterion for each coalition are created, i.e., when we only average over the observations/explicands.
#' If `plot_type = "explicand"`, then the plots (one line plot and one bar plot) associated with the MSEv evaluation
-#' criterion for each observations/explicands are created, i.e., when we only average over the combinations/coalitions.
+#' criterion for each observation/explicand are created, i.e., when we only average over the coalitions.
#' If `plot_type` is a vector of one or several of "overall", "comb", and "explicand", then the associated plots are
#' created.
#'
@@ -862,7 +862,7 @@ make_waterfall_plot <- function(dt_plot,
#' x_train = x_train,
#' approach = "independence",
#' prediction_zero = prediction_zero,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' # Gaussian 1e1 approach
@@ -872,7 +872,7 @@ make_waterfall_plot <- function(dt_plot,
#' x_train = x_train,
#' approach = "gaussian",
#' prediction_zero = prediction_zero,
-#' n_samples = 1e1
+#' n_MC_samples = 1e1
#' )
#'
#' # Gaussian 1e2 approach
@@ -882,7 +882,7 @@ make_waterfall_plot <- function(dt_plot,
#' x_train = x_train,
#' approach = "gaussian",
#' prediction_zero = prediction_zero,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' # ctree approach
@@ -892,7 +892,7 @@ make_waterfall_plot <- function(dt_plot,
#' x_train = x_train,
#' approach = "ctree",
#' prediction_zero = prediction_zero,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' # Combined approach
@@ -902,7 +902,7 @@ make_waterfall_plot <- function(dt_plot,
#' x_train = x_train,
#' approach = c("gaussian", "independence", "ctree"),
#' prediction_zero = prediction_zero,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' # Create a list of explanations with names
@@ -915,24 +915,24 @@ make_waterfall_plot <- function(dt_plot,
#' )
#'
#' if (requireNamespace("ggplot2", quietly = TRUE)) {
-#' # Create the default MSEv plot where we average over both the combinations and observations
+#' # Create the default MSEv plot where we average over both the coalitions and observations
#' # with approximate 95% confidence intervals
#' plot_MSEv_eval_crit(explanation_list_named, CI_level = 0.95, plot_type = "overall")
#'
-#' # Can also create plots of the MSEv criterion averaged only over the combinations or observations.
+#' # Can also create plots of the MSEv criterion averaged only over the coalitions or observations.
#' MSEv_figures <- plot_MSEv_eval_crit(explanation_list_named,
#' CI_level = 0.95,
#' plot_type = c("overall", "comb", "explicand")
#' )
#' MSEv_figures$MSEv_bar
-#' MSEv_figures$MSEv_combination_bar
+#' MSEv_figures$MSEv_coalition_bar
#' MSEv_figures$MSEv_explicand_bar
#'
-#' # When there are many combinations or observations, then it can be easier to look at line plots
-#' MSEv_figures$MSEv_combination_line_point
+#' # When there are many coalitions or observations, then it can be easier to look at line plots
+#' MSEv_figures$MSEv_coalition_line_point
#' MSEv_figures$MSEv_explicand_line_point
#'
-#' # We can specify which observations or combinations to plot
+#' # We can specify which observations or coalitions to plot
#' plot_MSEv_eval_crit(explanation_list_named,
#' plot_type = "explicand",
#' index_x_explain = c(1, 3:4, 6),
@@ -940,9 +940,9 @@ make_waterfall_plot <- function(dt_plot,
#' )$MSEv_explicand_bar
#' plot_MSEv_eval_crit(explanation_list_named,
#' plot_type = "comb",
-#' id_combination = c(3, 4, 9, 13:15),
+#' id_coalition = c(3, 4, 9, 13:15),
#' CI_level = 0.95
-#' )$MSEv_combination_bar
+#' )$MSEv_coalition_bar
#'
#' # We can alter the figures if other palette schemes or design is wanted
#' bar_text_n_decimals <- 1
@@ -972,7 +972,7 @@ make_waterfall_plot <- function(dt_plot,
#' @author Lars Henry Berge Olsen
plot_MSEv_eval_crit <- function(explanation_list,
index_x_explain = NULL,
- id_combination = NULL,
+ id_coalition = NULL,
CI_level = if (length(explanation_list[[1]]$pred_explain) < 20) NULL else 0.95,
geom_col_width = 0.9,
plot_type = "overall") {
@@ -1004,20 +1004,22 @@ plot_MSEv_eval_crit <- function(explanation_list,
# Check that the explanation objects explain the same observations
MSEv_check_explanation_list(explanation_list)
- # Get the number of observations and combinations and the quantile of the T distribution
+ # Get the number of observations and coalitions and the quantile of the T distribution
+ iter <- length(explanation_list[[1]]$internal$iter_list)
+ n_coalitions <- explanation_list[[1]]$internal$iter_list[[iter]]$n_coalitions
+
n_explain <- explanation_list[[1]]$internal$parameters$n_explain
- n_combinations <- explanation_list[[1]]$internal$parameters$n_combinations
tfrac <- if (is.null(CI_level)) NULL else qt((1 + CI_level) / 2, n_explain - 1)
# Create data.tables of the MSEv values
MSEv_dt_list <- MSEv_extract_MSEv_values(
explanation_list = explanation_list,
index_x_explain = index_x_explain,
- id_combination = id_combination
+ id_coalition = id_coalition
)
MSEv_dt <- MSEv_dt_list$MSEv
MSEv_explicand_dt <- MSEv_dt_list$MSEv_explicand
- MSEv_combination_dt <- MSEv_dt_list$MSEv_combination
+ MSEv_coalition_dt <- MSEv_dt_list$MSEv_coalition
# Warnings related to the approximate confidence intervals
if (!is.null(CI_level)) {
@@ -1045,23 +1047,23 @@ plot_MSEv_eval_crit <- function(explanation_list,
return_object <- list()
if ("explicand" %in% plot_type) {
- # MSEv averaged over only the combinations for each observation
+ # MSEv averaged over only the coalitions for each observation
return_object <- c(
return_object,
make_MSEv_explicand_plots(
MSEv_explicand_dt = MSEv_explicand_dt,
- n_combinations = n_combinations,
+ n_coalitions = n_coalitions,
geom_col_width = geom_col_width
)
)
}
if ("comb" %in% plot_type) {
- # MSEv averaged over only the observations for each combinations
+ # MSEv averaged over only the observations for each coalition
return_object <- c(
return_object,
- make_MSEv_combination_plots(
- MSEv_combination_dt = MSEv_combination_dt,
+ make_MSEv_coalition_plots(
+ MSEv_coalition_dt = MSEv_coalition_dt,
n_explain = n_explain,
geom_col_width = geom_col_width,
tfrac = tfrac
@@ -1070,10 +1072,10 @@ plot_MSEv_eval_crit <- function(explanation_list,
}
if ("overall" %in% plot_type) {
- # MSEv averaged over both the combinations and observations
+ # MSEv averaged over both the coalitions and observations
return_object$MSEv_bar <- make_MSEv_bar_plot(
MSEv_dt = MSEv_dt,
- n_combinations = n_combinations,
+ n_coalitions = n_coalitions,
n_explain = n_explain,
geom_col_width = geom_col_width,
tfrac = tfrac
@@ -1148,7 +1150,7 @@ MSEv_check_explanation_list <- function(explanation_list) {
))
}
- # Check that all explanation objects use the same combinations
+ # Check that all explanation objects use the same coalitions
entries_using_diff_combs <- sapply(explanation_list, function(explanation) {
!identical(explanation_list[[1]]$internal$objects$X$features, explanation$internal$objects$X$features)
})
@@ -1156,7 +1158,7 @@ MSEv_check_explanation_list <- function(explanation_list) {
methods_with_diff_comb_str <- paste(names(entries_using_diff_combs)[entries_using_diff_combs], collapse = "', '")
stop(paste0(
"The object/objects '", methods_with_diff_comb_str, "' in `explanation_list` uses/use different ",
- "coaltions than '", names(explanation_list)[1], "'. Cannot compare them."
+ "coalitions than '", names(explanation_list)[1], "'. Cannot compare them."
))
}
}
@@ -1165,9 +1167,9 @@ MSEv_check_explanation_list <- function(explanation_list) {
#' @author Lars Henry Berge Olsen
MSEv_extract_MSEv_values <- function(explanation_list,
index_x_explain = NULL,
- id_combination = NULL) {
- # Function that extract the MSEv values from the different explanations objects in ´explanation_list´,
- # put the values in data.tables, and keep only the desired observations and combinations.
+ id_coalition = NULL) {
+ # Function that extracts the MSEv values from the different explanation objects in explanation_list,
+ # puts the values in data.tables, and keeps only the desired observations and coalitions.
# The overall MSEv criterion
MSEv <- rbindlist(lapply(explanation_list, function(explanation) explanation$MSEv$MSEv),
@@ -1182,27 +1184,27 @@ MSEv_extract_MSEv_values <- function(explanation_list,
MSEv_explicand$id <- factor(MSEv_explicand$id)
MSEv_explicand$Method <- factor(MSEv_explicand$Method, levels = names(explanation_list))
- # The MSEv evaluation criterion for each combination.
- MSEv_combination <- rbindlist(lapply(explanation_list, function(explanation) explanation$MSEv$MSEv_combination),
+ # The MSEv evaluation criterion for each coalition.
+ MSEv_coalition <- rbindlist(lapply(explanation_list, function(explanation) explanation$MSEv$MSEv_coalition),
use.names = TRUE, idcol = "Method"
)
- MSEv_combination$id_combination <- factor(MSEv_combination$id_combination)
- MSEv_combination$Method <- factor(MSEv_combination$Method, levels = names(explanation_list))
+ MSEv_coalition$id_coalition <- factor(MSEv_coalition$id_coalition)
+ MSEv_coalition$Method <- factor(MSEv_coalition$Method, levels = names(explanation_list))
- # Only keep the desired observations and combinations
+ # Only keep the desired observations and coalitions
if (!is.null(index_x_explain)) MSEv_explicand <- MSEv_explicand[id %in% index_x_explain]
- if (!is.null(id_combination)) {
- id_combination_aux <- id_combination
- MSEv_combination <- MSEv_combination[id_combination %in% id_combination_aux]
+ if (!is.null(id_coalition)) {
+ id_coalition_aux <- id_coalition
+ MSEv_coalition <- MSEv_coalition[id_coalition %in% id_coalition_aux]
}
- return(list(MSEv = MSEv, MSEv_explicand = MSEv_explicand, MSEv_combination = MSEv_combination))
+ return(list(MSEv = MSEv, MSEv_explicand = MSEv_explicand, MSEv_coalition = MSEv_coalition))
}
#' @keywords internal
#' @author Lars Henry Berge Olsen
make_MSEv_bar_plot <- function(MSEv_dt,
- n_combinations,
+ n_coalitions,
n_explain,
tfrac = NULL,
geom_col_width = 0.9) {
@@ -1215,16 +1217,16 @@ make_MSEv_bar_plot <- function(MSEv_dt,
ggplot2::labs(
x = "Method",
y = bquote(MSE[v]),
- title = bquote(MSE[v] ~ "criterion averaged over the" ~ .(n_combinations) ~
- "combinations and" ~ .(n_explain) ~ "explicands")
+ title = bquote(MSE[v] ~ "criterion averaged over the" ~ .(n_coalitions) ~
+ "coalitions and" ~ .(n_explain) ~ "explicands")
)
if (!is.null(tfrac)) {
CI_level <- 1 - 2 * (1 - pt(tfrac, n_explain - 1))
MSEv_bar <- MSEv_bar +
- ggplot2::labs(title = bquote(MSE[v] ~ "criterion averaged over the" ~ .(n_combinations) ~
- "combinations and" ~ .(n_explain) ~ "explicands with" ~
+ ggplot2::labs(title = bquote(MSE[v] ~ "criterion averaged over the" ~ .(n_coalitions) ~
+ "coalitions and" ~ .(n_explain) ~ "explicands with" ~
.(CI_level * 100) * "% CI")) +
ggplot2::geom_errorbar(
position = ggplot2::position_dodge(geom_col_width),
@@ -1243,15 +1245,15 @@ make_MSEv_bar_plot <- function(MSEv_dt,
#' @keywords internal
#' @author Lars Henry Berge Olsen
make_MSEv_explicand_plots <- function(MSEv_explicand_dt,
- n_combinations,
+ n_coalitions,
geom_col_width = 0.9) {
MSEv_explicand_source <-
ggplot2::ggplot(MSEv_explicand_dt, ggplot2::aes(x = id, y = MSEv)) +
ggplot2::labs(
x = "index_x_explain",
y = bquote(MSE[v] ~ "(explicand)"),
- title = bquote(MSE[v] ~ "criterion averaged over the" ~ .(n_combinations) ~
- "combinations for each explicand")
+ title = bquote(MSE[v] ~ "criterion averaged over the" ~ .(n_coalitions) ~
+ "coalitions for each explicand")
)
MSEv_explicand_bar <-
@@ -1277,21 +1279,21 @@ make_MSEv_explicand_plots <- function(MSEv_explicand_dt,
#' @keywords internal
#' @author Lars Henry Berge Olsen
-make_MSEv_combination_plots <- function(MSEv_combination_dt,
- n_explain,
- tfrac = NULL,
- geom_col_width = 0.9) {
- MSEv_combination_source <-
- ggplot2::ggplot(MSEv_combination_dt, ggplot2::aes(x = id_combination, y = MSEv)) +
+make_MSEv_coalition_plots <- function(MSEv_coalition_dt,
+ n_explain,
+ tfrac = NULL,
+ geom_col_width = 0.9) {
+ MSEv_coalition_source <-
+ ggplot2::ggplot(MSEv_coalition_dt, ggplot2::aes(x = id_coalition, y = MSEv)) +
ggplot2::labs(
- x = "id_combination",
- y = bquote(MSE[v] ~ "(combination)"),
+ x = "id_coalition",
+ y = bquote(MSE[v] ~ "(coalition)"),
title = bquote(MSE[v] ~ "criterion averaged over the" ~ .(n_explain) ~
- "explicands for each combination")
+ "explicands for each coalition")
)
- MSEv_combination_bar <-
- MSEv_combination_source +
+ MSEv_coalition_bar <-
+ MSEv_coalition_source +
ggplot2::geom_col(
width = geom_col_width,
position = ggplot2::position_dodge(geom_col_width),
@@ -1301,10 +1303,10 @@ make_MSEv_combination_plots <- function(MSEv_combination_dt,
if (!is.null(tfrac)) {
CI_level <- 1 - 2 * (1 - pt(tfrac, n_explain - 1))
- MSEv_combination_bar <-
- MSEv_combination_bar +
+ MSEv_coalition_bar <-
+ MSEv_coalition_bar +
ggplot2::labs(title = bquote(MSE[v] ~ "criterion averaged over the" ~ .(n_explain) ~
- "explicands for each combination with" ~ .(CI_level * 100) * "% CI")) +
+ "explicands for each coalition with" ~ .(CI_level * 100) * "% CI")) +
ggplot2::geom_errorbar(
position = ggplot2::position_dodge(geom_col_width),
width = 0.25,
@@ -1316,16 +1318,16 @@ make_MSEv_combination_plots <- function(MSEv_combination_dt,
)
}
- MSEv_combination_line_point <-
- MSEv_combination_source +
- ggplot2::aes(x = as.numeric(id_combination)) +
- ggplot2::labs(x = "id_combination") +
+ MSEv_coalition_line_point <-
+ MSEv_coalition_source +
+ ggplot2::aes(x = as.numeric(id_coalition)) +
+ ggplot2::labs(x = "id_coalition") +
ggplot2::geom_point(ggplot2::aes(col = Method)) +
ggplot2::geom_line(ggplot2::aes(group = Method, col = Method))
return(list(
- MSEv_combination_bar = MSEv_combination_bar,
- MSEv_combination_line_point = MSEv_combination_line_point
+ MSEv_coalition_bar = MSEv_coalition_bar,
+ MSEv_coalition_line_point = MSEv_coalition_line_point
))
}
@@ -1409,7 +1411,7 @@ make_MSEv_combination_plots <- function(MSEv_combination_dt,
#' x_train = x_train,
#' approach = "independence",
#' prediction_zero = prediction_zero,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' # Empirical approach
@@ -1419,7 +1421,7 @@ make_MSEv_combination_plots <- function(MSEv_combination_dt,
#' x_train = x_train,
#' approach = "empirical",
#' prediction_zero = prediction_zero,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' # Gaussian 1e1 approach
@@ -1429,7 +1431,7 @@ make_MSEv_combination_plots <- function(MSEv_combination_dt,
#' x_train = x_train,
#' approach = "gaussian",
#' prediction_zero = prediction_zero,
-#' n_samples = 1e1
+#' n_MC_samples = 1e1
#' )
#'
#' # Gaussian 1e2 approach
@@ -1439,7 +1441,7 @@ make_MSEv_combination_plots <- function(MSEv_combination_dt,
#' x_train = x_train,
#' approach = "gaussian",
#' prediction_zero = prediction_zero,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' # Combined approach
@@ -1449,7 +1451,7 @@ make_MSEv_combination_plots <- function(MSEv_combination_dt,
#' x_train = x_train,
#' approach = c("gaussian", "ctree", "empirical"),
#' prediction_zero = prediction_zero,
-#' n_samples = 1e2
+#' n_MC_samples = 1e2
#' )
#'
#' # Create a list of explanations with names
@@ -1647,7 +1649,7 @@ update_only_these_features <- function(explanation_list,
# Update the `only_these_features` parameter vector based on `plot_phi0` or in case it is NULL
# Get the common feature names for all explanation objects (including `none`) and one without `none`
- feature_names_with_none <- colnames(explanation_list[[1]]$shapley_values)
+ feature_names_with_none <- colnames(explanation_list[[1]]$shapley_values)[-1]
feature_names_without_none <- feature_names_with_none[feature_names_with_none != "none"]
# Only keep the desired features/columns
diff --git a/R/prepare_next_iteration.R b/R/prepare_next_iteration.R
new file mode 100644
index 000000000..7800db73f
--- /dev/null
+++ b/R/prepare_next_iteration.R
@@ -0,0 +1,80 @@
+#' Prepares the next iteration of the adaptive sampling algorithm
+#'
+#' @inheritParams default_doc_explain
+#'
+#' @export
+#' @keywords internal
+prepare_next_iteration <- function(internal) {
+ iter <- length(internal$iter_list)
+ converged <- internal$iter_list[[iter]]$converged
+ paired_shap_sampling <- internal$parameters$paired_shap_sampling
+
+
+ if (converged == FALSE) {
+ next_iter_list <- list()
+
+ n_shapley_values <- internal$parameters$n_shapley_values
+ reduction_factor_vec <- internal$parameters$adaptive_arguments$reduction_factor_vec
+ fixed_n_coalitions_per_iter <- internal$parameters$adaptive_arguments$fixed_n_coalitions_per_iter
+ max_n_coalitions <- internal$parameters$adaptive_arguments$max_n_coalitions
+
+
+ est_remaining_coalitions <- internal$iter_list[[iter]]$est_remaining_coalitions
+ reduction_factor <- internal$iter_list[[iter]]$reduction_factor
+ current_n_coalitions <- internal$iter_list[[iter]]$n_coalitions
+ current_coal_samples <- internal$iter_list[[iter]]$coal_samples
+
+ if (is.null(fixed_n_coalitions_per_iter)) {
+ proposal_next_n_coalitions <- current_n_coalitions + ceiling(est_remaining_coalitions * reduction_factor)
+ } else {
+ proposal_next_n_coalitions <- current_n_coalitions + fixed_n_coalitions_per_iter
+ }
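+ # Sizing sketch (hypothetical numbers): with current_n_coalitions = 100,
+ # est_remaining_coalitions = 500 and reduction_factor = 0.2, the proposal above becomes
+ # 100 + ceiling(500 * 0.2) = 200 coalitions, before the thresholding and rounding below.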
+
+ # Thresholding if max_n_coalitions is reached
+ proposal_next_n_coalitions <- min(
+ max_n_coalitions,
+ proposal_next_n_coalitions
+ )
+
+ if (paired_shap_sampling) {
+ proposal_next_n_coalitions <- ceiling(proposal_next_n_coalitions * 0.5) * 2
+ }
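+ # (Paired sampling draws each coalition together with its complement, so the number of
+ # sampled coalitions must stay even; hence the rounding up above.)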
+
+
+ if ((proposal_next_n_coalitions) >= 2^n_shapley_values) {
+ # Use all coalitions in the last iteration as the estimated number of samples is more than what remains
+ next_iter_list$exact <- TRUE
+ next_iter_list$n_coalitions <- 2^n_shapley_values
+ next_iter_list$compute_sd <- FALSE
+ } else {
+ # Sample more coalitions, keeping the already sampled ones
+ next_iter_list$exact <- FALSE
+ next_iter_list$n_coalitions <- proposal_next_n_coalitions
+ next_iter_list$compute_sd <- TRUE
+ }
+
+ if (!is.null(reduction_factor_vec[1])) {
+ next_iter_list$reduction_factor <- ifelse(
+ length(reduction_factor_vec) >= iter,
+ reduction_factor_vec[iter],
+ reduction_factor_vec[length(reduction_factor_vec)]
+ )
+ } else {
+ next_iter_list$reduction_factor <- NULL
+ }
+
+ next_iter_list$new_n_coalitions <- next_iter_list$n_coalitions - current_n_coalitions
+
+ next_iter_list$n_batches <- set_n_batches(next_iter_list$new_n_coalitions, internal)
+
+
+ next_iter_list$prev_coal_samples <- current_coal_samples
+ } else {
+ next_iter_list <- list()
+ }
+
+ internal$iter_list[[iter + 1]] <- next_iter_list
+
+
+ return(internal)
+}
diff --git a/R/print_iter.R b/R/print_iter.R
new file mode 100644
index 000000000..6c1a3491b
--- /dev/null
+++ b/R/print_iter.R
@@ -0,0 +1,109 @@
+#' Prints iterative information
+#'
+#' @inheritParams default_doc_explain
+#'
+#' @export
+#' @keywords internal
+print_iter <- function(internal) {
+ verbose <- internal$parameters$verbose
+ iter <- length(internal$iter_list) - 1 # This function is called after the preparation of the next iteration
+
+ converged <- internal$iter_list[[iter]]$converged
+ converged_exact <- internal$iter_list[[iter]]$converged_exact
+ converged_sd <- internal$iter_list[[iter]]$converged_sd
+ converged_max_iter <- internal$iter_list[[iter]]$converged_max_iter
+ converged_max_n_coalitions <- internal$iter_list[[iter]]$converged_max_n_coalitions
+ overall_conv_measure <- internal$iter_list[[iter]]$overall_conv_measure
+ reduction_factor <- internal$iter_list[[iter]]$reduction_factor
+
+ saving_path <- internal$parameters$adaptive_arguments$saving_path
+ convergence_tolerance <- internal$parameters$adaptive_arguments$convergence_tolerance
+ testing <- internal$parameters$testing
+
+ if ("convergence" %in% verbose) {
+
+ current_n_coalitions <- internal$iter_list[[iter]]$n_coalitions
+ est_remaining_coalitions <- internal$iter_list[[iter]]$est_remaining_coalitions
+ est_required_coalitions <- internal$iter_list[[iter]]$est_required_coalitions
+
+ next_n_coalitions <- internal$iter_list[[iter + 1]]$n_coalitions
+ next_new_n_coalitions <- internal$iter_list[[iter + 1]]$new_n_coalitions
+
+ cli::cli_h3("Convergence info")
+
+ if (isFALSE(converged)) {
+ msg <- "Not converged after {current_n_coalitions} coalitions:\n"
+
+ if (!is.null(convergence_tolerance)) {
+ conv_nice <- signif(overall_conv_measure, 2)
+ tol_nice <- format(signif(convergence_tolerance, 2), scientific = FALSE)
+ reduction_factor_nice <- format(signif(reduction_factor * 100, 2), scientific = FALSE)
+ msg <- paste0(
+ msg,
+ "Current convergence measure: {conv_nice} [needs {tol_nice}]\n",
+ "Estimated remaining coalitions: {est_remaining_coalitions}\n",
+ "(Concervatively) adding {reduction_factor_nice}% of that ({next_new_n_coalitions} coalitions) ",
+ "in the next iteration."
+ )
+ }
+ cli::cli_alert_info(msg)
+ } else {
+ msg <- "Converged after {current_n_coalitions} coalitions:\n"
+ if (isTRUE(converged_exact)) {
+ msg <- paste0(
+ msg,
+ "All ({current_n_coalitions}) coalitions used.\n"
+ )
+ }
+ if (isTRUE(converged_sd)) {
+ msg <- paste0(
+ msg,
+ "Convergence tolerance reached!\n"
+ )
+ }
+ if (isTRUE(converged_max_iter)) {
+ msg <- paste0(
+ msg,
+ "Maximum number of iterations reached!\n"
+ )
+ }
+ if (isTRUE(converged_max_n_coalitions)) {
+ msg <- paste0(
+ msg,
+ "Maximum number of coalitions reached!\n"
+ )
+ }
+ cli::cli_alert_success(msg)
+ }
+ }
+
+ if ("shapley" %in% verbose) {
+ n_explain <- internal$parameters$n_explain
+
+ dt_shapley_est <- internal$iter_list[[iter]]$dt_shapley_est[, -1]
+ dt_shapley_sd <- internal$iter_list[[iter]]$dt_shapley_sd[, -1]
+
+ # Printing the current Shapley values
+ matrix1 <- format(round(dt_shapley_est, 3), nsmall = 2, justify = "right")
+ matrix2 <- format(round(dt_shapley_sd, 2), nsmall = 2, justify = "right")
+
+ if (isTRUE(converged)) {
+ msg <- "Final "
+ } else {
+ msg <- "Current "
+ }
+
+ if (converged_exact) {
+ msg <- paste0(msg, "estimated Shapley values")
+ print_dt <- as.data.table(matrix1)
+ } else {
+ msg <- paste0(msg, "estimated Shapley values (sd)")
+ print_dt <- as.data.table(matrix(paste(matrix1, " (", matrix2, ") ", sep = ""), nrow = n_explain))
+ }
+
+ cli::cli_h3(msg)
+ names(print_dt) <- names(dt_shapley_est)
+ print(print_dt)
+ }
+}
diff --git a/R/save_results.R b/R/save_results.R
new file mode 100644
index 000000000..ca48c0098
--- /dev/null
+++ b/R/save_results.R
@@ -0,0 +1,22 @@
+#' Saves the intermediate results to disk
+#'
+#' @inheritParams default_doc_explain
+#'
+#' @export
+#' @keywords internal
+save_results <- function(internal) {
+ saving_path <- internal$parameters$adaptive_arguments$saving_path
+
+ # Modify name for the new file
+ filename <- basename(saving_path)
+ dirname <- dirname(saving_path)
+ filename_copy <- paste0("new_", filename)
+ saving_path_copy <- file.path(dirname, filename_copy)
+
+ # Save the results to a new file first, then delete the old file and rename the new one, so an interruption cannot leave a corrupt file
+
+ # Saving parameters and iter_list
+ saveRDS(internal[c("parameters", "iter_list")], saving_path_copy)
+ if (file.exists(saving_path)) file.remove(saving_path)
+ file.rename(saving_path_copy, saving_path)
+}
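+
+# Note: the file written above is what explain() later accepts as `prev_shapr_object`;
+# it contains only the "parameters" and "iter_list" elements (see get_prev_internal() in setup.R).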
diff --git a/R/setup.R b/R/setup.R
index 5f2f2b548..96788227f 100644
--- a/R/setup.R
+++ b/R/setup.R
@@ -16,16 +16,22 @@
#' @param is_python Logical. Indicates whether the function is called from the Python wrapper. Default is FALSE which is
#' never changed when calling the function via `explain()` in R. The parameter is later used to disallow
#' running the AICc-versions of the empirical as that requires data based optimization.
+#' @param testing Logical.
+#' Only used to remove random components like timing from the object output when comparing outputs with testthat.
+#' Defaults to `FALSE`.
+#' @param init_time POSIXct object.
+#' The time when the `explain()` function was called, as outputted by `Sys.time()`.
+#' Used to calculate the time it took to run the full `explain` call.
#' @export
setup <- function(x_train,
x_explain,
approach,
+ paired_shap_sampling = FALSE,
prediction_zero,
output_size = 1,
- n_combinations,
+ max_n_coalitions,
group,
- n_samples,
- n_batches,
+ n_MC_samples,
seed,
keep_samp_for_vS,
feature_specs,
@@ -39,20 +45,39 @@ setup <- function(x_train,
explain_y_lags = NULL,
explain_xreg_lags = NULL,
group_lags = NULL,
- timing,
verbose,
+ adaptive = NULL,
+ adaptive_arguments = list(),
+ shapley_reweighting = "none",
is_python = FALSE,
+ testing = FALSE,
+ init_time = NULL,
+ prev_shapr_object = NULL,
...) {
internal <- list()
+ # Use the parameters and iter_list from a previous shapr object to continue the estimation from where it stopped
+ if (is.null(prev_shapr_object)) {
+ prev_iter_list <- NULL
+ } else {
+ prev_internal <- get_prev_internal(prev_shapr_object)
+
+ prev_iter_list <- prev_internal$iter_list
+
+ # Overwrite the input arguments set in explain() with those from prev_shapr_object,
+ # except model, x_explain, x_train, max_n_coalitions, adaptive_arguments, seed
+ list2env(prev_internal$parameters, envir = environment())
+ }
+
+
internal$parameters <- get_parameters(
approach = approach,
+ paired_shap_sampling = paired_shap_sampling,
prediction_zero = prediction_zero,
output_size = output_size,
- n_combinations = n_combinations,
+ max_n_coalitions = max_n_coalitions,
group = group,
- n_samples = n_samples,
- n_batches = n_batches,
+ n_MC_samples = n_MC_samples,
seed = seed,
keep_samp_for_vS = keep_samp_for_vS,
type = type,
@@ -63,9 +88,12 @@ setup <- function(x_train,
explain_xreg_lags = explain_xreg_lags,
group_lags = group_lags,
MSEv_uniform_comb_weights = MSEv_uniform_comb_weights,
- timing = timing,
verbose = verbose,
+ adaptive = adaptive,
+ adaptive_arguments = adaptive_arguments,
+ shapley_reweighting = shapley_reweighting,
is_python = is_python,
+ testing = testing,
...
)
@@ -92,148 +120,267 @@ setup <- function(x_train,
internal <- check_and_set_parameters(internal)
+ internal <- set_adaptive_parameters(internal, prev_iter_list)
+
+ internal$timing_list <- list(
+ init_time = init_time,
+ setup = Sys.time()
+ )
+
return(internal)
}
-#' @keywords internal
-check_and_set_parameters <- function(internal) {
- # Check groups
- feature_names <- internal$parameters$feature_names
- group <- internal$parameters$group
- n_combinations <- internal$parameters$n_combinations
- n_features <- internal$parameters$n_features
- n_groups <- internal$parameters$n_groups
- is_groupwise <- internal$parameters$is_groupwise
- exact <- internal$parameters$exact
-
- if (!is.null(group)) check_groups(feature_names, group)
+get_prev_internal <- function(prev_shapr_object,
+ exclude_parameters = c("max_n_coalitions", "adaptive_arguments", "seed")) {
+ cl <- class(prev_shapr_object)[1]
- if (exact) {
- internal$parameters$used_n_combinations <- if (is_groupwise) 2^n_groups else 2^n_features
+ if (cl == "character") {
+ internal <- readRDS(file = prev_shapr_object) # Already contains only "parameters" and "iter_list"
+ } else if (cl == "shapr") {
+ internal <- prev_shapr_object$internal[c("parameters", "iter_list")]
} else {
- internal$parameters$used_n_combinations <-
- if (is_groupwise) min(2^n_groups, n_combinations) else min(2^n_features, n_combinations)
- check_n_combinations(internal)
+ stop("Invalid `shapr_object` passed to explain(). See ?explain for details.")
}
- # Check approach
- check_approach(internal)
-
- # Setting default value for n_batches (when NULL)
- internal <- set_defaults(internal)
+ if (length(exclude_parameters) > 0) {
+ internal$parameters[exclude_parameters] <- NULL
+ }
- # Checking n_batches vs n_combinations etc
- check_n_batches(internal)
+ iter <- length(internal$iter_list)
+ internal$iter_list[[iter]]$converged <- FALSE # hard setting the convergence parameter
- # Check regression if we are doing regression
- if (internal$parameters$regression) internal <- regression.check(internal)
return(internal)
}
+
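+# Usage sketch (hypothetical objects/paths): the estimation can be continued either from a
+# returned "shapr" object or from the RDS file written by save_results(), e.g.
+# explain(..., prev_shapr_object = previous_explanation) or
+# explain(..., prev_shapr_object = "path/to/saved_results.rds").
+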
#' @keywords internal
-#' @author Lars Henry Berge Olsen
-regression.check <- function(internal) {
- # Check that the model outputs one-dimensional predictions
- if (internal$parameters$output_size != 1) {
- stop("`regression_separate` and `regression_surrogate` only support models with one-dimensional output")
+get_parameters <- function(approach,
+ paired_shap_sampling,
+ prediction_zero,
+ output_size = 1,
+ max_n_coalitions,
+ group,
+ n_MC_samples,
+ seed,
+ keep_samp_for_vS,
+ type,
+ horizon,
+ train_idx,
+ explain_idx,
+ explain_y_lags,
+ explain_xreg_lags,
+ group_lags = NULL,
+ MSEv_uniform_comb_weights,
+ verbose = "basic",
+ adaptive = FALSE,
+ adaptive_arguments = list(),
+ shapley_reweighting = "none",
+ testing, is_python, ...) {
+ # Check input type for approach
+
+ # approach is checked more comprehensively later
+ if (!(is.logical(paired_shap_sampling) && length(paired_shap_sampling) == 1)) {
+ stop("`paired_shap_sampling` must be a single logical.")
}
- # Check that we are NOT explaining a forecast model
- if (internal$parameters$type == "forecast") {
- stop("`regression_separate` and `regression_surrogate` does not support `forecast`.")
+ if (!is.null(adaptive) && !(is.logical(adaptive) && length(adaptive) == 1)) {
+ stop("`adaptive` must be NULL or a single logical.")
+ }
+ if (!is.list(adaptive_arguments)) {
+ stop("`adaptive_arguments` must be a list.")
}
- # Check that we are not to keep the Monte Carlo samples
- if (internal$parameters$keep_samp_for_vS) {
- stop(paste(
- "`keep_samp_for_vS` must be `FALSE` for the `regression_separate` and `regression_surrogate`",
- "approaches as there are no Monte Carlo samples to keep for these approaches."
- ))
+
+ # max_n_coalitions
+ if (!is.null(max_n_coalitions) &&
+ !(is.wholenumber(max_n_coalitions) &&
+ length(max_n_coalitions) == 1 &&
+ !is.na(max_n_coalitions) &&
+ max_n_coalitions > 0)) {
+ stop("`max_n_coalitions` must be NULL or a single positive integer.")
}
- # Remove n_samples if we are doing regression, as we are not doing MC sampling
- internal$parameters$n_samples <- NULL
+ # group (checked more thoroughly later)
+ if (!is.null(group) &&
+ !is.list(group)) {
+ stop("`group` must be NULL or a list")
+ }
- return(internal)
-}
+ # n_MC_samples
+ if (!(is.wholenumber(n_MC_samples) &&
+ length(n_MC_samples) == 1 &&
+ !is.na(n_MC_samples) &&
+ n_MC_samples > 0)) {
+ stop("`n_MC_samples` must be a single positive integer.")
+ }
-#' @keywords internal
-check_n_combinations <- function(internal) {
- is_groupwise <- internal$parameters$is_groupwise
- n_combinations <- internal$parameters$n_combinations
- n_features <- internal$parameters$n_features
- n_groups <- internal$parameters$n_groups
+ # keep_samp_for_vS
+ if (!(is.logical(keep_samp_for_vS) &&
+ length(keep_samp_for_vS) == 1)) {
+ stop("`keep_samp_for_vS` must be single logical.")
+ }
- type <- internal$parameters$type
+ # type
+ if (!(type %in% c("normal", "forecast"))) {
+ stop("`type` must be either `normal` or `forecast`.\n")
+ }
+
+ # verbose
+ check_verbose(verbose)
+
+ # parameters only used for type "forecast"
if (type == "forecast") {
- horizon <- internal$parameters$horizon
- explain_y_lags <- internal$parameters$explain_lags$y
- explain_xreg_lags <- internal$parameters$explain_lags$xreg
- xreg <- internal$data$xreg
+ if (!(is.wholenumber(horizon) && all(horizon > 0))) {
+ stop("`horizon` must be a vector (or scalar) of positive integers.\n")
+ }
- if (!is_groupwise) {
- if (n_combinations <= n_features) {
- stop(paste0(
- "`n_combinations` (", n_combinations, ") has to be greater than the number of components to decompose ",
- " the forecast onto:\n",
- "`horizon` (", horizon, ") + `explain_y_lags` (", explain_y_lags, ") ",
- "+ sum(`explain_xreg_lags`) (", sum(explain_xreg_lags), ").\n"
- ))
- }
- } else {
- if (n_combinations <= n_groups) {
- stop(paste0(
- "`n_combinations` (", n_combinations, ") has to be greater than the number of components to decompose ",
- "the forecast onto:\n",
- "ncol(`xreg`) (", ncol(`xreg`), ") + 1"
- ))
- }
+ if (any(horizon != output_size)) {
+ stop(paste0("`horizon` must match the output size of the model (", paste0(output_size, collapse = ", "), ").\n"))
}
- } else {
- if (!is_groupwise) {
- if (n_combinations <= n_features) stop("`n_combinations` has to be greater than the number of features.")
- } else {
- if (n_combinations <= n_groups) stop("`n_combinations` has to be greater than the number of groups.")
+
+ if (!(length(train_idx) > 1 && is.wholenumber(train_idx) && all(train_idx > 0) && all(is.finite(train_idx)))) {
+ stop("`train_idx` must be a vector of positive finite integers and length > 1.\n")
}
- }
-}
+ if (!(is.wholenumber(explain_idx) && all(explain_idx > 0) && all(is.finite(explain_idx)))) {
+ stop("`explain_idx` must be a vector of positive finite integers.\n")
+ }
+ if (!(is.wholenumber(explain_y_lags) && all(explain_y_lags >= 0) && all(is.finite(explain_y_lags)))) {
+ stop("`explain_y_lags` must be a vector of positive finite integers.\n")
+ }
-#' @keywords internal
-check_n_batches <- function(internal) {
- n_batches <- internal$parameters$n_batches
- n_features <- internal$parameters$n_features
- n_combinations <- internal$parameters$n_combinations
- is_groupwise <- internal$parameters$is_groupwise
- n_groups <- internal$parameters$n_groups
- n_unique_approaches <- internal$parameters$n_unique_approaches
+ if (!(is.wholenumber(explain_xreg_lags) && all(explain_xreg_lags >= 0) && all(is.finite(explain_xreg_lags)))) {
+ stop("`explain_xreg_lags` must be a vector of positive finite integers.\n")
+ }
- if (!is_groupwise) {
- actual_n_combinations <- ifelse(is.null(n_combinations), 2^n_features, n_combinations)
- } else {
- actual_n_combinations <- ifelse(is.null(n_combinations), 2^n_groups, n_combinations)
+ if (!(is.logical(group_lags) && length(group_lags) == 1)) {
+ stop("`group_lags` must be a single logical.\n")
+ }
}
- if (n_batches >= actual_n_combinations) {
- stop(paste0(
- "`n_batches` (", n_batches, ") must be smaller than the number of feature combinations/`n_combinations` (",
- actual_n_combinations, ")"
- ))
+ # Parameter used in the MSEv evaluation criterion
+ if (!(is.logical(MSEv_uniform_comb_weights) && length(MSEv_uniform_comb_weights) == 1)) {
+ stop("`MSEv_uniform_comb_weights` must be single logical.")
}
- if (n_batches < n_unique_approaches) {
+ #### Tests combining more than one parameter ####
+ # prediction_zero vs output_size
+ if (!all((is.numeric(prediction_zero)) &&
+ all(length(prediction_zero) == output_size) &&
+ all(!is.na(prediction_zero)))) {
stop(paste0(
- "`n_batches` (", n_batches, ") must be larger than the number of unique approaches in `approach` (",
- n_unique_approaches, ")."
+ "`prediction_zero` (", paste0(prediction_zero, collapse = ", "),
+ ") must be numeric and match the output size of the model (",
+ paste0(output_size, collapse = ", "), ")."
))
}
+
+ # shapley_reweighting
+ if (!(length(shapley_reweighting) == 1 && shapley_reweighting %in%
+ c("none", "on_N", "on_coal_size", "on_all", "on_N_sum", "on_all_cond", "on_all_cond_paired", "comb"))) {
+ stop(
+ "`shapley_reweighting` must be one of `none`, `on_N`, `on_coal_size`, `on_N_sum`, ",
+ "`on_all`, `on_all_cond`, `on_all_cond_paired` or `comb`.\n"
+ )
+ }
+
+
+ # Getting basic input parameters
+ parameters <- list(
+ approach = approach,
+ paired_shap_sampling = paired_shap_sampling,
+ prediction_zero = prediction_zero,
+ max_n_coalitions = max_n_coalitions,
+ group = group,
+ n_MC_samples = n_MC_samples,
+ seed = seed,
+ keep_samp_for_vS = keep_samp_for_vS,
+ is_python = is_python,
+ output_size = output_size,
+ type = type,
+ horizon = horizon,
+ group_lags = group_lags,
+ MSEv_uniform_comb_weights = MSEv_uniform_comb_weights,
+ verbose = verbose,
+ shapley_reweighting = shapley_reweighting,
+ adaptive = adaptive,
+ adaptive_arguments = adaptive_arguments,
+ testing = testing
+ )
+
+ # Getting additional parameters from ...
+ parameters <- append(parameters, list(...))
+
+ # Flag that we are using a regression approach, based on the approach name (any(), in case of several approaches)
+ parameters$regression <- any(grepl("regression", parameters$approach))
+
+ return(parameters)
+}
+
+#' Function that checks the verbose parameter
+#'
+#' @inheritParams explain
+#'
+#' @return The function does not return anything.
+#'
+#' @keywords internal
+#' @author Lars Henry Berge Olsen, Martin Jullum
+check_verbose <- function(verbose) {
+ if (!is.null(verbose) &&
+ (!is.character(verbose) || !(all(verbose %in% c("basic", "progress", "convergence", "shapley", "vS_details"))))
+ ) {
+ stop(
+ paste0(
+ "`verbose` must be NULL or a string (vector) containing one or more of the strings ",
+ "`basic`, `progress`, `convergence`, `shapley`, `vS_details`.\n"
+ )
+ )
+ }
}
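+
+# For example, verbose = c("basic", "convergence") passes this check, while a hypothetical
+# value like verbose = "detailed" would trigger the error above.
+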
+#' @keywords internal
+get_data <- function(x_train, x_explain) {
+ # Check data object type
+ stop_message <- ""
+ if (!is.matrix(x_train) && !is.data.frame(x_train)) {
+ stop_message <- paste0(stop_message, "x_train should be a matrix or a data.frame/data.table.\n")
+ }
+ if (!is.matrix(x_explain) && !is.data.frame(x_explain)) {
+ stop_message <- paste0(stop_message, "x_explain should be a matrix or a data.frame/data.table.\n")
+ }
+ if (stop_message != "") {
+ stop(stop_message)
+ }
+
+ # Check column names
+ if (all(is.null(colnames(x_train)))) {
+ stop_message <- paste0(stop_message, "x_train misses column names.\n")
+ }
+ if (all(is.null(colnames(x_explain)))) {
+ stop_message <- paste0(stop_message, "x_explain misses column names.\n")
+ }
+ if (stop_message != "") {
+ stop(stop_message)
+ }
+ data <- list(
+ x_train = data.table::as.data.table(x_train),
+ x_explain = data.table::as.data.table(x_explain)
+ )
+
+ return(data)
+}
#' @keywords internal
@@ -292,27 +439,6 @@ check_data <- function(internal) {
compare_feature_specs(x_train_feature_specs, x_explain_feature_specs, "x_train", "x_explain")
}
-compare_vecs <- function(vec1, vec2, vec_type, name1, name2) {
- if (!identical(vec1, vec2)) {
- if (is.null(names(vec1))) {
- text_vec1 <- paste(vec1, collapse = ", ")
- } else {
- text_vec1 <- paste(names(vec1), vec1, sep = ": ", collapse = ", ")
- }
- if (is.null(names(vec2))) {
- text_vec2 <- paste(vec2, collapse = ", ")
- } else {
- text_vec2 <- paste(names(vec2), vec1, sep = ": ", collapse = ", ")
- }
-
- stop(paste0(
- "Feature ", vec_type, " are not identical for ", name1, " and ", name2, ".\n",
- name1, " provided: ", text_vec1, ",\n",
- name2, " provided: ", text_vec2, ".\n"
- ))
- }
-}
-
compare_feature_specs <- function(spec1, spec2, name1 = "model", name2 = "x_train", sort_labels = FALSE) {
if (sort_labels) {
compare_vecs(sort(spec1$labels), sort(spec2$labels), "names", name1, name2)
@@ -334,7 +460,6 @@ compare_feature_specs <- function(spec1, spec2, name1 = "model", name2 = "x_trai
}
}
-
#' This includes both extra parameters and other objects
#' @keywords internal
get_extra_parameters <- function(internal) {
@@ -361,18 +486,29 @@ get_extra_parameters <- function(internal) {
"\nSuccess with message:\n
Group names not provided. Assigning them the default names 'group1', 'group2', 'group3' etc."
)
- names(internal$parameters$group) <- paste0("group", seq_along(group))
+ names(group) <- paste0("group", seq_along(group))
}
# Make group list with numeric feature indicators
- internal$objects$group_num <- lapply(group, FUN = function(x) {
+ internal$objects$coal_feature_list <- lapply(group, FUN = function(x) {
match(x, internal$parameters$feature_names)
})
+
internal$parameters$n_groups <- length(group)
+ internal$parameters$group_names <- names(group)
+ internal$parameters$group <- group
+ internal$parameters$shap_names <- internal$parameters$group_names
+ internal$parameters$n_shapley_values <- internal$parameters$n_groups
} else {
- internal$objects$group_num <- NULL
+ internal$objects$coal_feature_list <- as.list(seq_len(internal$parameters$n_features))
+
internal$parameters$n_groups <- NULL
+ internal$parameters$group_names <- NULL
+ internal$parameters$shap_names <- internal$parameters$feature_names
+ internal$parameters$n_shapley_values <- internal$parameters$n_features
}
# Get the number of unique approaches
@@ -382,226 +518,402 @@ get_extra_parameters <- function(internal) {
return(internal)
}
+#' Fetches feature information from a given data set
+#'
+#' @param x A matrix, data.frame or data.table containing the data to extract feature information from.
+#'
+#' @details This function is used to extract the feature information to be checked against the corresponding
+#' information extracted from the model and other data sets. The function is called internally.
+#'
+#' @return A list with the following elements:
+#' \describe{
+#' \item{labels}{character vector with the feature names to compute Shapley values for}
+#' \item{classes}{a named character vector with the labels as names and the class types as elements}
+#' \item{factor_levels}{a named list with the labels as names and character vectors with the factor levels as elements
+#' (NULL if the feature is not a factor)}
+#' }
+#' @author Martin Jullum
+#'
#' @keywords internal
-get_parameters <- function(approach, prediction_zero, output_size = 1, n_combinations, group, n_samples,
- n_batches, seed, keep_samp_for_vS, type, horizon, train_idx, explain_idx, explain_y_lags,
- explain_xreg_lags, group_lags = NULL, MSEv_uniform_comb_weights, timing, verbose,
- is_python, ...) {
- # Check input type for approach
-
- # approach is checked more comprehensively later
-
- # n_combinations
- if (!is.null(n_combinations) &&
- !(is.wholenumber(n_combinations) &&
- length(n_combinations) == 1 &&
- !is.na(n_combinations) &&
- n_combinations > 0)) {
- stop("`n_combinations` must be NULL or a single positive integer.")
- }
-
- # group (checked more thoroughly later)
- if (!is.null(group) &&
- !is.list(group)) {
- stop("`group` must be NULL or a list")
- }
+#' @export
+#'
+#' @examples
+#' # Load example data
+#' data("airquality")
+#' airquality <- airquality[complete.cases(airquality), ]
+#' # Split data into test- and training data
+#' x_train <- head(airquality, -3)
+#' x_explain <- tail(airquality, 3)
+#' # Example with a factor feature
+#' x_train <- data.table::as.data.table(head(airquality))
+#' x_train[, Temp := as.factor(Temp)]
+#' get_data_specs(x_train)
+get_data_specs <- function(x) {
+ feature_specs <- list()
+ feature_specs$labels <- names(x)
+ feature_specs$classes <- unlist(lapply(x, class))
+ feature_specs$factor_levels <- lapply(x, levels)
- # n_samples
- if (!(is.wholenumber(n_samples) &&
- length(n_samples) == 1 &&
- !is.na(n_samples) &&
- n_samples > 0)) {
- stop("`n_samples` must be a single positive integer.")
- }
- # n_batches
- if (!is.null(n_batches) &&
- !(is.wholenumber(n_batches) &&
- length(n_batches) == 1 &&
- !is.na(n_batches) &&
- n_batches > 0)) {
- stop("`n_batches` must be NULL or a single positive integer.")
- }
+ # Defining all integer values as numeric
+ feature_specs$classes[feature_specs$classes == "integer"] <- "numeric"
- # seed is already set, so we know it works
- # keep_samp_for_vS
- if (!(is.logical(timing) &&
- length(timing) == 1)) {
- stop("`timing` must be single logical.")
- }
+ return(feature_specs)
+}
- # keep_samp_for_vS
- if (!(is.logical(keep_samp_for_vS) &&
- length(keep_samp_for_vS) == 1)) {
- stop("`keep_samp_for_vS` must be single logical.")
- }
- # type
- if (!(type %in% c("normal", "forecast"))) {
- stop("`type` must be either `normal` or `forecast`.\n")
- }
- # verbose
- if (!is.numeric(verbose) || !(verbose %in% c(0, 1, 2))) {
- stop("`verbose` must be either `0` (no verbosity), `1` (low verbosity), or `2` (high verbosity).")
- }
- # parameters only used for type "forecast"
- if (type == "forecast") {
- if (!(is.wholenumber(horizon) && all(horizon > 0))) {
- stop("`horizon` must be a vector (or scalar) of positive integers.\n")
- }
+#' @keywords internal
+check_and_set_parameters <- function(internal) {
+ # Check groups
+ feature_names <- internal$parameters$feature_names
+ group <- internal$parameters$group
- if (any(horizon != output_size)) {
- stop(paste0("`horizon` must match the output size of the model (", paste0(output_size, collapse = ", "), ").\n"))
- }
+ if (!is.null(group)) check_groups(feature_names, group)
- if (!(length(train_idx) > 1 && is.wholenumber(train_idx) && all(train_idx > 0) && all(is.finite(train_idx)))) {
- stop("`train_idx` must be a vector of positive finite integers and length > 1.\n")
- }
+ # Adjust max_n_coalitions
+ internal$parameters$max_n_coalitions <- adjust_max_n_coalitions(internal)
- if (!(is.wholenumber(explain_idx) && all(explain_idx > 0) && all(is.finite(explain_idx)))) {
- stop("`explain_idx` must be a vector of positive finite integers.\n")
+ check_max_n_coalitions_fc(internal)
+
+ internal <- check_and_set_adaptive(internal) # sets the adaptive parameter if it is NULL (default)
+
+ internal <- set_exact(internal)
+
+ check_computability(internal)
+
+ # Check approach
+ check_approach(internal)
+
+ # Check regression if we are doing regression
+ if (internal$parameters$regression) internal <- check_regression(internal)
+
+ return(internal)
+}
+
+
+#' @keywords internal
+adjust_max_n_coalitions <- function(internal) {
+ is_groupwise <- internal$parameters$is_groupwise
+ max_n_coalitions <- internal$parameters$max_n_coalitions
+ n_features <- internal$parameters$n_features
+ n_groups <- internal$parameters$n_groups
+
+
+ # Adjust max_n_coalitions
+ if (isFALSE(is_groupwise)) { # feature wise
+ # Set max_n_coalitions to upper bound
+ if (is.null(max_n_coalitions) || max_n_coalitions > 2^n_features) {
+ max_n_coalitions <- 2^n_features
+ message(
+ paste0(
+ "Success with message:\n",
+ "max_n_coalitions is NULL or larger than or 2^n_features = ", 2^n_features, ", \n",
+ "and is therefore set to 2^n_features = ", 2^n_features, ".\n"
+ )
+ )
+ }
+ # Set max_n_coalitions to lower bound
+ if (isFALSE(is.null(max_n_coalitions)) && max_n_coalitions < min(10, n_features + 1)) {
+ if (n_features <= 3) {
+ max_n_coalitions <- 2^n_features
+ message(
+ paste0(
+ "Success with message:\n",
+ "n_features is smaller than or equal to 3, meaning there are so few unique coalitions (",
+ 2^n_features, ") that we should use all to get reliable results.\n",
+ "max_n_coalitions is therefore set to 2^n_features = ", 2^n_features, ".\n"
+ )
+ )
+ } else {
+ max_n_coalitions <- min(10, n_features + 1)
+ message(
+ paste0(
+ "Success with message:\n",
+ "max_n_coalitions is smaller than max(10, n_features + 1 = ", n_features + 1, "),",
+ "which will result in unreliable results.\n",
+ "It is therefore set to ", max(10, n_features + 1), ".\n"
+ )
+ )
+ }
+ }
+ } else { # group wise
+ # Set max_n_coalitions to upper bound
+ if (is.null(max_n_coalitions) || max_n_coalitions > 2^n_groups) {
+ max_n_coalitions <- 2^n_groups
+ message(
+ paste0(
+ "Success with message:\n",
+ "max_n_coalitions is NULL or larger than or 2^n_groups = ", 2^n_groups, ", \n",
+ "and is therefore set to 2^n_groups = ", 2^n_groups, ".\n"
+ )
+ )
}
+ # Set max_n_coalitions to lower bound
+ if (isFALSE(is.null(max_n_coalitions)) && max_n_coalitions < min(10, n_groups + 1)) {
+ if (n_groups <= 3) {
+ max_n_coalitions <- 2^n_groups
+ message(
+ paste0(
+ "Success with message:\n",
+ "n_groups is smaller than or equal to 3, meaning there are so few unique coalitions (", 2^n_groups, ") ",
+ "that we should use all to get reliable results.\n",
+ "max_n_coalitions is therefore set to 2^n_groups = ", 2^n_groups, ".\n"
+ )
+ )
+ } else {
+ max_n_coalitions <- min(10, n_groups + 1)
+ message(
+ paste0(
+ "Success with message:\n",
+ "max_n_coalitions is smaller than max(10, n_groups + 1 = ", n_groups + 1, "),",
+ "which will result in unreliable results.\n",
+ "It is therefore set to ", max(10, n_groups + 1), ".\n"
+ )
+ )
+ }
+ }
+ }
- if (!(is.wholenumber(explain_y_lags) && all(explain_y_lags >= 0) && all(is.finite(explain_y_lags)))) {
- stop("`explain_y_lags` must be a vector of positive finite integers.\n")
+
+ return(max_n_coalitions)
+}
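+
+# Illustrative sketch (not run; `internal` here is a hand-built stub, not a full
+# object from explain()): with n_features = 4, a requested max_n_coalitions of
+# 100 is capped at 2^4 = 16, while a value of 3 would be raised to
+# min(10, n_features + 1) = 5.
+#
+# internal <- list(parameters = list(
+#   is_groupwise = FALSE, max_n_coalitions = 100, n_features = 4, n_groups = NULL
+# ))
+# adjust_max_n_coalitions(internal) # 16, with an explanatory message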
+
+check_max_n_coalitions_fc <- function(internal) {
+ is_groupwise <- internal$parameters$is_groupwise
+ max_n_coalitions <- internal$parameters$max_n_coalitions
+ n_features <- internal$parameters$n_features
+ n_groups <- internal$parameters$n_groups
+
+ type <- internal$parameters$type
+
+ if (type == "forecast") {
+ horizon <- internal$parameters$horizon
+ explain_y_lags <- internal$parameters$explain_lags$y
+ explain_xreg_lags <- internal$parameters$explain_lags$xreg
+ xreg <- internal$data$xreg
+
+ if (!is_groupwise) {
+ if (max_n_coalitions <= n_features) {
+ stop(paste0(
+ "`max_n_coalitions` (", max_n_coalitions, ") has to be greater than the number of ",
+ "components to decompose the forecast onto:\n",
+ "`horizon` (", horizon, ") + `explain_y_lags` (", explain_y_lags, ") ",
+ "+ sum(`explain_xreg_lags`) (", sum(explain_xreg_lags), ").\n"
+ ))
+ }
+ } else {
+ if (max_n_coalitions <= n_groups) {
+ stop(paste0(
+ "`max_n_coalitions` (", max_n_coalitions, ") has to be greater than the number of ",
+ "components to decompose the forecast onto:\n",
+ "ncol(`xreg`) (", ncol(`xreg`), ") + 1"
+ ))
+ }
}
+ }
+}
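+
+# Illustrative sketch (not run; stub `internal`): for a feature-wise forecast
+# explanation, max_n_coalitions must exceed the number of components
+# horizon + explain_y_lags + sum(explain_xreg_lags); here 3 + 2 + 1 = 6, so
+# max_n_coalitions = 5 triggers the error above.
+#
+# internal <- list(
+#   parameters = list(
+#     is_groupwise = FALSE, max_n_coalitions = 5, n_features = 6, n_groups = NULL,
+#     type = "forecast", horizon = 3, explain_lags = list(y = 2, xreg = 1)
+#   ),
+#   data = list(xreg = NULL)
+# )
+# check_max_n_coalitions_fc(internal) # errors, since 5 <= 6 components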
- if (!(is.wholenumber(explain_xreg_lags) && all(explain_xreg_lags >= 0) && all(is.finite(explain_xreg_lags)))) {
- stop("`explain_xreg_lags` must be a vector of positive finite integers.\n")
+check_and_set_adaptive <- function(internal) {
+ adaptive <- internal$parameters$adaptive
+ approach <- internal$parameters$approach
+
+ # Always adaptive = FALSE for vaeac and regression_surrogate
+ if (any(approach %in% c("vaeac", "regression_surrogate"))) {
+ unsupported <- approach[approach %in% c("vaeac", "regression_surrogate")]
+
+ if (isTRUE(adaptive)) {
+ warning(
+ paste0(
+ "Adaptive estimation of Shapley values are not supported for approach = ",
+ paste0(unsupported, collapse = ", "), ". Setting adaptive = FALSE."
+ )
+ )
}
- if (!(is.logical(group_lags) && length(group_lags) == 1)) {
- stop("`group_lags` must be a single logical.\n")
+ internal$parameters$adaptive <- FALSE
+ } else {
+ # Sets the default value of adaptive to TRUE if computing more than 5 Shapley values for all other approaches
+ if (is.null(adaptive)) {
+ n_shapley_values <- internal$parameters$n_shapley_values # n_features if feature-wise and n_groups if group-wise
+ internal$parameters$adaptive <- isTRUE(n_shapley_values > 5)
}
}
- # Parameter used in the MSEv evaluation criterion
- if (!(is.logical(MSEv_uniform_comb_weights) && length(MSEv_uniform_comb_weights) == 1)) {
- stop("`MSEv_uniform_comb_weights` must be single logical.")
- }
+ return(internal)
+}
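+
+# Illustrative sketch (not run; stub `internal`): for Monte Carlo-based
+# approaches, adaptive = NULL defaults to TRUE when more than 5 Shapley values
+# are to be computed.
+#
+# internal <- list(parameters = list(
+#   adaptive = NULL, approach = "gaussian", n_shapley_values = 8
+# ))
+# check_and_set_adaptive(internal)$parameters$adaptive # TRUE, since 8 > 5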
- #### Tests combining more than one parameter ####
- # prediction_zero vs output_size
- if (!all((is.numeric(prediction_zero)) &&
- all(length(prediction_zero) == output_size) &&
- all(!is.na(prediction_zero)))) {
- stop(paste0(
- "`prediction_zero` (", paste0(prediction_zero, collapse = ", "),
- ") must be numeric and match the output size of the model (",
- paste0(output_size, collapse = ", "), ")."
- ))
+
+set_exact <- function(internal) {
+ max_n_coalitions <- internal$parameters$max_n_coalitions
+ n_features <- internal$parameters$n_features
+ n_groups <- internal$parameters$n_groups
+ is_groupwise <- internal$parameters$is_groupwise
+ adaptive <- internal$parameters$adaptive
+
+ if (isFALSE(adaptive) &&
+ (
+ (isFALSE(is_groupwise) && max_n_coalitions == 2^n_features) ||
+ (isTRUE(is_groupwise) && max_n_coalitions == 2^n_groups)
+ )
+ ) {
+ exact <- TRUE
+ } else {
+ exact <- FALSE
}
- # Getting basic input parameters
- parameters <- list(
- approach = approach,
- prediction_zero = prediction_zero,
- n_combinations = n_combinations,
- group = group,
- n_samples = n_samples,
- n_batches = n_batches,
- seed = seed,
- keep_samp_for_vS = keep_samp_for_vS,
- is_python = is_python,
- output_size = output_size,
- type = type,
- horizon = horizon,
- group_lags = group_lags,
- MSEv_uniform_comb_weights = MSEv_uniform_comb_weights,
- timing = timing,
- verbose = verbose
- )
+ internal$parameters$exact <- exact
- # Getting additional parameters from ...
- parameters <- append(parameters, list(...))
+ return(internal)
+}
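+
+# Illustrative sketch (not run; stub `internal`): exact is only set to TRUE for
+# the non-adaptive method when all coalitions are used, here 2^3 = 8 of 8.
+#
+# internal <- list(parameters = list(
+#   max_n_coalitions = 8, n_features = 3, n_groups = NULL,
+#   is_groupwise = FALSE, adaptive = FALSE
+# ))
+# set_exact(internal)$parameters$exact # TRUE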
- # Setting exact based on n_combinations (TRUE if NULL)
- parameters$exact <- ifelse(is.null(parameters$n_combinations), TRUE, FALSE)
- # Setting that we are using regression based the approach name (any in case several approaches)
- parameters$regression <- any(grepl("regression", parameters$approach))
+#' @keywords internal
+check_computability <- function(internal) {
+ is_groupwise <- internal$parameters$is_groupwise
+ max_n_coalitions <- internal$parameters$max_n_coalitions
+ n_features <- internal$parameters$n_features
+ n_groups <- internal$parameters$n_groups
+ exact <- internal$parameters$exact
- return(parameters)
+
+ # Warn against exact computation of Shapley values when there are more than 13 features/groups
+ if (isTRUE(exact)) {
+ if (isFALSE(is_groupwise) && n_features > 13) {
+ warning(
+ paste0(
+ "Due to computation time, we recommend not computing Shapley values exactly \n",
+ "with all 2^n_features (", 2^n_features, ") coalitions for n_features > 13.\n",
+ "Consider reducing max_n_coalitions and enabling adaptive estimation with adaptive = TRUE.\n"
+ )
+ )
+ }
+ if (isTRUE(is_groupwise) && n_groups > 13) {
+ warning(
+ paste0(
+ "Due to computation time, we recommend not computing Shapley values exactly \n",
+ "with all 2^n_groups (", 2^n_groups, ") coalitions for n_groups > 13.\n",
+ "Consider reducing max_n_coalitions and enabling adaptive estimation with adaptive = TRUE.\n"
+ )
+ )
+ }
+ } else {
+ if (isFALSE(is_groupwise) && n_features > 30) {
+ warning(
+ "Due to computation time, we strongly recommend enabling adaptive estimation with adaptive = TRUE",
+ " when n_features > 30.\n",
+ )
+ }
+ if (isTRUE(is_groupwise) && n_groups > 30) {
+ warning(
+ "Due to computation time, we strongly recommend enabling adaptive estimation with adaptive = TRUE",
+ " when n_groups > 30.\n",
+ )
+ }
+ }
}
+
+
+
#' @keywords internal
-get_data <- function(x_train, x_explain) {
- # Check data object type
- stop_message <- ""
- if (!is.matrix(x_train) && !is.data.frame(x_train)) {
- stop_message <- paste0(stop_message, "x_train should be a matrix or a data.frame/data.table.\n")
- }
- if (!is.matrix(x_explain) && !is.data.frame(x_explain)) {
- stop_message <- paste0(stop_message, "x_explain should be a matrix or a data.frame/data.table.\n")
+check_approach <- function(internal) {
+ # Check length of approach
+
+ approach <- internal$parameters$approach
+ n_features <- internal$parameters$n_features
+ supported_approaches <- get_supported_approaches()
+
+ if (!(is.character(approach) &&
+ (length(approach) == 1 || length(approach) == n_features - 1) &&
+ all(is.element(approach, supported_approaches)))
+ ) {
+ stop(
+ paste0(
+ "`approach` must be one of the following: '", paste0(supported_approaches, collapse = "', '"), "'.\n",
+ "These can also be combined (except 'regression_surrogate' and 'regression_separate') by passing a vector ",
+ "of length one less than the number of features (", n_features - 1, ")."
+ )
+ )
}
- if (stop_message != "") {
- stop(stop_message)
+
+ if (length(approach) > 1 && any(grepl("regression", approach))) {
+ stop("The `regression_separate` and `regression_surrogate` approaches cannot be combined with other approaches.")
}
+}
- # Check column names
- if (all(is.null(colnames(x_train)))) {
- stop_message <- paste0(stop_message, "x_train misses column names.\n")
+#' Gets the implemented approaches
+#'
+#' @return Character vector.
+#' The names of the implemented approaches that can be passed to argument `approach` in [explain()].
+#'
+#' @export
+get_supported_approaches <- function() {
+ substring(rownames(attr(methods(prepare_data), "info")), first = 14)
+}
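+
+# Example (not run): the S3 methods of prepare_data() are named
+# "prepare_data.<approach>", and the prefix "prepare_data." is 13 characters
+# long, so taking the substring from position 14 keeps only the approach names:
+#
+# get_supported_approaches()
+# #> [1] "categorical" "copula" "ctree" ... (the exact set depends on the
+# #> methods registered in the installed version)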
+
+
+
+
+#' @keywords internal
+#' @author Lars Henry Berge Olsen
+check_regression <- function(internal) {
+ # Check that the model outputs one-dimensional predictions
+ if (internal$parameters$output_size != 1) {
+ stop("`regression_separate` and `regression_surrogate` only support models with one-dimensional output")
}
- if (all(is.null(colnames(x_explain)))) {
- stop_message <- paste0(stop_message, "x_explain misses column names.\n")
+
+ # Check that we are NOT explaining a forecast model
+ if (internal$parameters$type == "forecast") {
+ stop("`regression_separate` and `regression_surrogate` does not support `forecast`.")
}
- if (stop_message != "") {
- stop(stop_message)
+
+ # Check that we are not to keep the Monte Carlo samples
+ if (internal$parameters$keep_samp_for_vS) {
+ stop(paste(
+ "`keep_samp_for_vS` must be `FALSE` for the `regression_separate` and `regression_surrogate`",
+ "approaches as there are no Monte Carlo samples to keep for these approaches."
+ ))
}
+ # Remove n_MC_samples if we are doing regression, as we are not doing MC sampling
+ internal$parameters$n_MC_samples <- NULL
- data <- list(
- x_train = data.table::as.data.table(x_train),
- x_explain = data.table::as.data.table(x_explain)
- )
+ return(internal)
}
-#' Fetches feature information from a given data set
-#'
-#' @param x matrix, data.frame or data.table The data to extract feature information from.
-#'
-#' @details This function is used to extract the feature information to be checked against the corresponding
-#' information extracted from the model and other data sets. The function is called from internally
-#'
-#' @return A list with the following elements:
-#' \describe{
-#' \item{labels}{character vector with the feature names to compute Shapley values for}
-#' \item{classes}{a named character vector with the labels as names and the class types as elements}
-#' \item{factor_levels}{a named list with the labels as names and character vectors with the factor levels as elements
-#' (NULL if the feature is not a factor)}
-#' }
-#' @author Martin Jullum
-#'
-#' @keywords internal
-#' @export
-#'
-#' @examples
-#' # Load example data
-#' data("airquality")
-#' airquality <- airquality[complete.cases(airquality), ]
-#' # Split data into test- and training data
-#' x_train <- head(airquality, -3)
-#' x_explain <- tail(airquality, 3)
-#' # Split data into test- and training data
-#' x_train <- data.table::as.data.table(head(airquality))
-#' x_train[, Temp := as.factor(Temp)]
-#' get_data_specs(x_train)
-get_data_specs <- function(x) {
- feature_specs <- list()
- feature_specs$labels <- names(x)
- feature_specs$classes <- unlist(lapply(x, class))
- feature_specs$factor_levels <- lapply(x, levels)
- # Defining all integer values as numeric
- feature_specs$classes[feature_specs$classes == "integer"] <- "numeric"
- return(feature_specs)
+
+
+
+
+compare_vecs <- function(vec1, vec2, vec_type, name1, name2) {
+ if (!identical(vec1, vec2)) {
+ if (is.null(names(vec1))) {
+ text_vec1 <- paste(vec1, collapse = ", ")
+ } else {
+ text_vec1 <- paste(names(vec1), vec1, sep = ": ", collapse = ", ")
+ }
+ if (is.null(names(vec2))) {
+ text_vec2 <- paste(vec2, collapse = ", ")
+ } else {
+ text_vec2 <- paste(names(vec2), vec2, sep = ": ", collapse = ", ")
+ }
+
+ stop(paste0(
+ "Feature ", vec_type, " are not identical for ", name1, " and ", name2, ".\n",
+ name1, " provided: ", text_vec1, ",\n",
+ name2, " provided: ", text_vec2, ".\n"
+ ))
+ }
}
+
+
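+# Illustrative sketch (not run): compare_vecs() compares feature metadata
+# (names, classes, factor levels) from two sources and stops with a readable
+# message when they differ:
+#
+# compare_vecs(c(a = "numeric"), c(a = "factor"), "classes", "model", "data")
+# #> Error: Feature classes are not identical for model and data. ...
+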
#' Check that the group parameter has the right form and content
#'
#'
@@ -668,81 +980,341 @@ check_groups <- function(feature_names, group) {
}
}
+
+
+
#' @keywords internal
-check_approach <- function(internal) {
- # Check length of approach
+set_adaptive_parameters <- function(internal, prev_iter_list = NULL) {
+ adaptive <- internal$parameters$adaptive
- approach <- internal$parameters$approach
- n_features <- internal$parameters$n_features
- supported_approaches <- get_supported_approaches()
+ adaptive_arguments <- internal$parameters$adaptive_arguments
- if (!(is.character(approach) &&
- (length(approach) == 1 || length(approach) == n_features - 1) &&
- all(is.element(approach, supported_approaches)))
- ) {
+ adaptive_arguments <- utils::modifyList(get_adaptive_arguments_default(internal),
+ adaptive_arguments,
+ keep.null = TRUE
+ )
+
+ # Force setting the number of coalitions and iterations for non-adaptive method
+ if (isFALSE(adaptive)) {
+ adaptive_arguments$max_iter <- 1
+ adaptive_arguments$initial_n_coalitions <- adaptive_arguments$max_n_coalitions
+ }
+
+ check_adaptive_arguments(adaptive_arguments)
+
+ # Translate any null input
+ adaptive_arguments <- trans_null_adaptive_arguments(adaptive_arguments)
+
+ internal$parameters$adaptive_arguments <- adaptive_arguments
+
+ if (!is.null(prev_iter_list)) {
+ # Update internal with the iter_list from prev_shapr_object
+ internal$iter_list <- prev_iter_list
+
+ # Conveniently allow running non-adaptive estimation one step further
+ if (isFALSE(internal$parameters$adaptive)) {
+ internal$parameters$adaptive_arguments$max_iter <- length(internal$iter_list) + 1
+ internal$parameters$adaptive_arguments$reduction_factor_vec <- NULL
+ }
+
+ # Update convergence data with NEW adaptive arguments
+ internal <- check_convergence(internal)
+
+ # Check for convergence based on last iter_list with new adaptive arguments
+ check_vs_prev_shapr_object(internal)
+
+ # Prepare next iteration
+ internal <- prepare_next_iteration(internal)
+ } else {
+ internal$iter_list <- list()
+ internal$iter_list[[1]] <- list(
+ n_coalitions = adaptive_arguments$initial_n_coalitions,
+ new_n_coalitions = adaptive_arguments$initial_n_coalitions,
+ exact = internal$parameters$exact,
+ compute_sd = adaptive_arguments$compute_sd,
+ reduction_factor = adaptive_arguments$reduction_factor_vec[1],
+ n_batches = set_n_batches(adaptive_arguments$initial_n_coalitions, internal)
+ )
+ }
+
+ return(internal)
+}
+
+check_adaptive_arguments <- function(adaptive_arguments) {
+ list2env(adaptive_arguments, envir = environment())
+
+
+ # initial_n_coalitions
+ if (!(is.wholenumber(initial_n_coalitions) &&
+ length(initial_n_coalitions) == 1 &&
+ !is.na(initial_n_coalitions) &&
+ initial_n_coalitions <= max_n_coalitions &&
+ initial_n_coalitions > 2)) {
+ stop("`adaptive_arguments$initial_n_coalitions` must be a single integer between 2 and `max_n_coalitions`.")
+ }
+
+ # fixed_n_coalitions
+ if (!is.null(fixed_n_coalitions_per_iter) &&
+ !(is.wholenumber(fixed_n_coalitions_per_iter) &&
+ length(fixed_n_coalitions_per_iter) == 1 &&
+ !is.na(fixed_n_coalitions_per_iter) &&
+ fixed_n_coalitions_per_iter <= max_n_coalitions &&
+ fixed_n_coalitions_per_iter > 0)) {
+ stop(
+ "`adaptive_arguments$fixed_n_coalitions_per_iter` must be NULL or a single positive integer no larger than",
+ "`max_n_coalitions`."
+ )
+ }
+
+ # max_iter
+ if (!is.null(max_iter) &&
+ !((is.wholenumber(max_iter) || is.infinite(max_iter)) &&
+ length(max_iter) == 1 &&
+ !is.na(max_iter) &&
+ max_iter > 0)) {
+ stop("`adaptive_arguments$max_iter` must be NULL, Inf or a single positive integer.")
+ }
+
+ # convergence_tolerance
+ if (!is.null(convergence_tolerance) &&
+ !(length(convergence_tolerance) == 1 &&
+ !is.na(convergence_tolerance) &&
+ convergence_tolerance >= 0)) {
+ stop("`adaptive_arguments$convergence_tolerance` must be NULL, 0, or a positive numeric.")
+ }
+
+ # reduction_factor_vec
+ if (!is.null(reduction_factor_vec) &&
+ !(all(!is.na(reduction_factor_vec)) &&
+ all(reduction_factor_vec <= 1) &&
+ all(reduction_factor_vec >= 0))) {
+ stop("`adaptive_arguments$reduction_factor_vec` must be NULL or a vector or numerics between 0 and 1.")
+ }
+
+ # n_boot_samps
+ if (!(is.wholenumber(n_boot_samps) &&
+ length(n_boot_samps) == 1 &&
+ !is.na(n_boot_samps) &&
+ n_boot_samps > 0)) {
+ stop("`adaptive_arguments$n_boot_samps` must be a single positive integer.")
+ }
+
+ # compute_sd
+ if (!(is.logical(compute_sd) &&
+ length(compute_sd) == 1)) {
+ stop("`adaptive_arguments$compute_sd` must be a single logical.")
+ }
+
+
+ # min_n_batches
+ if (!is.null(min_n_batches) &&
+ !(is.wholenumber(min_n_batches) &&
+ length(min_n_batches) == 1 &&
+ !is.na(min_n_batches) &&
+ min_n_batches > 0)) {
+ stop("`adaptive_arguments$min_n_batches` must be NULL or a single positive integer.")
+ }
+
+ # max_batch_size
+ if (!is.null(max_batch_size) &&
+ !((is.wholenumber(max_batch_size) || is.infinite(max_batch_size)) &&
+ length(max_batch_size) == 1 &&
+ !is.na(max_batch_size) &&
+ max_batch_size > 0)) {
+ stop("`adaptive_arguments$max_batch_size` must be NULL, Inf or a single positive integer.")
+ }
+
+ # saving_path
+ if (!(is.character(saving_path) &&
+ length(saving_path) == 1)) {
+ stop("`adaptive_arguments$saving_path` must be a single character.")
+ }
+
+ # Check that the saving_path exists, and abort if not...
+ if (!dir.exists(dirname(saving_path))) {
stop(
paste0(
- "`approach` must be one of the following: '", paste0(supported_approaches, collapse = "', '"), "'.\n",
- "These can also be combined (except 'regression_surrogate' and 'regression_separate') by passing a vector ",
- "of length one less than the number of features (", n_features - 1, ")."
+ "Directory ", dirname(saving_path), " in the adaptive_arguments$saving_path does not exists.\n",
+ "Please create the directory with `dir.create('", dirname(saving_path), "')` or use another directory."
)
)
}
+}
- if (length(approach) > 1 && any(grepl("regression", approach))) {
- stop("The `regression_separate` and `regression_surrogate` approaches cannot be combined with other approaches.")
- }
+trans_null_adaptive_arguments <- function(adaptive_arguments) {
+ list2env(adaptive_arguments, envir = environment())
+
+ # Translating NULL to always return n_batches = 1 (if just one approach)
+ adaptive_arguments$min_n_batches <- ifelse(is.null(min_n_batches), 1, min_n_batches)
+ adaptive_arguments$max_batch_size <- ifelse(is.null(max_batch_size), Inf, max_batch_size)
+ adaptive_arguments$max_iter <- ifelse(is.null(max_iter), Inf, max_iter)
+
+ return(adaptive_arguments)
}
-#' @keywords internal
-set_defaults <- function(internal) {
- # Set defaults for certain arguments (based on other input)
- approach <- internal$parameters$approach
+set_n_batches <- function(n_coalitions, internal) {
+ min_n_batches <- internal$parameters$adaptive_arguments$min_n_batches
+ max_batch_size <- internal$parameters$adaptive_arguments$max_batch_size
n_unique_approaches <- internal$parameters$n_unique_approaches
- used_n_combinations <- internal$parameters$used_n_combinations
- n_batches <- internal$parameters$n_batches
- # n_batches
- if (is.null(n_batches)) {
- internal$parameters$n_batches <- get_default_n_batches(approach, n_unique_approaches, used_n_combinations)
- }
- return(internal)
+ # Restrict the sizes of the batches to max_batch_size, but require at least min_n_batches and n_unique_approaches
+ suggested_n_batches <- max(min_n_batches, n_unique_approaches, ceiling(n_coalitions / max_batch_size))
+
+ # Cap n_batches at n_coalitions, as there cannot be more batches than coalitions
+ n_batches <- min(n_coalitions, suggested_n_batches)
+
+ return(n_batches)
}
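+
+# Illustrative sketch (not run; stub `internal`): 40 coalitions with
+# max_batch_size = 10, min_n_batches = 2 and a single approach gives
+# max(2, 1, ceiling(40 / 10)) = 4 batches.
+#
+# internal <- list(parameters = list(
+#   adaptive_arguments = list(min_n_batches = 2, max_batch_size = 10),
+#   n_unique_approaches = 1
+# ))
+# set_n_batches(40, internal) # 4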
-#' @keywords internal
-get_default_n_batches <- function(approach, n_unique_approaches, n_combinations) {
- used_approach <- names(sort(table(approach), decreasing = TRUE))[1] # Most frequent used approach (when more present)
+check_vs_prev_shapr_object <- function(internal) {
+ iter <- length(internal$iter_list)
+
+ converged <- internal$iter_list[[iter]]$converged
+ converged_exact <- internal$iter_list[[iter]]$converged_exact
+ converged_sd <- internal$iter_list[[iter]]$converged_sd
+ converged_max_iter <- internal$iter_list[[iter]]$converged_max_iter
+ converged_max_n_coalitions <- internal$iter_list[[iter]]$converged_max_n_coalitions
+
+ if (isTRUE(converged)) {
+ message0 <- "Convergence reached before estimation start.\n"
+ if (isTRUE(converged_exact)) {
+ message0 <- c(
+ message0,
+ "All coalitions estimated. No need for further estimation.\n"
+ )
+ }
+ if (isTRUE(converged_sd)) {
+ message0 <- c(
+ message0,
+ "Convergence tolerance reached. Consider decreasing `adaptive_arguments$tolerance`.\n"
+ )
+ }
+ if (isTRUE(converged_max_iter)) {
+ message0 <- c(
+ message0,
+ "Maximum number of iterations reached. Consider increasing `adaptive_arguments$max_iter`.\n"
+ )
+ }
+ if (isTRUE(converged_max_n_coalitions)) {
+ message0 <- c(
+ message0,
+ "Maximum number of coalitions reached. Consider increasing `max_n_coalitions`.\n"
+ )
+ }
+ stop(message0)
+ }
+}
+
+# Get functions ========================================================================================================
+#' Function to specify arguments of the adaptive estimation procedure
+#'
+#' @details The function sets default values for the adaptive estimation procedure, according to the function defaults.
+#' If the argument `adaptive` of [shapr::explain()] is `FALSE`, it sets parameters corresponding to the use of a
+#' non-adaptive estimation procedure.
+#'
+#' @param max_iter Integer. Maximum number of estimation iterations.
+#' @param initial_n_coalitions Integer. Number of coalitions to use in the first estimation iteration.
+#' @param fixed_n_coalitions_per_iter Integer. Number of `n_coalitions` to use in each iteration.
+#' `NULL` (default) means it is set adaptively in each iteration, based on the set convergence threshold.
+#' @param convergence_tolerance Numeric. The t variable in the convergence threshold formula on page 6 in the paper
+#' Covert and Lee (2021), 'Improving KernelSHAP: Practical Shapley Value Estimation via Linear Regression'
+#' https://arxiv.org/pdf/2012.01536. Smaller values require more coalitions before convergence is reached.
+#' @param reduction_factor_vec Numeric vector. Before each iteration, the number of `n_coalitions` needed to reach
+#' convergence is estimated.
+#' The number of `n_coalitions` actually used in iteration `i` is set to this estimate multiplied by
+#' `reduction_factor_vec[i]`.
+#' It is wise to start with smaller factors to avoid using too many `n_coalitions` due to uncertain estimates in
+#' the first iterations.
+#' @param n_boot_samps Integer. The number of bootstrapped samples (i.e. samples with replacement) from the set of all
+#' coalitions used to estimate the standard deviations of the Shapley value estimates.
+#' @param compute_sd Logical. Whether to estimate the standard deviations of the Shapley value estimates.
+#' @param max_batch_size Integer. The maximum number of coalitions to estimate simultaneously within each iteration.
+#' A larger number requires more memory, but may have a slight computational advantage.
+#' @param min_n_batches Integer. The minimum number of batches to split the computation into within each iteration.
+#' Larger numbers give more frequent progress updates. If parallelization is applied, this should be set no smaller
+#' than the number of parallel workers.
+#' @param saving_path String.
+#' The path to the directory where the results of the adaptive estimation procedure should be saved.
+#' Defaults to a temporary directory.
+#' @inheritParams default_doc_explain
+#'
+#' @export
+#' @author Martin Jullum
+get_adaptive_arguments_default <- function(internal,
+ initial_n_coalitions = ceiling(
+ min(
+ 200,
+ max(
+ 5,
+ internal$parameters$n_features,
+ (2^internal$parameters$n_features) / 10
+ )
+ )
+ ),
+ fixed_n_coalitions_per_iter = NULL,
+ max_iter = 20,
+ convergence_tolerance = 0.02,
+ reduction_factor_vec = c(seq(0.1, 1, by = 0.1), rep(1, max_iter - 10)),
+ n_boot_samps = 100,
+ compute_sd = isTRUE(internal$parameters$adaptive),
+ max_batch_size = 10,
+ min_n_batches = 10,
+ saving_path = tempfile("shapr_obj_", fileext = ".rds")) {
+ adaptive <- internal$parameters$adaptive
+ max_n_coalitions <- internal$parameters$max_n_coalitions
+ exact <- internal$parameters$exact
+ is_groupwise <- internal$parameters$is_groupwise
- if (used_approach %in% c("ctree", "gaussian", "copula")) {
- suggestion <- ceiling(n_combinations / 10)
- this_min <- 10
- this_max <- 1000
+ if (isTRUE(adaptive)) {
+ ret_list <- mget(
+ c(
+ "initial_n_coalitions",
+ "fixed_n_coalitions_per_iter",
+ "max_n_coalitions",
+ "max_iter",
+ "convergence_tolerance",
+ "reduction_factor_vec",
+ "n_boot_samps",
+ "compute_sd",
+ "max_batch_size",
+ "min_n_batches",
+ "saving_path"
+ )
+ )
} else {
- suggestion <- ceiling(n_combinations / 100)
- this_min <- 2
- this_max <- 100
- }
- min_checked <- max(c(this_min, suggestion, n_unique_approaches))
- ret <- min(c(this_max, min_checked, n_combinations - 1))
- message(
- paste0(
- "Setting parameter 'n_batches' to ", ret, " as a fair trade-off between memory consumption and ",
- "computation time.\n",
- "Reducing 'n_batches' typically reduces the computation time at the cost of increased memory consumption.\n"
+ ret_list <- list(
+ initial_n_coalitions = max_n_coalitions,
+ fixed_n_coalitions_per_iter = NULL,
+ max_n_coalitions = max_n_coalitions,
+ max_iter = 1,
+ convergence_tolerance = NULL,
+ reduction_factor_vec = NULL,
+ n_boot_samps = n_boot_samps,
+ compute_sd = isFALSE(exact) && isFALSE(is_groupwise),
+ max_batch_size = max_batch_size,
+ min_n_batches = min_n_batches,
+ saving_path = saving_path
)
- )
- return(ret)
+ }
+ return(ret_list)
}
-
-#' Gets the implemented approaches
+#' Additional setup for regression-based methods
#'
-#' @return Character vector.
-#' The names of the implemented approaches that can be passed to argument `approach` in [explain()].
+#' @inheritParams default_doc_explain
#'
#' @export
-get_supported_approaches <- function() {
- substring(rownames(attr(methods(prepare_data), "info")), first = 14)
+#' @keywords internal
+additional_regression_setup <- function(internal, model, predict_model) {
+ # This step needs to be called after predict_model is set, and therefore arrives at a later stage in explain()
+
+ # Add the predicted response of the training and explain data to the internal list for regression-based methods.
+ # Use isTRUE as `regression` is not present (NULL) for non-regression methods (i.e., Monte Carlo-based methods).
+ if (isTRUE(internal$parameters$regression)) {
+ internal <- regression.get_y_hat(internal = internal, model = model, predict_model = predict_model)
+ }
+
+ return(internal)
}
diff --git a/R/setup_computation.R b/R/setup_computation.R
deleted file mode 100644
index dad9b6240..000000000
--- a/R/setup_computation.R
+++ /dev/null
@@ -1,689 +0,0 @@
-#' Sets up everything for the Shapley values computation in [shapr::explain()]
-#'
-#' @inheritParams default_doc
-#' @inheritParams explain
-#' @inherit default_doc
-#' @export
-setup_computation <- function(internal, model, predict_model) {
- # model and predict_model are only needed for type AICc of approach empirical, otherwise ignored
- type <- internal$parameters$type
-
- # setup the Shapley framework
- internal <- if (type == "forecast") shapley_setup_forecast(internal) else shapley_setup(internal)
-
- # Setup for approach
- internal <- setup_approach(internal, model = model, predict_model = predict_model)
-
- return(internal)
-}
-
-#' @keywords internal
-shapley_setup_forecast <- function(internal) {
- exact <- internal$parameters$exact
- n_features0 <- internal$parameters$n_features
- n_combinations <- internal$parameters$n_combinations
- is_groupwise <- internal$parameters$is_groupwise
- group_num <- internal$objects$group_num
- horizon <- internal$parameters$horizon
- feature_names <- internal$parameters$feature_names
-
- X_list <- W_list <- list()
-
- # Find columns/features to be included in each of the different horizons
- col_del_list <- list()
- col_del_list[[1]] <- numeric()
- if (horizon > 1) {
- k <- 2
- for (i in rev(seq_len(horizon)[-1])) {
- col_del_list[[k]] <- c(unlist(col_del_list[[k - 1]]), grep(paste0(".F", i), feature_names))
- k <- k + 1
- }
- }
-
- cols_per_horizon <- lapply(rev(col_del_list), function(x) if (length(x) > 0) feature_names[-x] else feature_names)
-
- horizon_features <- lapply(cols_per_horizon, function(x) which(internal$parameters$feature_names %in% x))
-
- # Apply feature_combination, weigth_matrix and feature_matrix_cpp to each of the different horizons
- for (i in seq_along(horizon_features)) {
- this_featcomb <- horizon_features[[i]]
- n_this_featcomb <- length(this_featcomb)
-
- this_group_num <- lapply(group_num, function(x) x[x %in% this_featcomb])
-
- X_list[[i]] <- feature_combinations(
- m = n_this_featcomb,
- exact = exact,
- n_combinations = n_combinations,
- weight_zero_m = 10^6,
- group_num = this_group_num
- )
-
- W_list[[i]] <- weight_matrix(
- X = X_list[[i]],
- normalize_W_weights = TRUE,
- is_groupwise = is_groupwise
- )
- }
-
- # Merge the feature combination data.table to single one to use for computing conditional expectations later on
- X <- rbindlist(X_list, idcol = "horizon")
- X[, N := NA]
- X[, shapley_weight := NA]
- data.table::setorderv(X, c("n_features", "horizon"), order = c(1, -1))
- X[, horizon_id_combination := id_combination]
- X[, id_combination := 0]
- X[!duplicated(features), id_combination := .I]
- X[, tmp_features := as.character(features)]
- X[, id_combination := max(id_combination), by = tmp_features]
- X[, tmp_features := NULL]
-
- # Extracts a data.table allowing mapping from X to X_list/W_list to be used in the compute_shapley function
- id_combination_mapper_dt <- X[, .(horizon, horizon_id_combination, id_combination)]
-
- X[, horizon := NULL]
- X[, horizon_id_combination := NULL]
- data.table::setorder(X, n_features)
- X <- X[!duplicated(id_combination)]
-
- W <- NULL # Included for consistency. Necessary weights are in W_list instead
-
- ## Get feature matrix ---------
- S <- feature_matrix_cpp(
- features = X[["features"]],
- m = n_features0
- )
-
-
- #### Updating parameters ####
-
- # Updating parameters$exact as done in feature_combinations
- if (!exact && n_combinations >= 2^n_features0) {
- internal$parameters$exact <- TRUE # Note that this is exact only if all horizons use the exact method.
- }
-
- internal$parameters$n_combinations <- nrow(S) # Updating this parameter in the end based on what is actually used.
-
- # This will be obsolete later
- internal$parameters$group_num <- NULL # TODO: Checking whether I could just do this processing where needed
- # instead of storing it
-
- internal$objects$X <- X
- internal$objects$W <- W
- internal$objects$S <- S
- internal$objects$S_batch <- create_S_batch_new(internal)
-
- internal$objects$id_combination_mapper_dt <- id_combination_mapper_dt
- internal$objects$cols_per_horizon <- cols_per_horizon
- internal$objects$W_list <- W_list
- internal$objects$X_list <- X_list
-
-
- return(internal)
-}
-
-
-#' @keywords internal
-shapley_setup <- function(internal) {
- exact <- internal$parameters$exact
- n_features0 <- internal$parameters$n_features
- n_combinations <- internal$parameters$n_combinations
- is_groupwise <- internal$parameters$is_groupwise
-
- group_num <- internal$objects$group_num
-
- X <- feature_combinations(
- m = n_features0,
- exact = exact,
- n_combinations = n_combinations,
- weight_zero_m = 10^6,
- group_num = group_num
- )
-
- # Get weighted matrix ----------------
- W <- weight_matrix(
- X = X,
- normalize_W_weights = TRUE,
- is_groupwise = is_groupwise
- )
-
- ## Get feature matrix ---------
- S <- feature_matrix_cpp(
- features = X[["features"]],
- m = n_features0
- )
-
- #### Updating parameters ####
-
- # Updating parameters$exact as done in feature_combinations
- if (!exact && n_combinations >= 2^n_features0) {
- internal$parameters$exact <- TRUE
- }
-
- internal$parameters$n_combinations <- nrow(S) # Updating this parameter in the end based on what is actually used.
-
- # This will be obsolete later
- internal$parameters$group_num <- NULL # TODO: Checking whether I could just do this processing where needed
- # instead of storing it
-
- internal$objects$X <- X
- internal$objects$W <- W
- internal$objects$S <- S
- internal$objects$S_batch <- create_S_batch_new(internal)
-
-
- return(internal)
-}
-
-#' Define feature combinations, and fetch additional information about each unique combination
-#'
-#' @param m Positive integer. Total number of features.
-#' @param exact Logical. If `TRUE` all `2^m` combinations are generated, otherwise a
-#' subsample of the combinations is used.
-#' @param n_combinations Positive integer. Note that if `exact = TRUE`,
-#' `n_combinations` is ignored. However, if `m > 12` you'll need to add a positive integer
-#' value for `n_combinations`.
-#' @param weight_zero_m Numeric. The value to use as a replacement for infinite combination
-#' weights when doing numerical operations.
-#' @param group_num List. Contains vector of integers indicating the feature numbers for the
-#' different groups.
-#'
-#' @return A data.table that contains the following columns:
-#' \describe{
-#' \item{id_combination}{Positive integer. Represents a unique key for each combination. Note that the table
-#' is sorted by `id_combination`, so that is always equal to `x[["id_combination"]] = 1:nrow(x)`.}
-#' \item{features}{List. Each item of the list is an integer vector where `features[[i]]`
-#' represents the indices of the features included in combination `i`. Note that all the items
-#' are sorted such that `features[[i]] == sort(features[[i]])` is always true.}
-#' \item{n_features}{Vector of positive integers. `n_features[i]` equals the number of features in combination
-#' `i`, i.e. `n_features[i] = length(features[[i]])`.}.
-#' \item{N}{Positive integer. The number of unique ways to sample `n_features[i]` features
-#' from `m` different features, without replacement.}
-#' }
-#'
-#' @export
-#'
-#' @author Nikolai Sellereite, Martin Jullum
-#'
-#' @examples
-#' # All combinations
-#' x <- feature_combinations(m = 3)
-#' nrow(x) # Equals 2^3 = 8
-#'
-#' # Subsample of combinations
-#' x <- feature_combinations(exact = FALSE, m = 10, n_combinations = 1e2)
-feature_combinations <- function(m, exact = TRUE, n_combinations = 200, weight_zero_m = 10^6, group_num = NULL) {
- m_group <- length(group_num) # The number of groups
-
- # Force user to use a natural number for n_combinations if m > 13
- if (m > 13 && is.null(n_combinations) && m_group == 0) {
- stop(
- paste0(
- "Due to computational complexity, we recommend setting n_combinations = 10 000\n",
- "if the number of features is larger than 13 for feature-wise Shapley values.\n",
- "Note that you can force the use of the exact method (i.e. n_combinations = NULL)\n",
- "by setting n_combinations equal to 2^m where m is the number of features.\n"
- )
- )
- }
-
- # Not supported for m > 30
- if (m > 30 && m_group == 0) {
- stop(
- paste0(
- "Currently we are not supporting cases where the number of features is greater than 30\n",
- "for feature-wise Shapley values.\n"
- )
- )
- }
- if (m_group > 30) {
- stop(
- paste0(
- "For computational reasons, we are currently not supporting group-wise Shapley values \n",
- "for more than 30 groups. Please reduce the number of groups.\n"
- )
- )
- }
-
- if (!exact) {
- if (m_group == 0) {
- # Switch to exact for feature-wise method
- if (n_combinations >= 2^m) {
- n_combinations <- 2^m
- exact <- TRUE
- message(
- paste0(
- "Success with message:\n",
- "n_combinations is larger than or equal to 2^m = ", 2^m, ". \n",
- "Using exact instead.\n"
- )
- )
- }
- } else {
- # Switch to exact for feature-wise method
- if (n_combinations >= (2^m_group)) {
- n_combinations <- 2^m_group
- exact <- TRUE
- message(
- paste0(
- "Success with message:\n",
- "n_combinations is larger than or equal to 2^group_num = ", 2^m_group, ". \n",
- "Using exact instead.\n"
- )
- )
- }
- }
- }
-
- if (m_group == 0) {
- # Here if feature-wise Shapley values
- if (exact) {
- dt <- feature_exact(m, weight_zero_m)
- } else {
- dt <- feature_not_exact(m, n_combinations, weight_zero_m)
- stopifnot(
- data.table::is.data.table(dt),
- !is.null(dt[["p"]])
- )
- p <- NULL # due to NSE notes in R CMD check
- dt[, p := NULL]
- }
- } else {
- # Here if group-wise Shapley values
- if (exact) {
- dt <- feature_group(group_num, weight_zero_m)
- } else {
- dt <- feature_group_not_exact(group_num, n_combinations, weight_zero_m)
- stopifnot(
- data.table::is.data.table(dt),
- !is.null(dt[["p"]])
- )
- p <- NULL # due to NSE notes in R CMD check
- dt[, p := NULL]
- }
- }
- return(dt)
-}
-
-#' @keywords internal
-feature_exact <- function(m, weight_zero_m = 10^6) {
- dt <- data.table::data.table(id_combination = seq(2^m))
- combinations <- lapply(0:m, utils::combn, x = m, simplify = FALSE)
- dt[, features := unlist(combinations, recursive = FALSE)]
- dt[, n_features := length(features[[1]]), id_combination]
- dt[, N := .N, n_features]
- dt[, shapley_weight := shapley_weights(m = m, N = N, n_components = n_features, weight_zero_m)]
-
- return(dt)
-}
-
-#' @keywords internal
-feature_not_exact <- function(m, n_combinations = 200, weight_zero_m = 10^6, unique_sampling = TRUE) {
- # Find weights for given number of features ----------
- n_features <- seq(m - 1)
- n <- sapply(n_features, choose, n = m)
- w <- shapley_weights(m = m, N = n, n_features) * n
- p <- w / sum(w)
-
- feature_sample_all <- list()
- unique_samples <- 0
-
-
- if (unique_sampling) {
- while (unique_samples < n_combinations - 2) {
- # Sample number of chosen features ----------
- n_features_sample <- sample(
- x = n_features,
- size = n_combinations - unique_samples - 2, # Sample -2 as we add zero and m samples below
- replace = TRUE,
- prob = p
- )
-
- # Sample specific set of features -------
- feature_sample <- sample_features_cpp(m, n_features_sample)
- feature_sample_all <- c(feature_sample_all, feature_sample)
- unique_samples <- length(unique(feature_sample_all))
- }
- } else {
- n_features_sample <- sample(
- x = n_features,
- size = n_combinations - 2, # Sample -2 as we add zero and m samples below
- replace = TRUE,
- prob = p
- )
- feature_sample_all <- sample_features_cpp(m, n_features_sample)
- }
-
- # Add zero and m features
- feature_sample_all <- c(list(integer(0)), feature_sample_all, list(c(1:m)))
- X <- data.table(n_features = sapply(feature_sample_all, length))
- X[, n_features := as.integer(n_features)]
-
- # Get number of occurences and duplicated rows-------
- is_duplicate <- NULL # due to NSE notes in R CMD check
- r <- helper_feature(m, feature_sample_all)
- X[, is_duplicate := r[["is_duplicate"]]]
-
- # When we sample combinations the Shapley weight is equal
- # to the frequency of the given combination
- X[, shapley_weight := r[["sample_frequence"]]]
-
- # Populate table and remove duplicated rows -------
- X[, features := feature_sample_all]
- if (any(X[["is_duplicate"]])) {
- X <- X[is_duplicate == FALSE]
- }
- X[, is_duplicate := NULL]
- data.table::setkeyv(X, "n_features")
-
- # Make feature list into character
- X[, features_tmp := sapply(features, paste, collapse = " ")]
-
- # Aggregate weights by how many samples of a combination we observe
- X <- X[, .(
- n_features = data.table::first(n_features),
- shapley_weight = sum(shapley_weight),
- features = features[1]
- ), features_tmp]
-
- X[, features_tmp := NULL]
- data.table::setorder(X, n_features)
-
- # Add shapley weight and number of combinations
- X[c(1, .N), shapley_weight := weight_zero_m]
- X[, N := 1]
- ind <- X[, .I[data.table::between(n_features, 1, m - 1)]]
- X[ind, p := p[n_features]]
- X[ind, N := n[n_features]]
-
- # Set column order and key table
- data.table::setkeyv(X, "n_features")
- X[, id_combination := .I]
- X[, N := as.integer(N)]
- nms <- c("id_combination", "features", "n_features", "N", "shapley_weight", "p")
- data.table::setcolorder(X, nms)
-
- return(X)
-}
-
-#' Calculate Shapley weight
-#'
-#' @param m Positive integer. Total number of features/feature groups.
-#' @param n_components Positive integer. Represents the number of features/feature groups you want to sample from
-#' a feature space consisting of `m` unique features/feature groups. Note that ` 0 < = n_components <= m`.
-#' @param N Positive integer. The number of unique combinations when sampling `n_components` features/feature
-#' groups, without replacement, from a sample space consisting of `m` different features/feature groups.
-#' @param weight_zero_m Positive integer. Represents the Shapley weight for two special
-#' cases, i.e. the case where you have either `0` or `m` features/feature groups.
-#'
-#' @return Numeric
-#' @keywords internal
-#'
-#' @author Nikolai Sellereite
-shapley_weights <- function(m, N, n_components, weight_zero_m = 10^6) {
- x <- (m - 1) / (N * n_components * (m - n_components))
- x[!is.finite(x)] <- weight_zero_m
- x
-}
-
-
-#' @keywords internal
-helper_feature <- function(m, feature_sample) {
- x <- feature_matrix_cpp(feature_sample, m)
- dt <- data.table::data.table(x)
- cnms <- paste0("V", seq(m))
- data.table::setnames(dt, cnms)
- dt[, sample_frequence := as.integer(.N), by = cnms]
- dt[, is_duplicate := duplicated(dt)]
- dt[, (cnms) := NULL]
-
- return(dt)
-}
-
-
-#' Analogue to feature_exact, but for groups instead.
-#'
-#' @inheritParams shapley_weights
-#' @param group_num List. Contains vector of integers indicating the feature numbers for the
-#' different groups.
-#'
-#' @return data.table with all feature group combinations, shapley weights etc.
-#'
-#' @keywords internal
-feature_group <- function(group_num, weight_zero_m = 10^6) {
- m <- length(group_num)
- dt <- data.table::data.table(id_combination = seq(2^m))
- combinations <- lapply(0:m, utils::combn, x = m, simplify = FALSE)
-
- dt[, groups := unlist(combinations, recursive = FALSE)]
- dt[, features := lapply(groups, FUN = group_fun, group_num = group_num)]
- dt[, n_groups := length(groups[[1]]), id_combination]
- dt[, n_features := length(features[[1]]), id_combination]
- dt[, N := .N, n_groups]
- dt[, shapley_weight := shapley_weights(m = m, N = N, n_components = n_groups, weight_zero_m)]
-
- return(dt)
-}
-
-#' @keywords internal
-group_fun <- function(x, group_num) {
- if (length(x) != 0) {
- unlist(group_num[x])
- } else {
- integer(0)
- }
-}
-
-
-#' Analogue to feature_not_exact, but for groups instead.
-#'
-#' Analogue to feature_not_exact, but for groups instead.
-#'
-#' @inheritParams shapley_weights
-#' @inheritParams feature_group
-#'
-#' @return data.table with all feature group combinations, shapley weights etc.
-#'
-#' @keywords internal
-feature_group_not_exact <- function(group_num, n_combinations = 200, weight_zero_m = 10^6) {
- # Find weights for given number of features ----------
- m <- length(group_num)
- n_groups <- seq(m - 1)
- n <- sapply(n_groups, choose, n = m)
- w <- shapley_weights(m = m, N = n, n_groups) * n
- p <- w / sum(w)
-
- # Sample number of chosen features ----------
- feature_sample_all <- list()
- unique_samples <- 0
-
- while (unique_samples < n_combinations - 2) {
- # Sample number of chosen features ----------
- n_features_sample <- sample(
- x = n_groups,
- size = n_combinations - unique_samples - 2, # Sample -2 as we add zero and m samples below
- replace = TRUE,
- prob = p
- )
-
- # Sample specific set of features -------
- feature_sample <- sample_features_cpp(m, n_features_sample)
- feature_sample_all <- c(feature_sample_all, feature_sample)
- unique_samples <- length(unique(feature_sample_all))
- }
-
- # Add zero and m features
- feature_sample_all <- c(list(integer(0)), feature_sample_all, list(c(1:m)))
- X <- data.table(n_groups = sapply(feature_sample_all, length))
- X[, n_groups := as.integer(n_groups)]
-
-
- # Get number of occurences and duplicated rows-------
- is_duplicate <- NULL # due to NSE notes in R CMD check
- r <- helper_feature(m, feature_sample_all)
- X[, is_duplicate := r[["is_duplicate"]]]
-
- # When we sample combinations the Shapley weight is equal
- # to the frequency of the given combination
- X[, shapley_weight := r[["sample_frequence"]]]
-
- # Populate table and remove duplicated rows -------
- X[, groups := feature_sample_all]
- if (any(X[["is_duplicate"]])) {
- X <- X[is_duplicate == FALSE]
- }
- X[, is_duplicate := NULL]
-
- # Make group list into character
- X[, groups_tmp := sapply(groups, paste, collapse = " ")]
-
- # Aggregate weights by how many samples of a combination we have
- X <- X[, .(
- n_groups = data.table::first(n_groups),
- shapley_weight = sum(shapley_weight),
- groups = groups[1]
- ), groups_tmp]
-
- X[, groups_tmp := NULL]
- data.table::setorder(X, n_groups)
-
-
- # Add shapley weight and number of combinations
- X[c(1, .N), shapley_weight := weight_zero_m]
- X[, N := 1]
- ind <- X[, .I[data.table::between(n_groups, 1, m - 1)]]
- X[ind, p := p[n_groups]]
- X[ind, N := n[n_groups]]
-
- # Adding feature info
- X[, features := lapply(groups, FUN = group_fun, group_num = group_num)]
- X[, n_features := sapply(X$features, length)]
-
- # Set column order and key table
- data.table::setkeyv(X, "n_groups")
- X[, id_combination := .I]
- X[, N := as.integer(N)]
- nms <- c("id_combination", "groups", "features", "n_groups", "n_features", "N", "shapley_weight", "p")
- data.table::setcolorder(X, nms)
-
- return(X)
-}
-
-#' Calculate weighted matrix
-#'
-#' @param X data.table
-#' @param normalize_W_weights Logical. Whether to normalize the weights for the combinations to sum to 1 for
-#' increased numerical stability before solving the WLS (weighted least squares). Applies to all combinations
-#' except combination `1` and `2^m`.
-#' @param is_groupwise Logical. Indicating whether group wise Shapley values are to be computed.
-#'
-#' @return Numeric matrix. See [weight_matrix_cpp()] for more information.
-#' @keywords internal
-#'
-#' @author Nikolai Sellereite, Martin Jullum
-weight_matrix <- function(X, normalize_W_weights = TRUE, is_groupwise = FALSE) {
- # Fetch weights
- w <- X[["shapley_weight"]]
-
- if (normalize_W_weights) {
- w[-c(1, length(w))] <- w[-c(1, length(w))] / sum(w[-c(1, length(w))])
- }
-
- if (!is_groupwise) {
- W <- weight_matrix_cpp(
- subsets = X[["features"]],
- m = X[.N][["n_features"]],
- n = X[, .N],
- w = w
- )
- } else {
- W <- weight_matrix_cpp(
- subsets = X[["groups"]],
- m = X[.N][["n_groups"]],
- n = X[, .N],
- w = w
- )
- }
-
- return(W)
-}
-
-#' @keywords internal
-create_S_batch_new <- function(internal, seed = NULL) {
- n_features0 <- internal$parameters$n_features
- approach0 <- internal$parameters$approach
- n_combinations <- internal$parameters$n_combinations
- n_batches <- internal$parameters$n_batches
-
- X <- internal$objects$X
-
- if (!is.null(seed)) set.seed(seed)
-
- if (length(approach0) > 1) {
- X[!(n_features %in% c(0, n_features0)), approach := approach0[n_features]]
-
- # Finding the number of batches per approach
- batch_count_dt <- X[!is.na(approach), list(
- n_batches_per_approach =
- pmax(1, round(.N / (n_combinations - 2) * n_batches)),
- n_S_per_approach = .N
- ), by = approach]
-
- # Ensures that the number of batches corresponds to `n_batches`
- if (sum(batch_count_dt$n_batches_per_approach) != n_batches) {
- # Ensure that the number of batches is not larger than `n_batches`.
- # Remove one batch from the approach with the most batches.
- while (sum(batch_count_dt$n_batches_per_approach) > n_batches) {
- batch_count_dt[
- which.max(n_batches_per_approach),
- n_batches_per_approach := n_batches_per_approach - 1
- ]
- }
-
- # Ensure that the number of batches is not lower than `n_batches`.
- # Add one batch to the approach with most coalitions per batch
- while (sum(batch_count_dt$n_batches_per_approach) < n_batches) {
- batch_count_dt[
- which.max(n_S_per_approach / n_batches_per_approach),
- n_batches_per_approach := n_batches_per_approach + 1
- ]
- }
- }
-
- batch_count_dt[, n_leftover_first_batch := n_S_per_approach %% n_batches_per_approach]
- data.table::setorder(batch_count_dt, -n_leftover_first_batch)
-
- approach_vec <- batch_count_dt[, approach]
- n_batch_vec <- batch_count_dt[, n_batches_per_approach]
-
- # Randomize order before ordering spreading the batches on the different approaches as evenly as possible
- # with respect to shapley_weight
- X[, randomorder := sample(.N)]
- data.table::setorder(X, randomorder) # To avoid smaller id_combinations always proceeding large ones
- data.table::setorder(X, shapley_weight)
-
- batch_counter <- 0
- for (i in seq_along(approach_vec)) {
- X[approach == approach_vec[i], batch := ceiling(.I / .N * n_batch_vec[i]) + batch_counter]
- batch_counter <- X[approach == approach_vec[i], max(batch)]
- }
- } else {
- X[!(n_features %in% c(0, n_features0)), approach := approach0]
-
- # Spreading the batches
- X[, randomorder := sample(.N)]
- data.table::setorder(X, randomorder)
- data.table::setorder(X, shapley_weight)
- X[!(n_features %in% c(0, n_features0)), batch := ceiling(.I / .N * n_batches)]
- }
-
- # Assigning batch 1 (which always is the smallest) to the full prediction.
- X[, randomorder := NULL]
- X[id_combination == max(id_combination), batch := 1]
- setkey(X, id_combination)
-
- # Create a list of the batch splits
- S_groups <- split(X[id_combination != 1, id_combination], X[id_combination != 1, batch])
-
- return(S_groups)
-}
diff --git a/R/shapley_setup.R b/R/shapley_setup.R
new file mode 100644
index 000000000..29fdb08a4
--- /dev/null
+++ b/R/shapley_setup.R
@@ -0,0 +1,762 @@
+#' Set up the kernelSHAP framework
+#'
+#' @inheritParams default_doc_explain
+#'
+#' @export
+#' @keywords internal
+shapley_setup <- function(internal) {
+ verbose <- internal$parameters$verbose
+ n_shapley_values <- internal$parameters$n_shapley_values
+ n_features <- internal$parameters$n_features
+ approach <- internal$parameters$approach
+ is_groupwise <- internal$parameters$is_groupwise
+ paired_shap_sampling <- internal$parameters$paired_shap_sampling
+ shapley_reweighting <- internal$parameters$shapley_reweighting
+ coal_feature_list <- internal$objects$coal_feature_list
+
+
+ iter <- length(internal$iter_list)
+
+ n_coalitions <- internal$iter_list[[iter]]$n_coalitions
+ exact <- internal$iter_list[[iter]]$exact
+ prev_coal_samples <- internal$iter_list[[iter]]$prev_coal_samples
+
+ if ("progress" %in% verbose) {
+ cli::cli_progress_step("Sampling coalitions")
+ }
+
+
+ X <- create_coalition_table(
+ m = n_shapley_values,
+ exact = exact,
+ n_coalitions = n_coalitions,
+ weight_zero_m = 10^6,
+ paired_shap_sampling = paired_shap_sampling,
+ prev_coal_samples = prev_coal_samples,
+ coal_feature_list = coal_feature_list,
+ approach0 = approach,
+ shapley_reweighting = shapley_reweighting
+ )
+
+
+
+ coalition_map <- X[, .(id_coalition,
+ coalitions_str = sapply(coalitions, paste, collapse = " ")
+ )]
+
+
+ # Get weighted matrix ----------------
+ W <- weight_matrix(
+ X = X,
+ normalize_W_weights = TRUE
+ )
+
+
+ ## Get feature matrix ---------
+ S <- coalition_matrix_cpp(
+ coalitions = X[["features"]],
+ m = n_features
+ )
+
+ #### Updating parameters ####
+
+ # Updating parameters$exact as done in create_coalition_table. I don't think this is necessary now. TODO: Check.
+ # Moreover, it does not apply to grouping, so must be adjusted anyway.
+ if (!exact && n_coalitions >= 2^n_shapley_values) {
+ internal$iter_list[[iter]]$exact <- TRUE
+ internal$parameters$exact <- TRUE # Since this means that all coalitions have been sampled
+ }
+
+ # Updating n_coalitions in the end based on what is actually used. I don't think this is necessary now. TODO: Check.
+ internal$iter_list[[iter]]$n_coalitions <- nrow(S)
+
+ # This will be obsolete later
+ internal$parameters$group_num <- NULL # TODO: Checking whether I could just do this processing where needed
+ # instead of storing it
+
+
+ if (isFALSE(exact)) {
+ # Storing the feature samples
+ repetitions <- X[-c(1, .N), sample_freq]
+
+ unique_coal_samples <- X[-c(1, .N), coalitions]
+
+ coal_samples <- unlist(
+ lapply(
+ seq_along(unique_coal_samples),
+ function(i) {
+ rep(
+ list(unique_coal_samples[[i]]),
+ repetitions[i]
+ )
+ }
+ ),
+ recursive = FALSE
+ )
+ } else {
+ coal_samples <- NA
+ }
+
+ internal$iter_list[[iter]]$X <- X
+ internal$iter_list[[iter]]$W <- W
+ internal$iter_list[[iter]]$S <- S
+ internal$iter_list[[iter]]$coalition_map <- coalition_map
+ internal$iter_list[[iter]]$S_batch <- create_S_batch(internal)
+ internal$iter_list[[iter]]$coal_samples <- coal_samples
+
+ return(internal)
+}
+
+#' Define coalitions, and fetch additional information about each unique coalition
+#'
+#' @param m Positive integer.
+#' Total number of features/groups.
+#' @param exact Logical.
+#' If `TRUE` all `2^m` coalitions are generated, otherwise a subsample of the coalitions is used.
+#' @param n_coalitions Positive integer.
+#' Note that if `exact = TRUE`, `n_coalitions` is ignored.
+#' @param weight_zero_m Numeric.
+#' The value to use as a replacement for infinite coalition weights when doing numerical operations.
+#' @param paired_shap_sampling Logical.
+#' Whether to do paired sampling of coalitions.
+#' @param prev_coal_samples List.
+#' A list of previously sampled coalitions.
+#' @param approach0 Character vector.
+#' Contains the approach to be used for estimation of each coalition size. Same as `approach` in `explain()`.
+#' @param coal_feature_list List.
+#' A list mapping each coalition to the features it contains.
+#' @inheritParams explain
+#' @return A data.table containing the coalitions to use, along with their Shapley weights and additional
+#' information about each unique coalition.
+#'
+#' @export
+#'
+#' @author Nikolai Sellereite, Martin Jullum
+#'
+#' @examples
+#' # All coalitions
+#' x <- create_coalition_table(m = 3)
+#' nrow(x) # Equals 2^3 = 8
+#'
+#' # Subsample of coalitions
+#' x <- create_coalition_table(exact = FALSE, m = 10, n_coalitions = 1e2)
+create_coalition_table <- function(m, exact = TRUE, n_coalitions = 200, weight_zero_m = 10^6,
+ paired_shap_sampling = TRUE, prev_coal_samples = NULL,
+ coal_feature_list = as.list(seq_len(m)),
+ approach0 = "gaussian",
+ shapley_reweighting = "none") {
+ if (exact) {
+ dt <- exact_coalition_table(m, weight_zero_m)
+ } else {
+ dt <- sample_coalition_table(m,
+ n_coalitions,
+ weight_zero_m,
+ paired_shap_sampling = paired_shap_sampling,
+ prev_coal_samples = prev_coal_samples,
+ shapley_reweighting = shapley_reweighting
+ )
+ stopifnot(
+ data.table::is.data.table(dt),
+ !is.null(dt[["p"]])
+ )
+ p <- NULL # due to NSE notes in R CMD check
+ dt[, p := NULL]
+ }
+
+ dt[, features := lapply(coalitions, FUN = coal_feature_mapper, coal_feature_list = coal_feature_list)]
+
+ # Adding approach to X (needed for the combined approaches)
+ if (length(approach0) > 1) {
+ dt[!(coalition_size %in% c(0, m)), approach := approach0[coalition_size]]
+ } else {
+ dt[, approach := approach0]
+ }
+
+
+ return(dt)
+}
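+
+# Illustrative sketch (not run): sampled (non-exact) coalitions with paired
+# sampling, where each sampled coalition is complemented by its complement:
+#
+# x <- create_coalition_table(
+#   m = 10, exact = FALSE, n_coalitions = 100,
+#   paired_shap_sampling = TRUE, shapley_reweighting = "none"
+# )
+# nrow(x) # about 100; paired draws come two at a time and may overshoot by one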
+
+#' @keywords internal
+shapley_reweighting <- function(X, reweight = "on_N") {
+ # Updates the shapley weights in X based on the reweighting strategy BY REFERENCE
+
+
+ if (reweight == "on_N") {
+ X[-c(1, .N), shapley_weight := mean(shapley_weight), by = N]
+ } else if (reweight == "on_coal_size") {
+ X[-c(1, .N), shapley_weight := mean(shapley_weight), by = coalition_size]
+ } else if (reweight == "on_all") {
+ m <- X[.N, coalition_size]
+ X[-c(1, .N), shapley_weight := shapley_weights(
+ m = m,
+ N = N,
+ n_components = coalition_size,
+ weight_zero_m = 10^6
+ ) / sum_shapley_weights(m)]
+ } else if (reweight == "on_N_sum") {
+ X[-c(1, .N), shapley_weight := sum(shapley_weight), by = N]
+ } else if (reweight == "on_all_cond") {
+ m <- X[.N, coalition_size]
+ K <- X[, sum(sample_freq)]
+ X[-c(1, .N), shapley_weight := shapley_weights(
+ m = m,
+ N = N,
+ n_components = coalition_size,
+ weight_zero_m = 10^6
+ ) / sum_shapley_weights(m)]
+ X[-c(1, .N), cond := 1 - (1 - shapley_weight)^K]
+ X[-c(1, .N), shapley_weight := shapley_weight / cond]
+ } else if (reweight == "on_all_cond_paired") {
+ m <- X[.N, coalition_size]
+ K <- X[, sum(sample_freq)]
+ X[-c(1, .N), shapley_weight := shapley_weights(
+ m = m,
+ N = N,
+ n_components = coalition_size,
+ weight_zero_m = 10^6
+ ) / sum_shapley_weights(m)]
+ X[-c(1, .N), cond := 1 - (1 - 2 * shapley_weight)^(K / 2)]
+ X[-c(1, .N), shapley_weight := 2 * shapley_weight / cond]
+ }
+ # strategy= "none" or something else do nothing
+ return(NULL)
+}
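+
+# Illustrative sketch (not run): X is a sampled coalition table as produced by
+# sample_coalition_table(). With reweight = "on_N", every non-boundary
+# coalition gets the mean Shapley weight among coalitions sharing the same N
+# (the number of possible coalitions of that size); the update happens by
+# reference, so nothing needs to be reassigned:
+#
+# shapley_reweighting(X, reweight = "on_N")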
+
+
+#' @keywords internal
+exact_coalition_table <- function(m, weight_zero_m = 10^6) {
+ dt <- data.table::data.table(id_coalition = seq(2^m))
+ coalitions0 <- lapply(0:m, utils::combn, x = m, simplify = FALSE)
+ dt[, coalitions := unlist(coalitions0, recursive = FALSE)]
+ dt[, coalition_size := length(coalitions[[1]]), id_coalition]
+ dt[, N := .N, coalition_size]
+ dt[, shapley_weight := shapley_weights(m = m, N = N, n_components = coalition_size, weight_zero_m)]
+ dt[, sample_freq := NA]
+ return(dt)
+}
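+
+# Illustrative sketch (not run): for m = 2 the table has 2^2 = 4 rows; the
+# empty and full coalitions get shapley_weight = weight_zero_m, while each
+# singleton coalition gets (m - 1) / (N * size * (m - size)) = 1 / 2:
+#
+# exact_coalition_table(m = 2)[, .(id_coalition, coalition_size, shapley_weight)]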
+
+#' @keywords internal
+sample_coalition_table <- function(m,
+ n_coalitions = 200,
+ weight_zero_m = 10^6,
+ paired_shap_sampling = TRUE,
+ prev_coal_samples = NULL,
+ shapley_reweighting) {
+ # Setup
+ coal_samp_vec <- seq(m - 1)
+ n <- sapply(coal_samp_vec, choose, n = m)
+ w <- shapley_weights(m = m, N = n, coal_samp_vec) * n
+ p <- w / sum(w)
+
+
+ if (!is.null(prev_coal_samples)) {
+ coal_sample_all <- prev_coal_samples
+ unique_samples <- length(unique(prev_coal_samples))
+ n_coalitions <- min(2^m, n_coalitions)
+    # Adjusts for the unique samples, plus the zero and m samples
+ } else {
+ coal_sample_all <- list()
+ unique_samples <- 0
+ }
+
+ while (unique_samples < n_coalitions - 2) {
+    if (paired_shap_sampling) {
+ n_samps <- ceiling((n_coalitions - unique_samples - 2) / 2) # Sample -2 as we add zero and m samples below
+ } else {
+ n_samps <- n_coalitions - unique_samples - 2 # Sample -2 as we add zero and m samples below
+ }
+
+ # Sample the coalition size ----------
+ coal_size_sample <- sample(
+ x = coal_samp_vec,
+ size = n_samps,
+ replace = TRUE,
+ prob = p
+ )
+
+ # Sample specific coalitions -------
+ coal_sample <- sample_features_cpp(m, coal_size_sample)
+    if (paired_shap_sampling) {
+ coal_sample_paired <- lapply(coal_sample, function(x) seq(m)[-x])
+ coal_sample_all <- c(coal_sample_all, coal_sample, coal_sample_paired)
+ } else {
+ coal_sample_all <- c(coal_sample_all, coal_sample)
+ }
+ unique_samples <- length(unique(coal_sample_all))
+ }
+
+ # Add zero and full prediction
+ coal_sample_all <- c(list(integer(0)), coal_sample_all, list(c(1:m)))
+ X <- data.table(coalition_size = sapply(coal_sample_all, length))
+ X[, coalition_size := as.integer(coalition_size)]
+
+  # Get number of occurrences and duplicated rows -------
+ is_duplicate <- NULL # due to NSE notes in R CMD check
+ r <- helper_feature(m, coal_sample_all)
+ X[, is_duplicate := r[["is_duplicate"]]]
+
+  # When we sample coalitions, the Shapley weight is equal
+  # to the sampling frequency of the given coalition
+ X[, sample_freq := r[["sample_frequence"]]] # We keep an unscaled version of the sampling frequency for bootstrapping
+ X[, shapley_weight := as.numeric(sample_freq)] # Convert to double for later calculations
+
+ # Populate table and remove duplicated rows -------
+ X[, coalitions := coal_sample_all]
+ if (any(X[["is_duplicate"]])) {
+ X <- X[is_duplicate == FALSE]
+ }
+ X[, is_duplicate := NULL]
+ data.table::setkeyv(X, "coalition_size")
+
+
+ #### TODO: Check if this could be removed: ####
+ ### Start of possible removal ###
+  # Represent the coalitions as character strings (used as grouping key below)
+ X[, coalitions_tmp := sapply(coalitions, paste, collapse = " ")]
+
+ # Aggregate weights by how many samples of a coalition we observe
+ X <- X[, .(
+ coalition_size = data.table::first(coalition_size),
+ shapley_weight = sum(shapley_weight),
+ sample_freq = sum(sample_freq),
+ coalitions = coalitions[1]
+ ), coalitions_tmp]
+
+ X[, coalitions_tmp := NULL]
+ #### End of possible removal ####
+
+ data.table::setorder(X, coalition_size)
+
+  # Set the Shapley weight of the empty and grand coalitions, and the number of coalitions per size
+ X[c(1, .N), shapley_weight := weight_zero_m]
+ X[, N := 1]
+ ind <- X[, .I[data.table::between(coalition_size, 1, m - 1)]]
+ X[ind, p := p[coalition_size]]
+ X[ind, N := n[coalition_size]]
+
+ # Set column order and key table
+ data.table::setkeyv(X, "coalition_size")
+ X[, id_coalition := .I]
+ X[, N := as.integer(N)]
+ nms <- c("id_coalition", "coalitions", "coalition_size", "N", "shapley_weight", "p", "sample_freq")
+ data.table::setcolorder(X, nms)
+
+ shapley_reweighting(X, reweight = shapley_reweighting) # Reweights the shapley weights in X by reference
+
+ return(X)
+}
+
+
+#' Calculate Shapley weight
+#'
+#' @param m Positive integer. Total number of features/feature groups.
+#' @param n_components Positive integer. Represents the number of features/feature groups you want to sample from
+#' a feature space consisting of `m` unique features/feature groups. Note that `0 <= n_components <= m`.
+#' @param N Positive integer. The number of unique coalitions when sampling `n_components` features/feature
+#' groups, without replacement, from a sample space consisting of `m` different features/feature groups.
+#' @param weight_zero_m Positive integer. Represents the Shapley weight for two special
+#' cases, i.e. the case where you have either `0` or `m` features/feature groups.
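+#'
+#' @details The computed weight is the Shapley kernel weight
+#' \deqn{w(m, s) = \frac{m - 1}{N \cdot s \cdot (m - s)}}{w(m, s) = (m - 1) / (N * s * (m - s))}
+#' where \eqn{s} equals `n_components`. For \eqn{s = 0} and \eqn{s = m} the expression is
+#' non-finite and is replaced by `weight_zero_m`.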
+#'
+#' @return Numeric
+#' @keywords internal
+#'
+#' @author Nikolai Sellereite
+shapley_weights <- function(m, N, n_components, weight_zero_m = 10^6) {
+ x <- (m - 1) / (N * n_components * (m - n_components))
+ x[!is.finite(x)] <- weight_zero_m
+ x
+}
+
+#' @keywords internal
+sum_shapley_weights <- function(m) {
+ coal_samp_vec <- seq(m - 1)
+ n <- sapply(coal_samp_vec, choose, n = m)
+ w <- shapley_weights(m = m, N = n, coal_samp_vec) * n
+ return(sum(w))
+}
+
+
+#' @keywords internal
+helper_feature <- function(m, coal_sample) {
+ x <- coalition_matrix_cpp(coal_sample, m)
+ dt <- data.table::data.table(x)
+ cnms <- paste0("V", seq(m))
+ data.table::setnames(dt, cnms)
+ dt[, sample_frequence := as.integer(.N), by = cnms]
+ dt[, is_duplicate := duplicated(dt)]
+ dt[, (cnms) := NULL]
+
+ return(dt)
+}
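+
+# Sketch (assuming coalition_matrix_cpp() returns the binary coalition matrix):
+# helper_feature(3, list(1L, 1L, 2:3)) gives sample_frequence c(2, 2, 1) and
+# is_duplicate c(FALSE, TRUE, FALSE), as the first two coalitions coincide.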
+
+
+
+
+#' @keywords internal
+coal_feature_mapper <- function(x, coal_feature_list) {
+ if (length(x) != 0) {
+ unlist(coal_feature_list[x])
+ } else {
+ integer(0)
+ }
+}
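+
+# e.g. coal_feature_mapper(c(1, 3), coal_feature_list = list(1:2, 3, 4:5))
+# returns c(1, 2, 4, 5), while an empty coalition maps to integer(0).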
+
+#' Calculate weight matrix
+#'
+#' @param X data.table
+#' @param normalize_W_weights Logical. Whether to normalize the weights for the coalitions to sum to 1 for
+#' increased numerical stability before solving the WLS (weighted least squares). Applies to all coalitions
+#' except coalition `1` and `2^m`.
+#'
+#' @return Numeric matrix. See [weight_matrix_cpp()] for more information.
+#' @keywords internal
+#'
+#' @export
+#' @author Nikolai Sellereite, Martin Jullum
+weight_matrix <- function(X, normalize_W_weights = TRUE) {
+ # Fetch weights
+ w <- X[["shapley_weight"]]
+
+ if (normalize_W_weights) {
+ w[-c(1, length(w))] <- w[-c(1, length(w))] / sum(w[-c(1, length(w))])
+ }
+
+ W <- weight_matrix_cpp(
+ coalitions = X[["coalitions"]],
+ m = X[.N][["coalition_size"]],
+ n = X[, .N],
+ w = w
+ )
+ return(W)
+}
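+
+# Minimal sketch (not run; assumes the exact table for m = 3 from above): W is the
+# weighted-least-squares mapping from the coalition value vector to the Shapley values.
+# X <- create_coalition_table(m = 3)
+# W <- weight_matrix(X, normalize_W_weights = TRUE)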
+
+#' @keywords internal
+create_S_batch_forecast <- function(internal, seed = NULL) { # Temporarily used for forecast only; to be removed
+ n_shapley_values <- internal$parameters$n_shapley_values
+ approach0 <- internal$parameters$approach
+  n_batches <- 10 # TODO: fix this similarly to how it is done in the non-forecast setting
+
+ iter <- length(internal$iter_list)
+
+ n_coalitions <- internal$iter_list[[iter]]$n_coalitions
+
+ X <- internal$objects$X
+
+ if (!is.null(seed)) set.seed(seed)
+
+ if (length(approach0) > 1) {
+    X[!(coalition_size %in% c(0, n_shapley_values)), approach := approach0[coalition_size]]
+
+ # Finding the number of batches per approach
+ batch_count_dt <- X[!is.na(approach), list(
+ n_batches_per_approach =
+ pmax(1, round(.N / (n_coalitions - 2) * n_batches)),
+ n_S_per_approach = .N
+ ), by = approach]
+
+ # Ensures that the number of batches corresponds to `n_batches`
+ if (sum(batch_count_dt$n_batches_per_approach) != n_batches) {
+ # Ensure that the number of batches is not larger than `n_batches`.
+ # Remove one batch from the approach with the most batches.
+ while (sum(batch_count_dt$n_batches_per_approach) > n_batches) {
+ batch_count_dt[
+ which.max(n_batches_per_approach),
+ n_batches_per_approach := n_batches_per_approach - 1
+ ]
+ }
+
+ # Ensure that the number of batches is not lower than `n_batches`.
+      # Add one batch to the approach with the most coalitions per batch
+ while (sum(batch_count_dt$n_batches_per_approach) < n_batches) {
+ batch_count_dt[
+ which.max(n_S_per_approach / n_batches_per_approach),
+ n_batches_per_approach := n_batches_per_approach + 1
+ ]
+ }
+ }
+
+ batch_count_dt[, n_leftover_first_batch := n_S_per_approach %% n_batches_per_approach]
+ data.table::setorder(batch_count_dt, -n_leftover_first_batch)
+
+ approach_vec <- batch_count_dt[, approach]
+ n_batch_vec <- batch_count_dt[, n_batches_per_approach]
+
+    # Randomize the order before spreading the batches across the different approaches as evenly as possible
+ # with respect to shapley_weight
+ X[, randomorder := sample(.N)]
+    data.table::setorder(X, randomorder) # To avoid smaller id_coalitions always preceding larger ones
+ data.table::setorder(X, shapley_weight)
+
+ batch_counter <- 0
+ for (i in seq_along(approach_vec)) {
+ X[approach == approach_vec[i], batch := ceiling(.I / .N * n_batch_vec[i]) + batch_counter]
+ batch_counter <- X[approach == approach_vec[i], max(batch)]
+ }
+ } else {
+    X[!(coalition_size %in% c(0, n_shapley_values)), approach := approach0]
+
+ # Spreading the batches
+ X[, randomorder := sample(.N)]
+ data.table::setorder(X, randomorder)
+ data.table::setorder(X, shapley_weight)
+ X[!(coalition_size %in% c(0, n_shapley_values)), batch := ceiling(.I / .N * n_batches)]
+ }
+
+  # Assigning batch 1 (which is always the smallest) to the full prediction.
+ X[, randomorder := NULL]
+ X[id_coalition == max(id_coalition), batch := 1]
+ setkey(X, id_coalition)
+
+ # Create a list of the batch splits
+ S_groups <- split(X[id_coalition != 1, id_coalition], X[id_coalition != 1, batch])
+
+ return(S_groups)
+}
+
+#' @keywords internal
+create_S_batch <- function(internal, seed = NULL) {
+ n_shapley_values <- internal$parameters$n_shapley_values
+ approach0 <- internal$parameters$approach
+
+ iter <- length(internal$iter_list)
+
+ n_coalitions <- internal$iter_list[[iter]]$n_coalitions
+ n_batches <- internal$iter_list[[iter]]$n_batches
+
+ exact <- internal$iter_list[[iter]]$exact
+
+
+ coalition_map <- internal$iter_list[[iter]]$coalition_map
+
+
+ X0 <- copy(internal$iter_list[[iter]]$X)
+
+ if (iter > 1) {
+ prev_coalition_map <- internal$iter_list[[iter - 1]]$coalition_map
+ new_id_coalitions <- coalition_map[
+      !(coalitions_str %in% prev_coalition_map[-c(1, .N), coalitions_str]),
+ id_coalition
+ ]
+ X0 <- X0[id_coalition %in% new_id_coalitions]
+ }
+
+ # Reduces n_batches if it is larger than the number of new_id_coalitions
+ n_batches <- min(n_batches, X0[, .N] - 2)
+
+
+ if (!is.null(seed)) set.seed(seed)
+
+ if (length(approach0) > 1) {
+ X0[!(coalition_size %in% c(0, n_shapley_values)), approach := approach0[coalition_size]]
+
+ # Finding the number of batches per approach
+ batch_count_dt <- X0[!is.na(approach), list(
+ n_batches_per_approach =
+ pmax(1, round(.N / (n_coalitions - 2) * n_batches)),
+ n_S_per_approach = .N
+ ), by = approach]
+
+ # Ensures that the number of batches corresponds to `n_batches`
+ if (sum(batch_count_dt$n_batches_per_approach) != n_batches) {
+ # Ensure that the number of batches is not larger than `n_batches`.
+ # Remove one batch from the approach with the most batches.
+ while (sum(batch_count_dt$n_batches_per_approach) > n_batches) {
+ batch_count_dt[
+ which.max(n_batches_per_approach),
+ n_batches_per_approach := n_batches_per_approach - 1
+ ]
+ }
+
+ # Ensure that the number of batches is not lower than `n_batches`.
+      # Add one batch to the approach with the most coalitions per batch
+ while (sum(batch_count_dt$n_batches_per_approach) < n_batches) {
+ batch_count_dt[
+ which.max(n_S_per_approach / n_batches_per_approach),
+ n_batches_per_approach := n_batches_per_approach + 1
+ ]
+ }
+ }
+
+ batch_count_dt[, n_leftover_first_batch := n_S_per_approach %% n_batches_per_approach]
+ data.table::setorder(batch_count_dt, -n_leftover_first_batch)
+
+ approach_vec <- batch_count_dt[, approach]
+ n_batch_vec <- batch_count_dt[, n_batches_per_approach]
+
+    # Randomize the order before spreading the batches across the different approaches as evenly as possible
+ # with respect to shapley_weight
+ X0[, randomorder := sample(.N)]
+    data.table::setorder(X0, randomorder) # To avoid smaller id_coalitions always preceding larger ones
+ data.table::setorder(X0, shapley_weight)
+
+ batch_counter <- 0
+ for (i in seq_along(approach_vec)) {
+ X0[approach == approach_vec[i], batch := ceiling(.I / .N * n_batch_vec[i]) + batch_counter]
+ batch_counter <- X0[approach == approach_vec[i], max(batch)]
+ }
+ } else {
+ X0[!(coalition_size %in% c(0, n_shapley_values)), approach := approach0]
+
+ # Spreading the batches
+ X0[, randomorder := sample(.N)]
+ data.table::setorder(X0, randomorder)
+ data.table::setorder(X0, shapley_weight)
+ X0[!(coalition_size %in% c(0, n_shapley_values)), batch := ceiling(.I / .N * n_batches)]
+ }
+
+  # Assigning batch 1 (which is always the smallest) to the full prediction.
+ X0[, randomorder := NULL]
+ X0[id_coalition == max(id_coalition), batch := 1]
+ setkey(X0, id_coalition)
+
+ # Create a list of the batch splits
+ S_groups <- split(X0[id_coalition != 1, id_coalition], X0[id_coalition != 1, batch])
+
+ return(S_groups)
+}
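+
+# Sketch (assumes a populated `internal` list as used throughout this file):
+# S_groups <- create_S_batch(internal, seed = 1)
+# S_groups is a list with one vector of id_coalition values per batch.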
+
+
+#' Sets up everything for the Shapley values computation in [shapr::explain()]
+#'
+#' @inheritParams default_doc
+#' @inheritParams explain
+#' @inherit default_doc
+#' @export
+setup_computation <- function(internal, model, predict_model) {
+ # model and predict_model are only needed for type AICc of approach empirical, otherwise ignored
+ type <- internal$parameters$type
+
+ # setup the Shapley framework
+ internal <- if (type == "forecast") shapley_setup_forecast(internal) else shapley_setup(internal)
+
+ # Setup for approach
+ internal <- setup_approach(internal, model = model, predict_model = predict_model)
+
+ return(internal)
+}
+
+#' @keywords internal
+shapley_setup_forecast <- function(internal) {
+ n_shapley_values <- internal$parameters$n_shapley_values
+ n_features <- internal$parameters$n_features
+ approach <- internal$parameters$approach
+ is_groupwise <- internal$parameters$is_groupwise
+ paired_shap_sampling <- internal$parameters$paired_shap_sampling
+ shapley_reweighting <- internal$parameters$shapley_reweighting
+
+ coal_feature_list <- internal$objects$coal_feature_list
+ horizon <- internal$parameters$horizon
+ feature_names <- internal$parameters$feature_names
+
+ iter <- length(internal$iter_list)
+
+ n_coalitions <- internal$iter_list[[iter]]$n_coalitions
+ exact <- internal$iter_list[[iter]]$exact
+ prev_coal_samples <- internal$iter_list[[iter]]$prev_coal_samples
+
+ X_list <- W_list <- list()
+
+ # Find columns to be included in each of the different horizons
+ col_del_list <- list()
+ col_del_list[[1]] <- numeric()
+ if (horizon > 1) {
+ k <- 2
+ for (i in rev(seq_len(horizon)[-1])) {
+ col_del_list[[k]] <- c(unlist(col_del_list[[k - 1]]), grep(paste0(".F", i), feature_names))
+ k <- k + 1
+ }
+ }
+
+ cols_per_horizon <- lapply(rev(col_del_list), function(x) if (length(x) > 0) feature_names[-x] else feature_names)
+
+ horizon_features <- lapply(cols_per_horizon, function(x) which(internal$parameters$feature_names %in% x))
+
+  # Apply create_coalition_table, weight_matrix and coalition_matrix_cpp to each of the different horizons
+ for (i in seq_along(horizon_features)) {
+    # TODO: Something is not correct in the next 20 lines of code. Something was messed up after
+    # removing the group stuff and generalizing to coalitions.
+
+ this_featcomb <- horizon_features[[i]]
+
+ this_coal_feature_list <- lapply(coal_feature_list, function(x) x[x %in% this_featcomb])
+ this_coal_feature_list <- this_coal_feature_list[sapply(this_coal_feature_list, function(x) length(x) != 0)]
+
+ n_this_featcomb <- length(this_coal_feature_list)
+
+
+ n_coalitions_here <- min(2^n_this_featcomb, n_coalitions)
+ exact_here <- ifelse(n_coalitions_here == 2^n_this_featcomb, TRUE, exact)
+
+
+ X_list[[i]] <- create_coalition_table(
+ m = n_this_featcomb,
+ exact = exact_here,
+ n_coalitions = n_coalitions_here,
+ weight_zero_m = 10^6,
+ paired_shap_sampling = paired_shap_sampling,
+ prev_coal_samples = prev_coal_samples,
+ coal_feature_list = this_coal_feature_list,
+ approach0 = approach
+ )
+
+
+ W_list[[i]] <- weight_matrix(
+ X = X_list[[i]],
+ normalize_W_weights = TRUE
+ )
+ }
+
+  # Merge the coalition data.tables into a single one for computing conditional expectations later on
+ X <- rbindlist(X_list, idcol = "horizon")
+ X[, N := NA]
+ X[, shapley_weight := NA]
+ data.table::setorderv(X, c("coalition_size", "horizon"), order = c(1, -1))
+ X[, horizon_id_coalition := id_coalition]
+ X[, id_coalition := 0]
+ X[!duplicated(coalitions), id_coalition := .I]
+ X[, tmp_coalitions := as.character(coalitions)]
+ X[, id_coalition := max(id_coalition), by = tmp_coalitions]
+ X[, tmp_coalitions := NULL]
+
+ # Extracts a data.table allowing mapping from X to X_list/W_list to be used in the compute_shapley function
+ id_coalition_mapper_dt <- X[, .(horizon, horizon_id_coalition, id_coalition)]
+
+ X[, horizon := NULL]
+ X[, horizon_id_coalition := NULL]
+ data.table::setorder(X, coalition_size)
+ X <- X[!duplicated(id_coalition)]
+
+ W <- NULL # Included for consistency. Necessary weights are in W_list instead
+
+ ## Get feature matrix ---------
+ S <- coalition_matrix_cpp(
+ coalitions = X[["features"]],
+ m = n_features
+ )
+
+
+ #### Updating parameters ####
+
+ # Updating parameters$exact as done in create_coalition_table
+ if (!exact && n_coalitions >= 2^n_shapley_values) {
+ internal$parameters$exact <- TRUE # Note that this is exact only if all horizons use the exact method.
+ }
+
+  internal$iter_list[[iter]]$n_coalitions <- nrow(S) # Update this parameter at the end, based on what is actually used.
+
+ # This will be obsolete later
+  internal$parameters$group_num <- NULL # TODO: Check whether I could just do this processing where needed
+ # instead of storing it
+
+ internal$objects$X <- X
+ internal$objects$W <- W
+ internal$objects$S <- S
+ internal$objects$S_batch <- create_S_batch_forecast(internal)
+
+ internal$objects$id_coalition_mapper_dt <- id_coalition_mapper_dt
+ internal$objects$cols_per_horizon <- cols_per_horizon
+ internal$objects$W_list <- W_list
+ internal$objects$X_list <- X_list
+
+
+ return(internal)
+}
diff --git a/R/shapr-package.R b/R/shapr-package.R
index fd368e8b4..fc618b593 100644
--- a/R/shapr-package.R
+++ b/R/shapr-package.R
@@ -25,8 +25,12 @@
#'
#' @importFrom stats rnorm
#'
+#' @importFrom stats median
+#'
#' @importFrom Rcpp sourceCpp
#'
+#' @importFrom utils capture.output
+#'
#' @keywords internal
#'
#' @useDynLib shapr, .registration = TRUE
diff --git a/R/timing.R b/R/timing.R
index b5ac27c95..5dc129dbd 100644
--- a/R/timing.R
+++ b/R/timing.R
@@ -1,16 +1,50 @@
-compute_time <- function(timing_list) {
- timing_secs <- mapply(
+#' Gathers and computes the timing of the different parts of the explain function.
+#'
+#' @inheritParams default_doc_explain
+#'
+#' @export
+#' @keywords internal
+compute_time <- function(internal) {
+ verbose <- internal$parameters$verbose
+
+ main_timing_list <- internal$main_timing_list
+ iter_timing_list <- internal$iter_timing_list
+
+
+ main_timing_secs <- mapply(
FUN = difftime,
- timing_list[-1],
- timing_list[-length(timing_list)],
+ main_timing_list[-1],
+ main_timing_list[-length(main_timing_list)],
units = "secs"
)
+ iter_timing_secs_list <- list()
+ for (i in seq_along(iter_timing_list)) {
+ iter_timing_secs_list[[i]] <- as.list(mapply(
+ FUN = difftime,
+ iter_timing_list[[i]][-1],
+ iter_timing_list[[i]][-length(iter_timing_list[[i]])],
+ units = "secs"
+ ))
+ }
+ iter_timing_secs_dt <- data.table::rbindlist(iter_timing_secs_list)
+ iter_timing_secs_dt[, total := rowSums(.SD)]
+ iter_timing_secs_dt[, iter := .I]
+ data.table::setcolorder(iter_timing_secs_dt, "iter")
+
+ total_time_secs <- main_timing_list[[length(main_timing_list)]] - main_timing_list[[1]]
+
+
timing_output <- list(
- init_time = timing_list$init,
- total_time_secs = sum(timing_secs),
- timing_secs = timing_secs
+ init_time = main_timing_list[[1]],
+ end_time = main_timing_list[[length(main_timing_list)]],
+ total_time_secs = total_time_secs,
+ overall_timing_secs = main_timing_secs,
+ main_computation_timing_secs = iter_timing_secs_dt[]
)
+ internal$main_timing_list <- internal$iter_timing_list <- NULL
+
+
return(timing_output)
}
diff --git a/R/zzz.R b/R/zzz.R
index d021bb329..bf72029c2 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -11,7 +11,7 @@
"N",
"id_all",
"id",
- "id_combination",
+ "id_coalition",
"w",
"id_all",
"joint_prob",
@@ -77,7 +77,7 @@
"batch",
"type",
"feature_value_factor",
- "horizon_id_combination",
+ "horizon_id_coalition",
"tmp_features",
"Method",
"MSEv",
@@ -107,7 +107,6 @@
"x_train_torch",
"self",
"..current_comb",
- "..regression.response_var" # test
)
)
diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib
index ccce694e2..07c5a150f 100644
--- a/inst/REFERENCES.bib
+++ b/inst/REFERENCES.bib
@@ -190,3 +190,12 @@ @Manual{tidymodels
year = {2020},
}
+@inproceedings{covert2021improving,
+ title={Improving {KernelSHAP}: Practical {Shapley} value estimation using linear regression},
+ author={Covert, Ian and Lee, Su-In},
+ booktitle={International Conference on Artificial Intelligence and Statistics},
+ pages={3457--3465},
+ year={2021},
+ organization={PMLR}
+}
+
diff --git a/inst/scripts/analyze_bash_test_data.R b/inst/scripts/analyze_bash_test_data.R
index 519801de3..3cd9435e4 100644
--- a/inst/scripts/analyze_bash_test_data.R
+++ b/inst/scripts/analyze_bash_test_data.R
@@ -52,10 +52,10 @@ dt_time0 <- fread("inst/scripts/timing_test_2023_new2.csv")
dt_time0[,n_batches_real:=pmin(2^p-2,n_batches)]
-dt_time <- dt_time0[,.(time,secs_explain,timing_setup,timing_test_prediction, timing_setup_computation ,timing_compute_vS ,timing_postprocessing ,timing_shapley_computation, rep,p,n_train,n_explain,n_batches_real,approach,n_combinations)]
+dt_time <- dt_time0[,.(time,secs_explain,timing_setup,timing_test_prediction, timing_setup_computation ,timing_compute_vS ,timing_postprocessing ,timing_shapley_computation, rep,p,n_train,n_explain,n_batches_real,approach,n_coalitions)]
dt_time[n_batches_real==1,secs_explain_singlebatch :=secs_explain]
-dt_time[,secs_explain_singlebatch:=mean(secs_explain_singlebatch,na.rm=T),by=.(p,n_train,n_explain,approach,n_combinations)]
+dt_time[,secs_explain_singlebatch:=mean(secs_explain_singlebatch,na.rm=T),by=.(p,n_train,n_explain,approach,n_coalitions)]
dt_time[,secs_explain_prop_singlebatch:=secs_explain/secs_explain_singlebatch]
ggplot(dt_time[p<14],aes(x=n_batches_real,y=secs_explain,col=as.factor(n_explain),linetype=as.factor(n_train)))+
@@ -101,14 +101,14 @@ ggplot(dt_time[p<16& p>2 & approach=="empirical"],aes(x=n_batches_real,y=secs_ex
# max 100, min 10
n_batches_fun <- function(approach,p){
- n_combinations <- 2^p-2
+ n_coalitions <- 2^p-2
if(approach %in% c("ctree","gaussian","copula")){
- init <- ceiling(n_combinations/10)
+ init <- ceiling(n_coalitions/10)
floor <- max(c(10,init))
ret <- min(c(1000,floor))
} else {
- init <- ceiling(n_combinations/100)
+ init <- ceiling(n_coalitions/100)
floor <- max(c(2,init))
ret <- min(c(100,floor))
}
diff --git a/inst/scripts/compare_copula_in_R_and_C++.R b/inst/scripts/compare_copula_in_R_and_C++.R
index fd6b1cfb4..d0698dbf2 100644
--- a/inst/scripts/compare_copula_in_R_and_C++.R
+++ b/inst/scripts/compare_copula_in_R_and_C++.R
@@ -41,10 +41,10 @@ prepare_data.copula_old <- function(internal, index_features = NULL, ...) {
x_train = as.matrix(x_train),
x_explain_gaussian = as.matrix(copula.x_explain_gaussian)[i, , drop = FALSE]
)
- dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination")
+ dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_coalition")
dt_l[[i]][, w := 1 / n_samples]
dt_l[[i]][, id := i]
- if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]]
+ if (!is.null(index_features)) dt_l[[i]][, id_coalition := index_features[id_coalition]]
}
dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE)
@@ -171,7 +171,7 @@ prepare_data.copula_cpp_arma <- function(internal, index_features, ...) {
n_explain <- internal$parameters$n_explain
n_samples <- internal$parameters$n_samples
n_features <- internal$parameters$n_features
- n_combinations_now <- length(index_features)
+ n_coalitions_now <- length(index_features)
x_train_mat <- as.matrix(internal$data$x_train)
x_explain_mat <- as.matrix(internal$data$x_explain)
copula.mu <- internal$parameters$copula.mu
@@ -199,16 +199,16 @@ prepare_data.copula_cpp_arma <- function(internal, index_features, ...) {
)
# Reshape `dt` to a 2D array of dimension (n_samples * n_explain * n_coalitions, n_features).
- dim(dt) <- c(n_combinations_now * n_explain * n_samples, n_features)
+ dim(dt) <- c(n_coalitions_now * n_explain * n_samples, n_features)
# Convert to a data.table and add extra identification columns
dt <- data.table::as.data.table(dt)
data.table::setnames(dt, feature_names)
- dt[, id_combination := rep(seq_len(nrow(S)), each = n_samples * n_explain)]
+ dt[, id_coalition := rep(seq_len(nrow(S)), each = n_samples * n_explain)]
dt[, id := rep(seq(n_explain), each = n_samples, times = nrow(S))]
dt[, w := 1 / n_samples]
- dt[, id_combination := index_features[id_combination]]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ dt[, id_coalition := index_features[id_coalition]]
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
return(dt)
}
@@ -229,7 +229,7 @@ prepare_data.copula_cpp_and_R <- function(internal, index_features, ...) {
n_explain <- internal$parameters$n_explain
n_samples <- internal$parameters$n_samples
n_features <- internal$parameters$n_features
- n_combinations_now <- length(index_features)
+ n_coalitions_now <- length(index_features)
x_train_mat <- as.matrix(internal$data$x_train)
x_explain_mat <- as.matrix(internal$data$x_explain)
copula.mu <- internal$parameters$copula.mu
@@ -257,16 +257,16 @@ prepare_data.copula_cpp_and_R <- function(internal, index_features, ...) {
)
# Reshape `dt` to a 2D array of dimension (n_samples * n_explain * n_coalitions, n_features).
- dim(dt) <- c(n_combinations_now * n_explain * n_samples, n_features)
+ dim(dt) <- c(n_coalitions_now * n_explain * n_samples, n_features)
# Convert to a data.table and add extra identification columns
dt <- data.table::as.data.table(dt)
data.table::setnames(dt, feature_names)
- dt[, id_combination := rep(seq_len(nrow(S)), each = n_samples * n_explain)]
+ dt[, id_coalition := rep(seq_len(nrow(S)), each = n_samples * n_explain)]
dt[, id := rep(seq(n_explain), each = n_samples, times = nrow(S))]
dt[, w := 1 / n_samples]
- dt[, id_combination := index_features[id_combination]]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ dt[, id_coalition := index_features[id_coalition]]
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
return(dt)
}
@@ -327,7 +327,7 @@ prepare_data.copula_sourceCpp <- function(internal, index_features, ...) {
n_explain <- internal$parameters$n_explain
n_samples <- internal$parameters$n_samples
n_features <- internal$parameters$n_features
- n_combinations_now <- length(index_features)
+ n_coalitions_now <- length(index_features)
x_train_mat <- as.matrix(internal$data$x_train)
x_explain_mat <- as.matrix(internal$data$x_explain)
copula.mu <- internal$parameters$copula.mu
@@ -351,16 +351,16 @@ prepare_data.copula_sourceCpp <- function(internal, index_features, ...) {
)
# Reshape `dt` to a 2D array of dimension (n_samples * n_explain * n_coalitions, n_features).
- dim(dt) <- c(n_combinations_now * n_explain * n_samples, n_features)
+ dim(dt) <- c(n_coalitions_now * n_explain * n_samples, n_features)
# Convert to a data.table and add extra identification columns
dt <- data.table::as.data.table(dt)
data.table::setnames(dt, feature_names)
- dt[, id_combination := rep(seq_len(nrow(S)), each = n_samples * n_explain)]
+ dt[, id_coalition := rep(seq_len(nrow(S)), each = n_samples * n_explain)]
dt[, id := rep(seq(n_explain), each = n_samples, times = nrow(S))]
dt[, w := 1 / n_samples]
- dt[, id_combination := index_features[id_combination]]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ dt[, id_coalition := index_features[id_coalition]]
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
return(dt)
}
@@ -444,7 +444,7 @@ using namespace Rcpp;
// observations to explain after being transformed using the Gaussian transform, i.e., the samples have been
// transformed to a standardized normal distribution.
// @param x_train_mat arma::mat. Matrix of dimension (`n_train`, `n_features`) containing the training observations.
-// @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of
+// @param S arma::mat. Matrix of dimension (`n_coalitions`, `n_features`) containing binary representations of
// the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
// This is not a problem internally in shapr as the empty and grand coalitions treated differently.
// @param mu arma::vec. Vector of length `n_features` containing the mean of each feature after being transformed
@@ -642,7 +642,7 @@ arma::mat inv_gaussian_transform_cpp_arma(arma::mat z, arma::mat x) {
// observations to explain after being transformed using the Gaussian transform, i.e., the samples have been
// transformed to a standardized normal distribution.
// @param x_train_mat arma::mat. Matrix of dimension (`n_train`, `n_features`) containing the training observations.
-// @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of
+// @param S arma::mat. Matrix of dimension (`n_coalitions`, `n_features`) containing binary representations of
// the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
// This is not a problem internally in shapr as the empty and grand coalitions treated differently.
// @param mu arma::vec. Vector of length `n_features` containing the mean of each feature after being transformed
@@ -747,7 +747,7 @@ arma::cube prepare_data_copula_cpp_arma(arma::mat MC_samples_mat,
// observations to explain after being transformed using the Gaussian transform, i.e., the samples have been
// transformed to a standardized normal distribution.
// @param x_train_mat arma::mat. Matrix of dimension (`n_train`, `n_features`) containing the training observations.
-// @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of
+// @param S arma::mat. Matrix of dimension (`n_coalitions`, `n_features`) containing binary representations of
// the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
// This is not a problem internally in shapr as the empty and grand coalitions treated differently.
// @param mu arma::vec. Vector of length `n_features` containing the mean of each feature after being transformed
@@ -915,7 +915,7 @@ arma::cube prepare_data_copula_cpp_and_R(arma::mat MC_samples_mat,
predict_model <- NULL
get_model_specs <- NULL
timing <- TRUE
- n_combinations <- NULL
+ n_coalitions <- NULL
group <- NULL
feature_specs <- shapr:::get_feature_specs(get_model_specs, model)
n_batches <- 1
@@ -926,7 +926,7 @@ arma::cube prepare_data_copula_cpp_and_R(arma::mat MC_samples_mat,
x_explain = x_explain,
approach = approach,
prediction_zero = prediction_zero,
- n_combinations = n_combinations,
+ n_coalitions = n_coalitions,
group = group,
n_samples = n_samples,
n_batches = n_batches,
@@ -959,7 +959,7 @@ feature_names <- internal$parameters$feature_names
n_explain <- internal$parameters$n_explain
n_samples <- internal$parameters$n_samples
n_features <- internal$parameters$n_features
-n_combinations_now <- length(index_features)
+n_coalitions_now <- length(index_features)
x_train_mat <- as.matrix(internal$data$x_train)
x_explain_mat <- as.matrix(internal$data$x_explain)
copula.mu <- internal$parameters$copula.mu
@@ -1060,7 +1060,7 @@ time_only_cpp <- system.time({
index_features = internal$objects$S_batch$`1`[look_at_coalitions]
)
})
-data.table::setorderv(res_only_cpp, c("id", "id_combination"))
+data.table::setorderv(res_only_cpp, c("id", "id_coalition"))
time_only_cpp
# The C++ code with my own quantile function
@@ -1070,7 +1070,7 @@ time_only_cpp_sourceCpp <- system.time({
index_features = internal$objects$S_batch$`1`[look_at_coalitions]
)
})
-data.table::setorderv(res_only_cpp_sourceCpp, c("id", "id_combination"))
+data.table::setorderv(res_only_cpp_sourceCpp, c("id", "id_coalition"))
time_only_cpp_sourceCpp
# The C++ code with quantile functions from arma
@@ -1080,7 +1080,7 @@ time_only_cpp_arma <- system.time({
index_features = internal$objects$S_batch$`1`[look_at_coalitions]
)
})
-data.table::setorderv(res_only_cpp_arma, c("id", "id_combination"))
+data.table::setorderv(res_only_cpp_arma, c("id", "id_coalition"))
time_only_cpp_arma
# The new C++ code with quantile from R
@@ -1090,7 +1090,7 @@ time_cpp_and_R <- system.time({
index_features = internal$objects$S_batch$`1`[look_at_coalitions]
)
})
-data.table::setorderv(res_cpp_and_R, c("id", "id_combination"))
+data.table::setorderv(res_cpp_and_R, c("id", "id_coalition"))
time_cpp_and_R
# Create a table of the times. Less is better
@@ -1131,11 +1131,11 @@ res_only_cpp <- res_only_cpp[, w := NULL]
res_only_cpp_sourceCpp <- res_only_cpp_sourceCpp[, w := NULL]
res_only_cpp_arma <- res_only_cpp_arma[, w := NULL]
res_cpp_and_R <- res_cpp_and_R[, w := NULL]
-res_only_R_agr <- res_only_R[, lapply(.SD, mean), by = c("id", "id_combination")]
-res_only_cpp_agr <- res_only_cpp[, lapply(.SD, mean), by = c("id", "id_combination")]
-res_only_cpp_sourceCpp_agr <- res_only_cpp_sourceCpp[, lapply(.SD, mean), by = c("id", "id_combination")]
-res_only_cpp_arma_agr <- res_only_cpp_arma[, lapply(.SD, mean), by = c("id", "id_combination")]
-res_cpp_and_R_agr <- res_cpp_and_R[, lapply(.SD, mean), by = c("id", "id_combination")]
+res_only_R_agr <- res_only_R[, lapply(.SD, mean), by = c("id", "id_coalition")]
+res_only_cpp_agr <- res_only_cpp[, lapply(.SD, mean), by = c("id", "id_coalition")]
+res_only_cpp_sourceCpp_agr <- res_only_cpp_sourceCpp[, lapply(.SD, mean), by = c("id", "id_coalition")]
+res_only_cpp_arma_agr <- res_only_cpp_arma[, lapply(.SD, mean), by = c("id", "id_coalition")]
+res_cpp_and_R_agr <- res_cpp_and_R[, lapply(.SD, mean), by = c("id", "id_coalition")]
# Difference
res_only_R_agr - res_only_cpp_agr
@@ -1409,7 +1409,7 @@ all.equal(shapr_mat_arma_res, sourceCpp_mat_arma_res)
predict_model <- NULL
get_model_specs <- NULL
timing <- TRUE
- n_combinations <- NULL
+ n_coalitions <- NULL
group <- NULL
feature_specs <- shapr:::get_feature_specs(get_model_specs, model)
n_batches <- 1
@@ -1420,7 +1420,7 @@ all.equal(shapr_mat_arma_res, sourceCpp_mat_arma_res)
x_explain = x_explain,
approach = approach,
prediction_zero = prediction_zero,
- n_combinations = n_combinations,
+ n_coalitions = n_coalitions,
group = group,
n_samples = n_samples,
n_batches = n_batches,
@@ -1464,7 +1464,7 @@ time_only_cpp <- system.time({
index_features = internal$objects$S_batch$`1`[look_at_coalitions]
)
})
-data.table::setorderv(res_only_cpp, c("id", "id_combination"))
+data.table::setorderv(res_only_cpp, c("id", "id_coalition"))
time_only_cpp
# The C++ code with my own quantile function
@@ -1474,7 +1474,7 @@ time_only_cpp_sourceCpp <- system.time({
index_features = internal$objects$S_batch$`1`[look_at_coalitions]
)
})
-data.table::setorderv(res_only_cpp_sourceCpp, c("id", "id_combination"))
+data.table::setorderv(res_only_cpp_sourceCpp, c("id", "id_coalition"))
time_only_cpp_sourceCpp
# Look at the differences
@@ -1482,9 +1482,9 @@ time_only_cpp_sourceCpp
# res_only_R <- res_only_R[, w := NULL]
# res_only_cpp <- res_only_cpp[, w := NULL]
# res_only_cpp_sourceCpp <- res_only_cpp_sourceCpp[, w := NULL]
-res_only_R_agr <- res_only_R[, lapply(.SD, mean), by = c("id", "id_combination")]
-res_only_cpp_agr <- res_only_cpp[, lapply(.SD, mean), by = c("id", "id_combination")]
-res_only_cpp_sourceCpp_agr <- res_only_cpp_sourceCpp[, lapply(.SD, mean), by = c("id", "id_combination")]
+res_only_R_agr <- res_only_R[, lapply(.SD, mean), by = c("id", "id_coalition")]
+res_only_cpp_agr <- res_only_cpp[, lapply(.SD, mean), by = c("id", "id_coalition")]
+res_only_cpp_sourceCpp_agr <- res_only_cpp_sourceCpp[, lapply(.SD, mean), by = c("id", "id_coalition")]
# Difference
res_only_R_agr - res_only_cpp_agr
diff --git a/inst/scripts/compare_gaussian_in_R_and_C++.R b/inst/scripts/compare_gaussian_in_R_and_C++.R
index b9ca398aa..0106109a6 100644
--- a/inst/scripts/compare_gaussian_in_R_and_C++.R
+++ b/inst/scripts/compare_gaussian_in_R_and_C++.R
@@ -63,7 +63,7 @@ sample_gaussian <- function(index_given, n_samples, mu, cov_mat, m, x_explain) {
# //' univariate standard normal.
# //' @param x_explain_mat matrix. Matrix of dimension `n_explain` times `n_features` containing the observations
# //' to explain.
-# //' @param S matrix. Matrix of dimension `n_combinations` times `n_features` containing binary representations of
+# //' @param S matrix. Matrix of dimension `n_coalitions` times `n_features` containing binary representations of
# //' the used coalitions.
# //' @param mu vector. Vector of length `n_features` containing the mean of each feature.
# //' @param cov_mat mat. Matrix of dimension `n_features` times `n_features` containing the pariwise covariance between
@@ -72,7 +72,7 @@ sample_gaussian <- function(index_given, n_samples, mu, cov_mat, m, x_explain) {
# //' @export
# //' @keywords internal
# //'
-# //' @return List of length `n_combinations`*`n_samples`, where each entry is a matrix of dimension `n_samples` times
+# //' @return List of length `n_coalitions`*`n_samples`, where each entry is a matrix of dimension `n_samples` times
# //' `n_features` containing the conditional MC samples for each coalition and explicand.
# //' @author Lars Henry Berge Olsen
# // [[Rcpp::export]]
@@ -728,10 +728,10 @@ prepare_data_gaussian_old <- function(internal, index_features = NULL, ...) {
x_explain = x_explain0[i, , drop = FALSE]
)
- dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination")
+ dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_coalition")
dt_l[[i]][, w := 1 / n_samples]
dt_l[[i]][, id := i]
- if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]]
+ if (!is.null(index_features)) dt_l[[i]][, id_coalition := index_features[id_coalition]]
}
dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE)
@@ -756,7 +756,7 @@ prepare_data_gaussian_new_v1 <- function(internal, index_features, ...) {
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -840,18 +840,18 @@ prepare_data_gaussian_new_v1 <- function(internal, index_features, ...) {
)
}
),
- idcol = "id_combination"
+ idcol = "id_coalition"
)
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -875,7 +875,7 @@ prepare_data_gaussian_new_v2 <- function(internal, index_features, ...) {
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -962,18 +962,18 @@ prepare_data_gaussian_new_v2 <- function(internal, index_features, ...) {
)
}
),
- idcol = "id_combination"
+ idcol = "id_coalition"
)
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -997,7 +997,7 @@ prepare_data_gaussian_new_v3 <- function(internal, index_features, ...) {
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -1090,18 +1090,18 @@ prepare_data_gaussian_new_v3 <- function(internal, index_features, ...) {
)
}
),
- idcol = "id_combination"
+ idcol = "id_coalition"
)
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -1124,7 +1124,7 @@ prepare_data_gaussian_new_v4 <- function(internal, index_features, ...) {
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -1213,18 +1213,18 @@ prepare_data_gaussian_new_v4 <- function(internal, index_features, ...) {
)
}
),
- idcol = "id_combination"
+ idcol = "id_coalition"
)
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -1248,7 +1248,7 @@ prepare_data_gaussian_new_v5 <- function(internal, index_features, ...) {
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -1338,18 +1338,18 @@ prepare_data_gaussian_new_v5 <- function(internal, index_features, ...) {
)
}
),
- idcol = "id_combination"
+ idcol = "id_coalition"
)
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -1371,7 +1371,7 @@ prepare_data_gaussian_new_v5_rnorm <- function(internal, index_features, ...) {
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -1467,18 +1467,18 @@ prepare_data_gaussian_new_v5_rnorm <- function(internal, index_features, ...) {
)
}
),
- idcol = "id_combination"
+ idcol = "id_coalition"
)
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -1500,7 +1500,7 @@ prepare_data_gaussian_new_v5_rnorm_v2 <- function(internal, index_features, ...)
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -1593,18 +1593,18 @@ prepare_data_gaussian_new_v5_rnorm_v2 <- function(internal, index_features, ...)
)
}
),
- idcol = "id_combination"
+ idcol = "id_coalition"
)
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -1628,7 +1628,7 @@ prepare_data_gaussian_new_v5_rnorm_cpp <- function(internal, index_features, ...
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -1647,19 +1647,19 @@ prepare_data_gaussian_new_v5_rnorm_cpp <- function(internal, index_features, ...
dt = as.data.table(do.call(rbind, result_list))
setnames(dt, feature_names)
- dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)]
+ dt[, "id_coalition" := rep(seq(nrow(S)), each = n_samples * n_explain)]
dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -1681,7 +1681,7 @@ prepare_data_gaussian_new_v5_rnorm_cpp_with_wrap <- function(internal, index_fea
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -1700,19 +1700,19 @@ prepare_data_gaussian_new_v5_rnorm_cpp_with_wrap <- function(internal, index_fea
dt = as.data.table(do.call(rbind, result_list))
setnames(dt, feature_names)
- dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)]
+ dt[, "id_coalition" := rep(seq(nrow(S)), each = n_samples * n_explain)]
dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -1735,7 +1735,7 @@ prepare_data_gaussian_new_v5_rnorm_cpp_v2 <- function(internal, index_features,
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -1754,19 +1754,19 @@ prepare_data_gaussian_new_v5_rnorm_cpp_v2 <- function(internal, index_features,
dt = as.data.table(do.call(rbind, result_list))
setnames(dt, feature_names)
- dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)]
+ dt[, "id_coalition" := rep(seq(nrow(S)), each = n_samples * n_explain)]
dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -1788,7 +1788,7 @@ prepare_data_gaussian_new_v5_rnorm_cpp_fix_large_mat <- function(internal, index
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -1807,19 +1807,19 @@ prepare_data_gaussian_new_v5_rnorm_cpp_fix_large_mat <- function(internal, index
cov_mat = cov_mat)
)
setnames(dt, feature_names)
- dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)]
+ dt[, "id_coalition" := rep(seq(nrow(S)), each = n_samples * n_explain)]
dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -1841,7 +1841,7 @@ prepare_data_gaussian_new_v5_rnorm_cpp_fix_large_mat_v2 <- function(internal, in
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -1860,19 +1860,19 @@ prepare_data_gaussian_new_v5_rnorm_cpp_fix_large_mat_v2 <- function(internal, in
cov_mat = cov_mat)
)
setnames(dt, feature_names)
- dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)]
+ dt[, "id_coalition" := rep(seq(nrow(S)), each = n_samples * n_explain)]
dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -1894,7 +1894,7 @@ prepare_data_gaussian_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices <- function
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -1914,19 +1914,19 @@ prepare_data_gaussian_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices <- function
# Here we first put the inner list together and then the whole thing. Maybe exist another faster way!
dt = as.data.table(do.call(rbind, lapply(result_list, function(inner_list) do.call(rbind, inner_list))))
setnames(dt, feature_names)
- dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)]
+ dt[, "id_coalition" := rep(seq(nrow(S)), each = n_samples * n_explain)]
dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -1948,7 +1948,7 @@ prepare_data_gaussian_new_v5_rnorm_cpp_fix_cube <- function(internal, index_feat
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -1975,19 +1975,19 @@ prepare_data_gaussian_new_v5_rnorm_cpp_fix_cube <- function(internal, index_feat
dim(result_cube) <- c(prod(dims[-2]), dims[2])
dt = as.data.table(result_cube)
setnames(dt, feature_names)
- dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)]
+ dt[, "id_coalition" := rep(seq(nrow(S)), each = n_samples * n_explain)]
dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -2009,8 +2009,8 @@ prepare_data_gaussian_new_v5_rnorm_cpp_fix_cube_v2 <- function(internal, index_f
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
- n_combinations_now <- length(index_features)
+ n_coalitions <- internal$parameters$n_coalitions
+ n_coalitions_now <- length(index_features)
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -2028,22 +2028,22 @@ prepare_data_gaussian_new_v5_rnorm_cpp_fix_cube_v2 <- function(internal, index_f
cov_mat = cov_mat)
# Reshape and convert to data.table
- dim(dt) = c(n_combinations_now*n_explain*n_samples, n_features)
+ dim(dt) = c(n_coalitions_now*n_explain*n_samples, n_features)
print(system.time({dt = as.data.table(dt)}, gcFirst = FALSE))
setnames(dt, feature_names)
- dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)]
+ dt[, "id_coalition" := rep(seq(nrow(S)), each = n_samples * n_explain)]
dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -2065,7 +2065,7 @@ prepare_data_gaussian_new_v5_rnorm_cpp_fix_std_list <- function(internal, index_
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
@@ -2090,19 +2090,19 @@ prepare_data_gaussian_new_v5_rnorm_cpp_fix_std_list <- function(internal, index_
# Here we bind the list of matrices together. There may be a faster way!
dt = as.data.table(do.call(rbind, result_list))
setnames(dt, feature_names)
- dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)]
+ dt[, "id_coalition" := rep(seq(nrow(S)), each = n_samples * n_explain)]
dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))]
- data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+ data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -2126,19 +2126,19 @@ prepare_data_gaussian_new_v6 <- function(internal, index_features, ...) {
n_features <- internal$parameters$n_features
n_samples <- internal$parameters$n_samples
feature_names <- internal$parameters$feature_names
- n_combinations <- internal$parameters$n_combinations
+ n_coalitions <- internal$parameters$n_coalitions
# Extract the relevant coalitions specified in `index_features` from `S`.
# This will always be called as `index_features` is never NULL.
S <- if (!is.null(index_features)) S[index_features, , drop = FALSE]
- n_combinations_in_this_batch <- nrow(S)
+ n_coalitions_in_this_batch <- nrow(S)
# Allocate an empty matrix used in mvnfast:::rmvnCpp to store the generated MC samples.
- B <- matrix(nrow = n_samples * n_combinations_in_this_batch, ncol = n_features)
+ B <- matrix(nrow = n_samples * n_coalitions_in_this_batch, ncol = n_features)
class(B) <- "numeric"
.Call("rmvnCpp",
- n_ = n_samples * n_combinations_in_this_batch,
+ n_ = n_samples * n_coalitions_in_this_batch,
mu_ = rep(0, n_features),
sigma_ = diag(n_features),
ncores_ = 1,
@@ -2148,7 +2148,7 @@ prepare_data_gaussian_new_v6 <- function(internal, index_features, ...) {
)
# Indices of the start for the combinations
- B_indices <- n_samples * (seq(0, n_combinations_in_this_batch)) + 1
+ B_indices <- n_samples * (seq(0, n_coalitions_in_this_batch)) + 1
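# A worked example of this indexing (added for clarity): with n_samples = 100 and
# 2 coalitions in the batch, B_indices = c(1, 101, 201), i.e. the first row of
# each coalition's block of samples in B, plus a trailing sentinel marking the
# end of the last block.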
# Generate a data table containing all Monte Carlo samples for all test observations and coalitions
dt <- data.table::rbindlist(
@@ -2221,18 +2221,18 @@ prepare_data_gaussian_new_v6 <- function(internal, index_features, ...) {
)
}
),
- idcol = "id_combination"
+ idcol = "id_coalition"
)
- # Update the id_combination. This will always be called as `index_features` is never NULL.
- if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]]
+ # Update the id_coalition. This will always be called as `index_features` is never NULL.
+ if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
# Add uniform weights
dt[, w := 1 / n_samples]
# Remove:
# This is not needed when we assume that the empty and grand coalitions will never be present
- # dt[id_combination %in% c(1, n_combinations), w := 1]
+ # dt[id_coalition %in% c(1, n_coalitions), w := 1]
# Return the MC samples
return(dt)
@@ -2298,7 +2298,7 @@ prepare_data_gaussian_new_v6 <- function(internal, index_features, ...) {
predict_model <- NULL
get_model_specs <- NULL
timing <- TRUE
- n_combinations <- NULL
+ n_coalitions <- NULL
group <- NULL
feature_specs <- get_feature_specs(get_model_specs, model)
n_batches <- 1
@@ -2309,7 +2309,7 @@ prepare_data_gaussian_new_v6 <- function(internal, index_features, ...) {
x_explain = x_explain,
approach = approach,
prediction_zero = prediction_zero,
- n_combinations = n_combinations,
+ n_coalitions = n_coalitions,
group = group,
n_samples = n_samples,
n_batches = n_batches,
@@ -2688,25 +2688,25 @@ rbind(one_coalition_time_old,
one_coalition_time_new_v6)
internal$objects$S[internal$objects$S_batch$`1`[look_at_coalition], , drop = FALSE]
-means_old <- one_coalition_res_old[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_old2 <- one_coalition_res_old2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v1 <- one_coalition_res_new_v1[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v2 <- one_coalition_res_new_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v3 <- one_coalition_res_new_v3[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v4 <- one_coalition_res_new_v4[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v5 <- one_coalition_res_new_v5[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v5_rnorm <- one_coalition_res_new_v5_rnorm[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v5_rnorm_v2 <- one_coalition_res_new_v5_rnorm_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v5_rnorm_cpp <- one_coalition_res_new_v5_rnorm_cpp[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v5_rnorm_cpp_with_wrap <- one_coalition_res_new_v5_rnorm_cpp_with_wrap[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v5_rnorm_cpp_v2 <- one_coalition_res_new_v5_rnorm_cpp_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v5_rnorm_cpp_fix_large_mat <- one_coalition_res_new_v5_rnorm_cpp_fix_large_mat[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v5_rnorm_cpp_fix_large_mat_v2 <- one_coalition_res_new_v5_rnorm_cpp_fix_large_mat_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v5_rnorm_cpp_fix_cube <- one_coalition_res_new_v5_rnorm_cpp_fix_cube[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v5_rnorm_cpp_fix_cube_v2 <- one_coalition_res_new_v5_rnorm_cpp_fix_cube_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v5_rnorm_cpp_fix_list_of_lists_of_matrices <- one_coalition_res_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v5_rnorm_cpp_fix_std_list <- one_coalition_res_new_v5_rnorm_cpp_fix_std_list[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
-means_v6 <- one_coalition_res_new_v6[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)]
+means_old <- one_coalition_res_old[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_old2 <- one_coalition_res_old2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v1 <- one_coalition_res_new_v1[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v2 <- one_coalition_res_new_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v3 <- one_coalition_res_new_v3[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v4 <- one_coalition_res_new_v4[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v5 <- one_coalition_res_new_v5[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v5_rnorm <- one_coalition_res_new_v5_rnorm[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v5_rnorm_v2 <- one_coalition_res_new_v5_rnorm_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v5_rnorm_cpp <- one_coalition_res_new_v5_rnorm_cpp[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v5_rnorm_cpp_with_wrap <- one_coalition_res_new_v5_rnorm_cpp_with_wrap[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v5_rnorm_cpp_v2 <- one_coalition_res_new_v5_rnorm_cpp_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v5_rnorm_cpp_fix_large_mat <- one_coalition_res_new_v5_rnorm_cpp_fix_large_mat[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v5_rnorm_cpp_fix_large_mat_v2 <- one_coalition_res_new_v5_rnorm_cpp_fix_large_mat_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v5_rnorm_cpp_fix_cube <- one_coalition_res_new_v5_rnorm_cpp_fix_cube[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v5_rnorm_cpp_fix_cube_v2 <- one_coalition_res_new_v5_rnorm_cpp_fix_cube_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v5_rnorm_cpp_fix_list_of_lists_of_matrices <- one_coalition_res_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v5_rnorm_cpp_fix_std_list <- one_coalition_res_new_v5_rnorm_cpp_fix_std_list[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
+means_v6 <- one_coalition_res_new_v6[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_coalition, id)]
# They are all in the same ballpark, so the differences are due to sampling.
# This is supported by the fact that mean_old and mean_old2 use the same old code, and the difference there is the
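All the prepare_data_gaussian_new_* variants benchmarked above share the same post-processing tail. As a reading aid (a sketch, not part of this diff; dt, S, and the parameters are assumed to be as in the functions above), that tail could be factored out as:

add_ids_and_weights <- function(dt, S, n_explain, n_samples, feature_names, index_features) {
  # Label each MC sample with its coalition and explicand id
  dt[, "id_coalition" := rep(seq(nrow(S)), each = n_samples * n_explain)]
  dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))]
  data.table::setcolorder(dt, c("id_coalition", "id", feature_names))
  # Map batch-local coalition ids to global ones (index_features is never NULL here)
  if (!is.null(index_features)) dt[, id_coalition := index_features[id_coalition]]
  dt[, w := 1 / n_samples] # uniform MC weights
  dt
}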
diff --git a/inst/scripts/devel/compare_explain_batch.R b/inst/scripts/devel/compare_explain_batch.R
index cedf257fb..45473a747 100644
--- a/inst/scripts/devel/compare_explain_batch.R
+++ b/inst/scripts/devel/compare_explain_batch.R
@@ -23,7 +23,7 @@ model <- xgboost(
# THIS IS GENERATED FROM MASTER BRANCH
# Prepare the data for explanation
library(shapr)
-explainer <- shapr(x_train, model,n_combinations = 100)
+explainer <- shapr(x_train, model,n_coalitions = 100)
p = mean(y_train)
gauss = explain(x_test, explainer, "gaussian", prediction_zero = p, n_samples = 10000)
emp = explain(x_test, explainer, "empirical", prediction_zero = p, n_samples = 10000)
@@ -40,7 +40,7 @@ detach("package:shapr", unload = TRUE)
devtools::load_all()
nobs = 6
x_test <- as.matrix(Boston[1:nobs, x_var])
-explainer <- shapr(x_train, model,n_combinations = 100)
+explainer <- shapr(x_train, model,n_coalitions = 100)
p = mean(y_train)
gauss = explain(x_test, explainer, "gaussian", prediction_zero = p, n_samples = 10000, n_batches = 1)
emp = explain(x_test, explainer, "empirical", prediction_zero = p, n_samples = 10000, n_batches = 1)
@@ -87,7 +87,7 @@ explain.independence2 <- function(x, explainer, approach, prediction_zero,
prepare_data.independence2 <- function(x, index_features = NULL, ...) {
- id <- id_combination <- w <- NULL # due to NSE notes in R CMD check
+ id <- id_coalition <- w <- NULL # due to NSE notes in R CMD check
if (is.null(index_features)) {
index_features <- x$X[, .I]
@@ -122,7 +122,7 @@ prepare_data.independence2 <- function(x, index_features = NULL, ...) {
# Add keys
dt_l[[i]] <- data.table::as.data.table(dt_p)
data.table::setnames(dt_l[[i]], colnames(x_train))
- dt_l[[i]][, id_combination := index_s]
+ dt_l[[i]][, id_coalition := index_s]
dt_l[[i]][, w := w] # IS THIS NECESSARY?
dt_l[[i]][, id := i]
}
diff --git a/inst/scripts/devel/devel_batch_testing.R b/inst/scripts/devel/devel_batch_testing.R
new file mode 100644
index 000000000..1ad762b98
--- /dev/null
+++ b/inst/scripts/devel/devel_batch_testing.R
@@ -0,0 +1,67 @@
+
+#remotes::install_github("NorskRegnesentral/shapr") # Installs GitHub version of shapr
+
+library(shapr)
+library(data.table)
+library(MASS)
+library(Matrix)
+
+# Just sample some data to work with
+m <- 9
+n_train <- 10000
+n_explain <- 10
+rho_1 <- 0.5
+rho_2 <- 0
+rho_3 <- 0.4
+Sigma_1 <- matrix(rho_1, m/3, m/3) + diag(m/3) * (1 - rho_1)
+Sigma_2 <- matrix(rho_2, m/3, m/3) + diag(m/3) * (1 - rho_2)
+Sigma_3 <- matrix(rho_3, m/3, m/3) + diag(m/3) * (1 - rho_3)
+Sigma <- as.matrix(bdiag(Sigma_1, Sigma_2, Sigma_3))
+mu <- rep(0,m)
+
+set.seed(123)
+
+
+
+x_train <- as.data.table(MASS::mvrnorm(n_train,mu,Sigma))
+x_explain <- as.data.table(MASS::mvrnorm(n_explain,mu,Sigma))
+
+names(x_train) <- paste0("VV",1:m)
+names(x_explain) <- paste0("VV",1:m)
+
+beta <- c(4:1, rep(0, m - 4))
+alpha <- 1
+y_train <- as.vector(alpha + as.matrix(x_train) %*% beta + rnorm(n_train, 0, 1))
+y_explain <- alpha + as.matrix(x_explain) %*% beta + rnorm(n_explain, 0, 1)
+
+xy_train <- cbind(y_train, x_train)
+
+p0 <- mean(y_train)
+
+# We need to pass a model object and a proper prediction function to shapr for it to work, but it can be anything as we don't use it
+model <- lm(y_train ~ ., data = x_train)
+
+### First run proper shapr call on this
+library(progressr)
+library(future)
+# Not necessary; only applies to the explain() call below
+progressr::handlers(global = TRUE) # For progress bars
+#future::plan(multisession, workers = 2) # Parallelized computations
+#future::plan(sequential)
+
+expl <- explain(model = model,
+ x_explain= x_explain,
+ x_train = x_train,
+ approach = "ctree",
+ prediction_zero = p0,
+ n_batches = 100,
+ n_samples = 1000,
+ adaptive = TRUE,
+ print_iter_info = TRUE,
+ print_shapleyres = TRUE)
+
+
+n_combinations <- 5
+max_batch_size <- 10
+min_n_batches <- 10
+
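+# Sketch (an assumption, not from shapr): one way the three quantities above
+# could combine into a batch count, capping the batch size while keeping a
+# minimum number of batches:
+n_batches_guess <- max(min_n_batches, ceiling(n_combinations / max_batch_size))
+n_batches_guess # 10 with the values above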
diff --git a/inst/scripts/devel/devel_convergence_branch.R b/inst/scripts/devel/devel_convergence_branch.R
new file mode 100644
index 000000000..71ee609e3
--- /dev/null
+++ b/inst/scripts/devel/devel_convergence_branch.R
@@ -0,0 +1,148 @@
+library(xgboost)
+#library(shapr)
+
+data("airquality")
+data <- data.table::as.data.table(airquality)
+data <- data[complete.cases(data), ]
+data[,new1 :=sqrt(Wind*Ozone)]
+data[,new2 :=sqrt(Wind*Temp)]
+data[,new3 :=sqrt(Wind*Day)]
+data[,new4 :=sqrt(Wind*Solar.R)]
+data[,new5 :=rnorm(.N)]
+data[,new6 :=rnorm(.N)]
+data[,new7 :=rnorm(.N)]
+
+
+x_var <- c("Solar.R", "Wind", "Temp", "Month","Day","new1","new2","new3","new4","new5")#"new6","new7")
+y_var <- "Ozone"
+
+ind_x_explain <- 1:20
+x_train <- data[-ind_x_explain, ..x_var]
+y_train <- data[-ind_x_explain, get(y_var)]
+x_explain <- data[ind_x_explain, ..x_var]
+
+# Looking at the dependence between the features
+cor(x_train)
+
+# Fitting a basic xgboost model to the training data
+model <- xgboost(
+ data = as.matrix(x_train),
+ label = y_train,
+ nround = 20,
+ verbose = FALSE
+)
+
+# Specifying the phi_0, i.e. the expected prediction without any features
+p0 <- mean(y_train)
+
+# Computing the actual Shapley values with kernelSHAP, accounting for feature dependence using
+# the Gaussian (conditional) distribution approach
+explanation_adaptive <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ approach = "gaussian",
+ max_n_coalitions = 500,
+ prediction_zero = p0,
+ adaptive = TRUE,
+ print_shapleyres = TRUE, # tmp
+ print_iter_info = TRUE, # tmp
+ shapley_reweighting = "on_N"
+)
+
+explanation_adaptive <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ approach = "ctree",
+ n_coalitions = 500,
+ prediction_zero = p0,
+ adaptive = TRUE,
+ print_shapleyres = TRUE, # tmp
+ print_iter_info = TRUE, # tmp
+ shapley_reweighting = "on_N"
+)
+
+
+explanation_nonadaptive <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ approach = "gaussian",
+ n_coalitions = 400,
+ prediction_zero = p0,
+ adaptive = FALSE
+)
+
+
+explanation_adaptive <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ approach = "gaussian",
+ n_coalitions = 500,
+ prediction_zero = p0,
+ adaptive = TRUE,
+ adaptive_arguments = list(initial_n_coalitions=10,convergence_tolerance=0.0001),
+ print_shapleyres = TRUE, # tmp
+ print_iter_info = TRUE, # tmp
+ shapley_reweighting = "on_N"
+)
+
+
+
+
+
+
+
+
+
+
+plot(explanation_adaptive$internal$output$iter_objects$dt_iter_convergence_res$n_current_samples,
+ explanation_adaptive$internal$output$iter_objects$dt_iter_shapley_sd[explain_id==1,Solar.R],type="l")
+sd_full <- explanation_adaptive$internal$output$iter_objects$dt_iter_shapley_sd[explain_id==1][.N,Solar.R]
+n_samples_full <- explanation_adaptive$internal$output$iter_objects$dt_iter_convergence_res[.N,n_current_samples]
+sd_full0 <- sd_full*sqrt(n_samples_full)
+lines(explanation_adaptive$internal$output$iter_objects$dt_iter_convergence_res$n_current_samples,
+ sd_full0/sqrt(explanation_adaptive$internal$output$iter_objects$dt_iter_convergence_res$n_current_samples),type="l",col=2)
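+# The reference line above assumes the Shapley sd shrinks like 1/sqrt(n): it is
+# anchored at the final iterate (sd_full0 = sd_full * sqrt(n_samples_full)) and
+# drawn as sd_full0 / sqrt(n) across the observed sample sizes.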
+
+
+plot(explanation_adaptive$internal$output$iter_objects$dt_iter_convergence_res$n_current_samples,
+ explanation_adaptive$internal$output$iter_objects$dt_iter_convergence_res$estimated_required_samples,type="l",ylim=c(0,4000),lwd=4)
+for(i in 1:20){
+ lines(explanation_adaptive$internal$output$iter_objects$dt_iter_convergence_res$n_current_samples,
+ explanation_adaptive$internal$output$iter_objects$dt_iter_convergence_res[[5+i]],type="l",col=1+i)
+}
+
+
+plot(explanation_adaptive$internal$output$iter_objects$dt_iter_convergence_res$n_current_samples,
+ explanation_adaptive$internal$output$iter_objects$dt_iter_shapley_sd[explain_id==1,Solar.R],type="l",ylim=c(0,2))
+sd_full <- explanation_adaptive$internal$output$iter_objects$dt_iter_shapley_sd[explain_id==1][.N,Solar.R]
+n_samples_full <- explanation_adaptive$internal$output$iter_objects$dt_iter_convergence_res[.N,n_current_samples]
+sd_full0 <- sd_full*sqrt(n_samples_full)
+lines(explanation_adaptive$internal$output$iter_objects$dt_iter_convergence_res$n_current_samples,
+ sd_full0/sqrt(explanation_adaptive$internal$output$iter_objects$dt_iter_convergence_res$n_current_samples),type="l",col=2,lwd=3)
+
+for(i in 1:20){
+ lines(explanation_adaptive$internal$output$iter_objects$dt_iter_convergence_res$n_current_samples,
+ explanation_adaptive$internal$output$iter_objects$dt_iter_shapley_sd[explain_id==i,Solar.R],type="l",col=1+i)
+}
+
+
+
+lines(explanation_adaptive$internal$output$dt_iter_convergence_res$n_current_samples,
+ sd_full0/sqrt(explanation_adaptive$internal$output$dt_iter_convergence_res$n_current_samples),type="l",col=2)
+
+
+plot(explanation_adaptive$internal$output$dt_iter_convergence_res$estimated_required_samples)
+
+explanation_regular <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ approach = "gaussian",
+ n_coalitions = NULL,
+ prediction_zero = p0,
+ adaptive = FALSE
+)
+
diff --git a/inst/scripts/devel/devel_non_exact_grouping.R b/inst/scripts/devel/devel_non_exact_grouping.R
index 02f3196da..bb8cf5d3b 100644
--- a/inst/scripts/devel/devel_non_exact_grouping.R
+++ b/inst/scripts/devel/devel_non_exact_grouping.R
@@ -1,5 +1,5 @@
-### NOTE: THIS DOES NO LONGER WORK AS WE SWITCH TO exact when a large n_combinations is used, but the checks
+### NOTE: THIS NO LONGER WORKS AS WE SWITCH TO exact when a large n_coalitions is used, but the checks
### confirm the code works as intended.
library(xgboost)
@@ -30,7 +30,7 @@ model <- xgboost(
group <- list(A=x_var[1:3],B=x_var[4:5],C=x_var[7],D=x_var[c(6,8)],E=x_var[9])
-explainer1 <- shapr(x_train, model,group = group,n_combinations=10^ 6)
+explainer1 <- shapr(x_train, model,group = group,n_coalitions=10^ 6)
explainer2 <- shapr(x_train, model,group = group)
diff --git a/inst/scripts/devel/devel_verbose.R b/inst/scripts/devel/devel_verbose.R
new file mode 100644
index 000000000..f15bf15ec
--- /dev/null
+++ b/inst/scripts/devel/devel_verbose.R
@@ -0,0 +1,135 @@
+ex <- explain(
+ testing = TRUE,
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "independence",
+ prediction_zero = p0,
+ max_n_coalitions = 30,
+ adaptive_arguments = list(
+ initial_n_coalitions = 6,
+ convergence_tolerance = 0.0005,
+ reduction_factor_vec = rep(10^(-6), 10),
+ max_iter = 8
+ ),
+ adaptive = TRUE,verbose=c("basic","progress")
+)
+
+ex <- explain(
+ testing = TRUE,
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "regression_separate",
+ prediction_zero = p0,
+ max_n_coalitions = 30,
+ adaptive = TRUE,verbose=c("vS_details")
+)
+ex <- explain(
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "regression_separate",
+ prediction_zero = p0,
+ max_n_coalitions = 30,
+ adaptive = TRUE,verbose=c("basic","progress","vS_details"),
+ regression.model = parsnip::decision_tree(tree_depth = hardhat::tune(), engine = "rpart", mode = "regression"),
+ regression.tune_values = dials::grid_regular(dials::tree_depth(), levels = 4),
+ regression.vfold_cv_para = list(v = 5)
+)
+
+ex <- explain(
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "regression_surrogate",
+ prediction_zero = p0,
+ max_n_coalitions = 30,
+ adaptive = FALSE,verbose=c("basic","vS_details"),
+ regression.model = parsnip::decision_tree(tree_depth = hardhat::tune(), engine = "rpart", mode = "regression"),
+ regression.tune_values = dials::grid_regular(dials::tree_depth(), levels = 4),
+ regression.vfold_cv_para = list(v = 5)
+)
+
+
+future::plan("multisession", workers = 4)
+progressr::handlers(global = TRUE)
+
+
+ex <- explain(
+ testing = TRUE,
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "vaeac",
+ prediction_zero = p0,
+ max_n_coalitions = 30,
+ adaptive = FALSE,verbose=c("basic","progress","vS_details"),
+ n_MC_samples = 100,
+ vaeac.epochs = 3
+)
+
+ex2 <- explain(
+ testing = TRUE,
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "vaeac",
+ prediction_zero = p0,
+ max_n_coalitions = 30,
+ adaptive = FALSE,verbose=c("basic","progress","vS_details"),
+ n_MC_samples = 100,
+ vaeac.extra_parameters = list(
+ vaeac.pretrained_vaeac_model = ex$internal$parameters$vaeac
+ )
+)
+
+
+
+vaeac.extra_parameters = list(
+ vaeac.pretrained_vaeac_model = explanation$internal$parameters$vaeac
+)
+
+
+ex <- explain(
+ testing = TRUE,
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "regression_separate",
+ prediction_zero = p0,
+ max_n_coalitions = 30,
+ adaptive = FALSE,verbose=c("basic")
+)
+
+
+ex <- explain(
+ testing = TRUE,
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "empirical",
+ prediction_zero = p0,
+ max_n_coalitions = 30,
+ adaptive_arguments = list(
+ initial_n_coalitions = 6,
+ convergence_tolerance = 0.0005,
+ reduction_factor_vec = rep(10^(-6), 10),
+ max_iter = 8
+ ),
+ adaptive = TRUE,verbose=c("basic","convergence","shapley")
+)
+
+
+explain(
+ testing = TRUE,
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "independence",
+ prediction_zero = p0,
+ adaptive = TRUE,
+ adaptive_arguments = list(n_initial_),
+ verbose = c("basic"),
+ paired_shap_sampling = TRUE
+)
diff --git a/inst/scripts/devel/future_testing.R b/inst/scripts/devel/future_testing.R
new file mode 100644
index 000000000..6d6734f76
--- /dev/null
+++ b/inst/scripts/devel/future_testing.R
@@ -0,0 +1,56 @@
+
+plan(multisession, workers = 5) # Adjust the number of workers as needed
+plan(sequential) # Adjust the number of workers as needed
+
+fun <- function(x) {
+ print(x)
+ if(z==0){
+ if(x==5){
+ Sys.sleep(1)
+ z <<- 100
+ }
+ return(x+z)
+ } else {
+ return(NA)
+ }
+}
+
+z <- 0
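+
+# Minimal usage sketch (added; not in the original scratch): fun reads and
+# mutates the global z, so the result depends on the active plan. Under
+# plan(sequential) the z <<- 100 at x == 5 is seen by later calls (6:10 return
+# NA and z ends up 100 in this session); under plan(multisession) each worker
+# mutates only its own copy of z, so the parent's z stays 0.
+res <- future.apply::future_lapply(1:10, fun)
+z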
+
+
+
+
+plan(multisession, workers = 5)
+plan(multicore, workers = 5)
+
+plan(sequential)
+
+fun2 <- function(x){
+ x^2
+}
+
+
+start <- proc.time()
+for(i in 1:100){
+ future.apply::future_lapply(1:10, fun2)
+}
+print(proc.time()-start)
+#user system elapsed
+#14.985 0.045 20.323
+
+start <- proc.time()
+for(i in 1:10){
+ future.apply::future_lapply(rep(1:10,10), fun2)
+}
+print(proc.time()-start)
+#user system elapsed
+#1.504 0.005 2.009
+
+start <- proc.time()
+aa=future.apply::future_lapply(rep(1:10,100), fun2)
+print(proc.time()-start)
+#user system elapsed
+#0.146 0.000 0.202
+
+
+
diff --git a/inst/scripts/devel/real_data_iterative_kernelshap.R b/inst/scripts/devel/real_data_iterative_kernelshap.R
new file mode 100644
index 000000000..00447ee0b
--- /dev/null
+++ b/inst/scripts/devel/real_data_iterative_kernelshap.R
@@ -0,0 +1,276 @@
+
+### Upcoming generalization:
+
+#1. Use non-linear truth (xgboost or so)
+#2. Even more features
+
+print(Sys.time())
+library(data.table)
+library(shapr)
+library(ranger)
+
+# Give me some credit data set
+gmc <- read.table("/nr/project/stat//BigInsight//Projects//Explanations//Counterfactual_kode//Carla_datasets//GiveMeSomeCredit-training.csv",header=TRUE, sep=",")
+foo <- apply(gmc,1,sum)
+ind <- which(is.na(foo))
+gmc <- gmc[-ind,]
+
+
+nobs <- dim(gmc)[1]
+ind <- sample(x=nobs, size=round(0.75*nobs))
+gmcTrain <- gmc[ind,-1]
+gmcTest <- gmc[-ind,-1]
+gmcTrain <- as.data.table(gmcTrain)
+gmcTest <- as.data.table(gmcTest)
+
+# Convert the integer feature columns (all but the response SeriousDlqin2yrs) to numeric
+int_cols <- setdiff(names(gmcTrain)[sapply(gmcTrain, is.integer)], "SeriousDlqin2yrs")
+gmcTrain[, (int_cols) := lapply(.SD, as.numeric), .SDcols = int_cols]
+gmcTest[, (int_cols) := lapply(.SD, as.numeric), .SDcols = int_cols]
+
+# model <- ranger(SeriousDlqin2yrs ~ ., data = gmcTrain, num.trees = 500, num.threads = 6,
+# verbose = TRUE,
+# probability = FALSE,
+# importance = "impurity",
+# mtry = sqrt(11),
+# seed = 3045)
+library(hmeasure)
+#pred.rf <- predict(model, data = gmcTest)
+#results <- HMeasure(unlist(as.vector(gmcTest[,1])),pred.rf$predictions,threshold=0.15)
+#results$metrics$AUC
+
+y_train = gmcTrain$SeriousDlqin2yrs
+x_train = gmcTrain[,-1]
+y_explain = gmcTest$SeriousDlqin2yrs
+x_explain = gmcTest[,-1]
+
+set.seed(123)
+model <- xgboost(
+ data = as.matrix(x_train),
+ label = y_train,
+ nround = 50,
+ verbose = FALSE,params = list(objective = "binary:logistic")
+)
+pred.xgb <- predict(model, newdata = as.matrix(x_explain))
+results <- HMeasure(as.vector(y_explain),pred.xgb,threshold=0.15)
+results$metrics$AUC
+
+
+set.seed(123)
+
+inds_train = sample(1:nrow(x_train), 9000)
+x_train = x_train[inds_train,]
+y_train = y_train[inds_train]
+
+m = ncol(x_train)
+
+
+p0 <- mean(y_train)
+mu = colMeans(x_train)
+Sigma = cov(x_train)
+
+### First run proper shapr call on this
+
+sim_results_saving_folder = "/nr/project/stat/BigInsight/Projects/Explanations/EffektivShapley/Frida/simuleringsresultater/gmc_data_v3/"#"../effektiv_shapley_output/"
+shapley_reweighting_strategy = "none"
+
+predict_model_xgb <- function(object,newdata){
+ xgboost:::predict.xgb.Booster(object,as.matrix(newdata))
+}
+
+
+preds_explain <- predict_model_xgb(model,x_explain)
+head(order(-preds_explain),50)
+inds_1 <- head(order(-preds_explain),50)
+set.seed(123)
+inds_2 <- sample(which(preds_explain > quantile(preds_explain, 0.9)), 50)
+inds <- c(inds_1, inds_2)
+shapley_threshold_val <- 0.02
+shapley_threshold_prob <- 0.2
+
+source("inst/scripts/devel/iterative_kernelshap_sourcefuncs.R")
+
+testObs_computed_vec <- inds# seq_len(n_explain)
+runres_list <- runcomps_list <- list()
+
+cutoff_feats = colnames(x_train)
+
+run_obj_list <- list()
+for(kk in seq_along(testObs_computed_vec)){
+ testObs_computed <- testObs_computed_vec[kk]
+ full_pred <- predict_model_xgb(model,x_explain)[testObs_computed]
+ shapsum_other_features <- 0
+
+
+ run <- iterative_kshap_func(model,x_explain,x_train,
+ testObs_computed = testObs_computed,
+ cutoff_feats = cutoff_feats,
+ initial_n_coalitions = 50,
+ full_pred = full_pred,
+ shapsum_other_features = shapsum_other_features,
+ p0 = p0,
+ predict_model = predict_model_xgb,
+ shapley_threshold_val = shapley_threshold_val,
+ shapley_threshold_prob = shapley_threshold_prob,
+ approach = approach,
+ shapley_reweighting_strategy = shapley_reweighting_strategy)
+
+ runres_list[[kk]] <- run$kshap_final
+ runcomps_list[[kk]] <- sum(sapply(run$keep_list,"[[","no_computed_combinations"))
+ run_obj_list[[kk]] <- run
+ print(kk)
+ print(Sys.time())
+}
+
+est <- rbindlist(runres_list)
+est[,other_features:=NULL]
+fwrite(est,paste0(sim_results_saving_folder,"iterative_shapley_values_", shapley_reweighting_strategy, ".csv"))
+
+expl_approx <- matrix(0, nrow = length(inds), ncol = m+1)
+expl_approx_obj_list <- list()
+for (i in seq_along(testObs_computed_vec)){
+ expl_approx_obj <- shapr::explain(model = model,
+ x_explain= x_explain[testObs_computed_vec[i],],
+ x_train = x_train,
+ approach = approach,
+ prediction_zero = p0,
+ n_coalitions = runcomps_list[[i]])
+ expl_approx[i,] = unlist(expl_approx_obj$shapley_values)
+ expl_approx_obj_list[[i]] <- expl_approx_obj
+}
+expl_approx <- as.data.table(expl_approx)
+truth <- expl$shapley_values
+
+colnames(expl_approx) <- colnames(truth)
+fwrite(expl_approx,paste0(sim_results_saving_folder,"approx_shapley_values_", shapley_reweighting_strategy, ".csv"))
+
+bias_vec <- colMeans(est-truth)
+rmse_vec <- sqrt(colMeans((est-truth)^2))
+mae_vec <- colMeans(abs(est-truth))
+
+bias_vec_approx <- colMeans(expl_approx-truth)
+rmse_vec_approx <- sqrt(colMeans((expl_approx-truth)^2))
+mae_vec_approx <- colMeans(abs(expl_approx-truth))
+
+save.image(paste0(sim_results_saving_folder, "iterative_kernelshap_lingauss_p12_", shapley_reweighting_strategy, ".RData"))
+
+hist(unlist(runcomps_list),breaks = 20)
+
+summary(unlist(runcomps_list))
+
+
+run$kshap_final
+sum(unlist(run$kshap_final))
+full_pred
+
+print(Sys.time())
+
+# TODO: Need to figure out why the sum is not correct here...
+# If any variables were excluded, I have to add those to the sum to recover the model prediction.
+# for(i in 1:18){
+# print(sum(unlist(run$keep_list[[i]]$kshap_est_dt[,-1]))+run$keep_list[[i]]$shap_it_excluded_features)
+# #print(run$keep_list[[i]]$shap_it_excluded_features)
+# }
+
+# run$kshap_it_est_dt
+
+
+
+# run$kshap_final
+# expl$shapley_values
+
+
+
+
+# kshap_final <- copy(run$kshap_est_dt_list[,-1])
+# setnafill(kshap_final,"locf")
+# kshap_final[.N,] # final estimate
+
+# sum(unlist(kshap_final[.N,]))
+
+# sum(unlist(expl$shapley_values[testObs_computed,]))
+
+
+
+
+
+
+
+
+
+
+# cutoff_feats <- paste0("VV",1:6)
+# testObs_computed <- 5
+
+# full_pred <- predict(model,x_explain)[5]
+# p0 <- mean(y_train)
+# pred_not_to_decompose <- sum(expl$shapley_values[5,VV7:VV9])
+
+
+# run_minor <- iterative_kshap_func(model,x_explain,x_train,
+# testObs_computed = 5,
+# cutoff_feats = cutoff_feats,
+# full_pred = full_pred,
+# pred_not_to_decompose = pred_not_to_decompose,
+# p0 = p0,
+# predict_model = predict.lm,shapley_threshold_val = 0)
+
+
+# aa=run$keep_list[[8]]$dt_vS
+
+# bb=run_minor$keep_list[[6]]$dt_vS
+# setnames(bb,"p_hat_1","p_hat_1_approx")
+
+# cc=merge(aa,bb)
+# cc[,diff:=p_hat_1-p_hat_1_approx]
+
+
+# TODO:
+
+# 1. Run example with gaussian features where the truth is known in advance in a large setting, with e.g. 12 features or so. I want the estimate
+# both for the full 12 features, and for subsets where one is removed.
+# 2.
+
+# Challenges:
+# 1. How to adjust the weights and the sampling routine for subsets S when one already has a set of samples (which is also somewhat biased).
+# 2. That is, we use E[f(x1=x1*,x2,x3=x3*,x4)|x1=x1*] as a proxy for E[f(x1=x1*,x2,x3=x3*,x4)|x1=x1*,x3=x3*],
+# but what about E[f(x1=x1*,x2,x3,x4=x4*)|x1=x1*,x4=x4*]? Should I use that one for
+# E[f(x1=x1*,x2,x3=x3*,x4=x4*)|x1=x1*,x4=x4*]?
+# 3. When I remove a variable (one that matters little), its Shapley value is frozen at its estimate at that point. BUT that value is likely somewhat biased, since the variable is removed the first time it crosses the exclusion
+# thresholds I have set.
+
diff --git a/inst/scripts/devel/real_data_iterative_kernelshap_analyze_results.R b/inst/scripts/devel/real_data_iterative_kernelshap_analyze_results.R
new file mode 100644
index 000000000..bbacbef75
--- /dev/null
+++ b/inst/scripts/devel/real_data_iterative_kernelshap_analyze_results.R
@@ -0,0 +1,135 @@
+library(data.table)
+shapley_reweighting_strategy = "none"
+shapley_threshold_val <- 0.2
+
+
+
+sim_results_folder = "/nr/project/stat/BigInsight/Projects/Explanations/EffektivShapley/Frida/simuleringsresultater/gmc_data_v3/"
+
+load(paste0("/nr/project/stat/BigInsight/Projects/Explanations/EffektivShapley/Frida/simuleringsresultater/gmc_data_v3/iterative_kernelshap_lingauss_p12_", shapley_reweighting_strategy, ".RData"))
+
+
+
+exact_vals = fread(paste0(sim_results_folder,"exact_shapley_values_", shapley_reweighting_strategy, ".csv"))
+# names(exact_vals) <- c("phi0", paste0("VV",1:12))
+iterative_vals = fread(paste0(sim_results_folder,"iterative_shapley_values_", shapley_reweighting_strategy, ".csv"))
+approx_vals = fread(paste0(sim_results_folder,"approx_shapley_values_", shapley_reweighting_strategy, ".csv"))
+
+bias_vec <- colMeans(exact_vals - iterative_vals)
+rmse_vec <- sqrt(colMeans((exact_vals - iterative_vals)^2))
+mae_vec <- colMeans(abs(exact_vals - iterative_vals))
+
+bias_vec_approx <- colMeans(exact_vals - approx_vals)
+rmse_vec_approx <- sqrt(colMeans((exact_vals - approx_vals)^2))
+mae_vec_approx <- colMeans(abs(exact_vals - approx_vals))
+
+treeshap_vals <- as.data.table(predict(model,newdata=as.matrix(x_explain),predcontrib = TRUE))
+setnames(treeshap_vals,"BIAS","none")
+setcolorder(treeshap_vals,"none")
+head(treeshap_vals)
+mae_vec_treeshap <- colMeans(abs(exact_vals - treeshap_vals))
+mean(mae_vec_treeshap[-1])
+
+
+library(ggplot2)
+
+# Create a data frame for the bar plot
+
+# MAE
+df <- data.frame(matrix(0, length(mae_vec)*2, 3))
+colnames(df) <- c("MAE", "approach", "features")
+# rownames(df) <- names(exact_vals)
+df[1:length(exact_vals), 1] <- mae_vec_approx
+df[1:length(exact_vals), 2] <- rep("approx", length(exact_vals))
+df[(length(exact_vals)+1):nrow(df), 1] <- mae_vec
+df[(length(exact_vals)+1):nrow(df), 2] <- rep("iterative", length(exact_vals))
+df[, 3] <- rep(names(exact_vals), 2)
+df <- as.data.table(df)
+df[,features0:=.GRP,by="features"]
+df[,features1:=paste0("VV",features0)]
+df[,features1:=factor(features1,levels=c(paste0("VV",1:11)))]
+
+# Create the bar plot using ggplot
+p <- ggplot(df, aes(x = features1, y = MAE, fill = approach)) +
+ geom_col(position = "dodge")
+ggsave(paste(sim_results_folder, "mae_comparison.png"), plot = p, width = 10, height = 5)
+
+
+runcomps_list
+
+df = data.frame(matrix(0, length(runcomps_list), 1))
+colnames(df) <- c("n_rows")
+df$n_rows <- as.numeric(runcomps_list)
+
+p <- ggplot(df, aes(n_rows)) +
+ geom_histogram()
+ggsave(paste0(sim_results_folder, "n_rows.png"), plot = p,width = 10, height = 5)
+
+
+
+
+
+
+
+
+
+# RMSE
+df <- data.frame(matrix(0, length(rmse_vec)*2, 3))
+colnames(df) <- c("RMSE", "approach", "features")
+# rownames(df) <- names(exact_vals)
+df[1:length(exact_vals), 1] <- rmse_vec_approx
+df[1:length(exact_vals), 2] <- rep("approx", length(exact_vals))
+df[(length(exact_vals)+1):nrow(df), 1] <- rmse_vec
+df[(length(exact_vals)+1):nrow(df), 2] <- rep("iterative", length(exact_vals))
+df[, 3] <- rep(names(exact_vals), 2)
+
+# Create the bar plot using ggplot
+p <- ggplot(df, aes(x = features, y = RMSE, fill = approach)) +
+ geom_col(position = "dodge")
+ggsave(paste(sim_results_folder, "rmse_comparison.png"), plot = p)
+
+
+# Bias
+df <- data.frame(matrix(0, length(bias_vec)*2, 3))
+colnames(df) <- c("abs_bias", "approach", "features")
+# rownames(df) <- names(exact_vals)
+df[1:length(exact_vals), 1] <- abs(bias_vec_approx)
+df[1:length(exact_vals), 2] <- rep("approx", length(exact_vals))
+df[(length(exact_vals)+1):nrow(df), 1] <- abs(bias_vec)
+df[(length(exact_vals)+1):nrow(df), 2] <- rep("iterative", length(exact_vals))
+df[, 3] <- rep(names(exact_vals), 2)
+
+# Create the bar plot using ggplot
+p <- ggplot(df, aes(x = features, y = abs_bias, fill = approach)) +
+ geom_col(position = "dodge")
+ggsave(paste(sim_results_folder, "bias_comparison.png"), plot = p)
+
+# Number of sample used
+runcomps_list
+
+df = data.frame(matrix(0, length(runcomps_list), 1))
+colnames(df) <- c("n_rows")
+df$n_rows <- as.numeric(runcomps_list)
+
+p <- ggplot(df, aes(n_rows)) +
+ geom_histogram()
+ggsave(paste0(sim_results_folder, "n_rows.png"), plot = p)
+
+#### Just looking at the largest predictions
+
+preds <- rowSums(exact_vals)
+
+these <- head(order(-preds),10)
+
+preds[these]-rowSums(iterative_vals)[these]
+
+bias_vec <- colMeans(exact_vals[these] - iterative_vals[these])
+rmse_vec <- sqrt(colMeans((exact_vals[these] - iterative_vals[these])^2))
+mae_vec <- colMeans(abs(exact_vals[these] - iterative_vals[these]))
+
+bias_vec_approx <- colMeans(exact_vals[these] - approx_vals[these])
+rmse_vec_approx <- sqrt(colMeans((exact_vals[these] - approx_vals[these])^2))
+mae_vec_approx <- colMeans(abs(exact_vals[these] - approx_vals[these]))
+
+
+
diff --git a/inst/scripts/devel/same_seed_as_master.R b/inst/scripts/devel/same_seed_as_master.R
index a06469cb2..977c770f7 100644
--- a/inst/scripts/devel/same_seed_as_master.R
+++ b/inst/scripts/devel/same_seed_as_master.R
@@ -20,7 +20,7 @@ model <- xgboost(
)
# THIS IS GENERATED FROM MASTER BRANCH
# Prepare the data for explanation
-explainer <- shapr(x_train, model,n_combinations = 100)
+explainer <- shapr(x_train, model,n_coalitions = 100)
p = mean(y_train)
gauss = explain(x_test, explainer, "gaussian", prediction_zero = p, n_samples = 10000)
emp = explain(x_test, explainer, "empirical", prediction_zero = p, n_samples = 10000)
diff --git a/inst/scripts/devel/simtest_iterative_kernelshap_lingauss_analyze_results.R b/inst/scripts/devel/simtest_iterative_kernelshap_lingauss_analyze_results.R
new file mode 100644
index 000000000..51feb23d2
--- /dev/null
+++ b/inst/scripts/devel/simtest_iterative_kernelshap_lingauss_analyze_results.R
@@ -0,0 +1,88 @@
+library(data.table)
+shapley_reweighting_strategy = "none"
+shapley_threshold_val <- 0.2
+
+
+
+sim_results_folder = "/nr/project/stat/BigInsight/Projects/Explanations/EffektivShapley/Frida/simuleringsresultater/sim_lingauss_v2/"
+
+load(paste0(sim_results_folder,"iterative_kernelshap_",shapley_threshold_val,"_",shapley_reweighting_strategy, ".RData"))
+
+
+exact_vals = fread(paste0(sim_results_folder,"exact_shapley_values_", shapley_threshold_val,"_",shapley_reweighting_strategy, ".csv"))
+names(exact_vals) <- c("phi0", paste0("VV",1:12))
+iterative_vals = fread(paste0(sim_results_folder,"iterative_shapley_values_", shapley_threshold_val,"_",shapley_reweighting_strategy, ".csv"))
+approx_vals = fread(paste0(sim_results_folder,"approx_shapley_values_", shapley_threshold_val,"_",shapley_reweighting_strategy, ".csv"))
+
+bias_vec <- colMeans(exact_vals - iterative_vals)
+rmse_vec <- sqrt(colMeans((exact_vals - iterative_vals)^2))
+mae_vec <- colMeans(abs(exact_vals - iterative_vals))
+
+bias_vec_approx <- colMeans(exact_vals - approx_vals)
+rmse_vec_approx <- sqrt(colMeans((exact_vals - approx_vals)^2))
+mae_vec_approx <- colMeans(abs(exact_vals - approx_vals))
+
+library(ggplot2)
+
+# Create a data frame for the bar plot
+
+# MAE
+df <- data.frame(matrix(0, length(mae_vec)*2, 3))
+colnames(df) <- c("MAE", "approach", "features")
+# rownames(df) <- names(exact_vals)
+df[1:length(exact_vals), 1] <- mae_vec_approx
+df[1:length(exact_vals), 2] <- rep("approx", length(exact_vals))
+df[(length(exact_vals)+1):nrow(df), 1] <- mae_vec
+df[(length(exact_vals)+1):nrow(df), 2] <- rep("iterative", length(exact_vals))
+df[, 3] <- rep(names(exact_vals), 2)
+df <- as.data.table(df)
+df[,features:=factor(features,levels=c("phi0",paste0("VV",1:12)))]
+
+# Create the bar plot using ggplot
+p <- ggplot(df, aes(x = features, y = MAE, fill = approach)) +
+ geom_col(position = "dodge")
+ggsave(paste(sim_results_folder, "mae_comparison.png"), plot = p, width = 10, height = 5)
+
+
+# RMSE
+df <- data.frame(matrix(0, length(rmse_vec)*2, 3))
+colnames(df) <- c("RMSE", "approach", "features")
+# rownames(df) <- names(exact_vals)
+df[1:length(exact_vals), 1] <- rmse_vec_approx
+df[1:length(exact_vals), 2] <- rep("approx", length(exact_vals))
+df[(length(exact_vals)+1):nrow(df), 1] <- rmse_vec
+df[(length(exact_vals)+1):nrow(df), 2] <- rep("iterative", length(exact_vals))
+df[, 3] <- rep(names(exact_vals), 2)
+
+# Create the bar plot using ggplot
+p <- ggplot(df, aes(x = features, y = RMSE, fill = approach)) +
+ geom_col(position = "dodge")
+ggsave(paste(sim_results_folder, "rmse_comparison.png"), plot = p)
+
+
+# Bias
+df <- data.frame(matrix(0, length(bias_vec)*2, 3))
+colnames(df) <- c("abs_bias", "approach", "features")
+# rownames(df) <- names(exact_vals)
+df[1:length(exact_vals), 1] <- abs(bias_vec_approx)
+df[1:length(exact_vals), 2] <- rep("approx", length(exact_vals))
+df[(length(exact_vals)+1):nrow(df), 1] <- abs(bias_vec)
+df[(length(exact_vals)+1):nrow(df), 2] <- rep("iterative", length(exact_vals))
+df[, 3] <- rep(names(exact_vals), 2)
+
+# Create the bar plot using ggplot
+p <- ggplot(df, aes(x = features, y = abs_bias, fill = approach)) +
+ geom_col(position = "dodge")
+ggsave(paste(sim_results_folder, "bias_comparison.png"), plot = p)
+
+# Number of sample used
+runcomps_list
+
+df = data.frame(matrix(0, length(runcomps_list), 1))
+colnames(df) <- c("n_rows")
+df$n_rows <- as.numeric(runcomps_list)
+
+p <- ggplot(df, aes(n_rows)) +
+ geom_histogram()
+ggsave(paste0(sim_results_folder, "n_rows.png"), plot = p)
+
diff --git a/inst/scripts/devel/simtest_iterative_kernelshap_lingauss_v2.R b/inst/scripts/devel/simtest_iterative_kernelshap_lingauss_v2.R
new file mode 100644
index 000000000..bedcbfcd0
--- /dev/null
+++ b/inst/scripts/devel/simtest_iterative_kernelshap_lingauss_v2.R
@@ -0,0 +1,261 @@
+
+### Upcoming generalization:
+
+#1. Use non-linear truth (xgboost or so)
+#2. Even more features
+
+
+library(data.table)
+library(MASS)
+library(Matrix)
+library(shapr)
+library(future)
+library(xgboost)
+
+shapley_threshold_prob <- 0.2
+shapley_threshold_val <- 0.1
+
+m <- 12
+n_train <- 5000
+n_explain <- 100
+rho_1 <- 0.5
+rho_2 <- 0.5
+rho_3 <- 0.5
+rho_4 <- 0
+Sigma_1 <- matrix(rho_1, m/4, m/4) + diag(m/4) * (1 - rho_1)
+Sigma_2 <- matrix(rho_2, m/4, m/4) + diag(m/4) * (1 - rho_2)
+Sigma_3 <- matrix(rho_3, m/4, m/4) + diag(m/4) * (1 - rho_3)
+Sigma_4 <- matrix(rho_4, m/4, m/4) + diag(m/4) * (1 - rho_4)
+
+Sigma <- as.matrix(bdiag(Sigma_1, Sigma_2, Sigma_3, Sigma_4))
+mu <- rep(0,m)
+
+library(corrplot)
+corrplot(Sigma)
+set.seed(123)
+
+
+x_train <- as.data.table(MASS::mvrnorm(n_train,mu,Sigma))
+x_explain <- as.data.table(MASS::mvrnorm(n_explain,mu,Sigma))
+
+names(x_train) <- paste0("VV",1:m)
+names(x_explain) <- paste0("VV",1:m)
+
+
+beta <- c(5:1, rep(0, m - 5))
+alpha <- 1
+y_train <- as.vector(alpha + as.matrix(x_train) %*% beta + rnorm(n_train, 0, 1))
+y_explain <- alpha + as.matrix(x_explain) %*% beta + rnorm(n_explain, 0, 1)
+
+xy_train <- cbind(y_train, x_train)
+
+set.seed(123)
+
+model <- lm(y_train ~ .,data = xy_train)
+
+pred_train <- predict(model, x_train)
+plot(unlist(x_train[,1]),pred_train)
+plot(unlist(x_train[,2]),pred_train)
+plot(unlist(x_train[,3]),pred_train)
+plot(unlist(x_train[,4]),pred_train)
+plot(unlist(x_train[,5]),pred_train)
+plot(unlist(x_train[,6]),pred_train)
+
+this_order <- order(unlist(x_train[,1]))
+
+plot(unlist(x_train[this_order,1]),pred_train[this_order],type="l")
+
+p0 <- mean(y_train)
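+
+# Sketch (added; not in the original script): with a linear model and jointly
+# Gaussian features, the conditional expectations behind v(S) have the closed
+# form E[x_{-S} | x_S = x*] = mu_{-S} + Sigma_{-S,S} Sigma_{S,S}^{-1} (x*_S - mu_S),
+# which is why the "gaussian" approach with the true mu and Sigma is used to
+# compute the reference values below. A direct implementation of that formula:
+cond_mean_gaussian <- function(S, x_star, mu, Sigma) {
+  Sbar <- setdiff(seq_along(mu), S)
+  x_new <- x_star
+  # Conditional mean of the unconditioned features given x_S = x_star[S]
+  x_new[Sbar] <- mu[Sbar] + Sigma[Sbar, S, drop = FALSE] %*%
+    solve(Sigma[S, S, drop = FALSE], x_star[S] - mu[S])
+  x_new
+}
+# e.g. cond_mean_gaussian(S = 1:3, x_star = unlist(x_explain[1, ]), mu = mu, Sigma = Sigma)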
+
+
+### First run proper shapr call on this
+
+sim_results_saving_folder = "/nr/project/stat/BigInsight/Projects/Explanations/EffektivShapley/Frida/simuleringsresultater/sim_lingauss_v2/"#"../effektiv_shapley_output/"
+shapley_reweighting_strategy = "none"
+
+set.seed(465132)
+inds = 1:n_explain
+progressr::handlers(global = TRUE)
+expl <- shapr::explain(model = model,
+ x_explain= x_explain[inds,],
+ x_train = x_train,
+ approach = "gaussian",
+ prediction_zero = p0,Sigma=Sigma,mu=mu)
+
+fwrite(expl$shapley_values,paste0(sim_results_saving_folder,"exact_shapley_values_",shapley_threshold_val,"_",shapley_reweighting_strategy, ".csv"))
+
+
+cutoff_feats <- paste0("VV",1:12)
+
+
+### Need to create an lm analogue to pred_mod_xgb here
+
+
+set.seed(123)
+
+
+
+# These are the parameters for iterative_kshap_func
+n_samples <- 1000
+approach = "gaussian"
+
+# Reduce if < 10% prob of shapval > 0.2
+
+source("inst/scripts/devel/iterative_kernelshap_sourcefuncs.R")
+
+testObs_computed_vec <- inds# seq_len(n_explain)
+
+# Using threshold: 0.1
+runres_list <- runcomps_list <- list()
+for(kk in testObs_computed_vec){
+ testObs_computed <- testObs_computed_vec[kk]
+ full_pred <- predict(model,x_explain)[testObs_computed]
+ shapsum_other_features <- 0
+
+
+ run <- iterative_kshap_func(model,x_explain,x_train,
+ testObs_computed = testObs_computed,
+ cutoff_feats = cutoff_feats,
+ initial_n_coalitions = 50,
+ full_pred = full_pred,
+ shapsum_other_features = shapsum_other_features,
+ p0 = p0,
+ predict_model = predict.lm,
+ shapley_threshold_val = shapley_threshold_val,
+ shapley_threshold_prob = shapley_threshold_prob,
+ approach = approach,
+ n_samples = n_samples,
+ gaussian.mu = mu,
+ gaussian.cov_mat = Sigma,
+ shapley_reweighting_strategy = shapley_reweighting_strategy)
+ runres_list[[kk]] <- run$kshap_final
+ runcomps_list[[kk]] <- sum(sapply(run$keep_list,"[[","no_computed_combinations"))
+ print(kk)
+}
+
+est <- rbindlist(runres_list)
+est[,other_features:=NULL]
+fwrite(est,paste0(sim_results_saving_folder,"iterative_shapley_values_",shapley_threshold_val,"_",shapley_reweighting_strategy, ".csv"))
+
+
+
+
+truth <- expl$shapley_values
+
+expl_approx <- matrix(0, nrow = length(inds), ncol = m+1)
+expl_approx_obj_list <- list()
+for (i in testObs_computed_vec){
+ expl_approx_obj <- shapr::explain(model = model,
+ x_explain= x_explain[inds[i],],
+ x_train = x_train,
+ approach = "gaussian",
+ prediction_zero = p0,
+ n_coalitions = runcomps_list[[i]],
+ Sigma=Sigma,mu=mu)
+ expl_approx[i,] = unlist(expl_approx_obj$shapley_values)
+ expl_approx_obj_list[[i]] <- expl_approx_obj
+}
+expl_approx <- as.data.table(expl_approx)
+colnames(expl_approx) <- colnames(truth)
+fwrite(expl_approx,paste0(sim_results_saving_folder,"approx_shapley_values_",shapley_threshold_val,"_",shapley_reweighting_strategy, ".csv"))
+
+bias_vec <- colMeans(est-truth)
+rmse_vec <- sqrt(colMeans((est-truth)^2))
+mae_vec <- colMeans(abs(est-truth))
+
+bias_vec_approx <- colMeans(expl_approx-truth)
+rmse_vec_approx <- sqrt(colMeans((expl_approx-truth)^2))
+mae_vec_approx <- colMeans(abs(expl_approx-truth))
+
+save.image(paste0(sim_results_saving_folder, "iterative_kernelshap_",shapley_threshold_val,"_",shapley_reweighting_strategy, ".RData"))
+
+hist(unlist(runcomps_list),breaks = 20)
+
+summary(unlist(runcomps_list))
+
+
+run$kshap_final
+sum(unlist(run$kshap_final))
+full_pred
+
+
+
+
+
+
+
+
+# TODO: Need to figure out why the sum is not correct here...
+# If any variables were excluded, I have to add those to the sum to recover the model prediction.
+# for(i in 1:18){
+# print(sum(unlist(run$keep_list[[i]]$kshap_est_dt[,-1]))+run$keep_list[[i]]$shap_it_excluded_features)
+# #print(run$keep_list[[i]]$shap_it_excluded_features)
+# }
+
+# run$kshap_it_est_dt
+
+
+
+# run$kshap_final
+# expl$shapley_values
+
+
+
+
+# kshap_final <- copy(run$kshap_est_dt_list[,-1])
+# setnafill(kshap_final,"locf")
+# kshap_final[.N,] # final estimate
+
+# sum(unlist(kshap_final[.N,]))
+
+# sum(unlist(expl$shapley_values[testObs_computed,]))
+
+
+
+
+
+
+
+
+
+
+# cutoff_feats <- paste0("VV",1:6)
+# testObs_computed <- 5
+
+# full_pred <- predict(model,x_explain)[5]
+# p0 <- mean(y_train)
+# pred_not_to_decompose <- sum(expl$shapley_values[5,VV7:VV9])
+
+
+# run_minor <- iterative_kshap_func(model,x_explain,x_train,
+# testObs_computed = 5,
+# cutoff_feats = cutoff_feats,
+# full_pred = full_pred,
+# pred_not_to_decompose = pred_not_to_decompose,
+# p0 = p0,
+# predict_model = predict.lm,shapley_threshold_val = 0)
+
+
+# aa=run$keep_list[[8]]$dt_vS
+
+# bb=run_minor$keep_list[[6]]$dt_vS
+# setnames(bb,"p_hat_1","p_hat_1_approx")
+
+# cc=merge(aa,bb)
+# cc[,diff:=p_hat_1-p_hat_1_approx]
+
+
+# TODO:
+
+# 1. Run example with gaussian features where the truth is known in advance in a large setting, with e.g. 12 features or so. I want the estimate
+# both for the full 12 features, and for subsets where one is removed.
+# 2.
+
+# Challenges:
+# 1. How to adjust the weights and the sampling routine for subsets S when one already has a set of samples (which is also somewhat biased).
+# 2. That is, we use E[f(x1=x1*,x2,x3=x3*,x4)|x1=x1*] as a proxy for E[f(x1=x1*,x2,x3=x3*,x4)|x1=x1*,x3=x3*],
+# but what about E[f(x1=x1*,x2,x3,x4=x4*)|x1=x1*,x4=x4*]? Should I use that one for
+# E[f(x1=x1*,x2,x3=x3*,x4=x4*)|x1=x1*,x4=x4*]?
+# 3. When I remove a variable (one that matters little), its Shapley value is frozen at its estimate at that point. BUT that value is likely somewhat biased, since the variable is removed the first time it crosses the exclusion
+# thresholds I have set.
+
diff --git a/inst/scripts/devel/simtest_iterative_kernelshap_nonlingauss_analyze_results.R b/inst/scripts/devel/simtest_iterative_kernelshap_nonlingauss_analyze_results.R
new file mode 100644
index 000000000..cc994057e
--- /dev/null
+++ b/inst/scripts/devel/simtest_iterative_kernelshap_nonlingauss_analyze_results.R
@@ -0,0 +1,122 @@
+library(data.table)
+shapley_reweighting_strategy = "none"
+shapley_threshold_val <- 0.2
+
+
+
+sim_results_folder = "/nr/project/stat/BigInsight/Projects/Explanations/EffektivShapley/Frida/simuleringsresultater/sim_nonlingauss_v2/"
+
+load(paste0(sim_results_folder,"iterative_kernelshap_",shapley_threshold_val,"_",shapley_reweighting_strategy, ".RData"))
+
+
+exact_vals = fread(paste0(sim_results_folder,"exact_shapley_values_", shapley_threshold_val,"_",shapley_reweighting_strategy, ".csv"))
+names(exact_vals) <- c("phi0", paste0("VV",1:12))
+iterative_vals = fread(paste0(sim_results_folder,"iterative_shapley_values_", shapley_threshold_val,"_",shapley_reweighting_strategy, ".csv"))
+approx_vals = fread(paste0(sim_results_folder,"approx_shapley_values_", shapley_threshold_val,"_",shapley_reweighting_strategy, ".csv"))
+
+bias_vec <- colMeans(exact_vals - iterative_vals)
+rmse_vec <- sqrt(colMeans((exact_vals - iterative_vals)^2))
+mae_vec <- colMeans(abs(exact_vals - iterative_vals))
+
+bias_vec_approx <- colMeans(exact_vals - approx_vals)
+rmse_vec_approx <- sqrt(colMeans((exact_vals - approx_vals)^2))
+mae_vec_approx <- colMeans(abs(exact_vals - approx_vals))
+
+mean(mae_vec[-1])
+mean(mae_vec_approx[-1])
+
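+# TreeSHAP values directly from xgboost (predcontrib = TRUE); the BIAS column holds phi0 and is moved first to align with exact_vals.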
+treeshap_vals <- as.data.table(predict(model,newdata=as.matrix(x_explain),predcontrib = TRUE))
+setnames(treeshap_vals,"BIAS","none")
+setcolorder(treeshap_vals,"none")
+head(treeshap_vals)
+mae_vec_treeshap <- colMeans(abs(exact_vals - treeshap_vals))
+mean(mae_vec_treeshap[-1])
+
+library(ggplot2)
+
+# Create a data frame for the bar plot
+
+# MAE
+df <- data.frame(matrix(0, length(mae_vec)*2, 3))
+colnames(df) <- c("MAE", "approach", "features")
+# rownames(df) <- names(exact_vals)
+df[1:length(exact_vals), 1] <- mae_vec_approx
+df[1:length(exact_vals), 2] <- rep("approx", length(exact_vals))
+df[(length(exact_vals)+1):nrow(df), 1] <- mae_vec
+df[(length(exact_vals)+1):nrow(df), 2] <- rep("iterative", length(exact_vals))
+df[, 3] <- rep(names(exact_vals), 2)
+df <- as.data.table(df)
+dt_treeshap <- data.frame(MAE=mae_vec_treeshap,approach="TreeSHAP",features=names(mae_vec_treeshap))
+df <- rbind(df,dt_treeshap)
+
+df[,features:=factor(features,levels=c("phi0",paste0("VV",1:12)))]
+
+# Create the bar plot using ggplot
+p <- ggplot(df, aes(x = features, y = MAE, fill = approach)) +
+ geom_col(position = "dodge")
+ggsave(paste0(sim_results_folder, "mae_comparison.png"), plot = p, width = 10, height = 5)
+
+
+
+
+
+
+# RMSE
+df <- data.frame(matrix(0, length(rmse_vec)*2, 3))
+colnames(df) <- c("RMSE", "approach", "features")
+# rownames(df) <- names(exact_vals)
+df[1:length(exact_vals), 1] <- rmse_vec_approx
+df[1:length(exact_vals), 2] <- rep("approx", length(exact_vals))
+df[(length(exact_vals)+1):nrow(df), 1] <- rmse_vec
+df[(length(exact_vals)+1):nrow(df), 2] <- rep("iterative", length(exact_vals))
+df[, 3] <- rep(names(exact_vals), 2)
+df <- as.data.table(df)
+df[,features:=factor(features,levels=c("phi0",paste0("VV",1:12)))]
+
+
+
+
+
+
+
+# Create the bar plot using ggplot
+p <- ggplot(df, aes(x = features, y = RMSE, fill = approach)) +
+ geom_col(position = "dodge")
+ggsave(paste(sim_results_folder, "rmse_comparison.png"), plot = p)
+
+
+# Bias
+df <- data.frame(matrix(0, length(bias_vec)*2, 3))
+colnames(df) <- c("abs_bias", "approach", "features")
+# rownames(df) <- names(exact_vals)
+df[1:length(exact_vals), 1] <- abs(bias_vec_approx)
+df[1:length(exact_vals), 2] <- rep("approx", length(exact_vals))
+df[(length(exact_vals)+1):nrow(df), 1] <- abs(bias_vec)
+df[(length(exact_vals)+1):nrow(df), 2] <- rep("iterative", length(exact_vals))
+df[, 3] <- rep(names(exact_vals), 2)
+df <- as.data.table(df)
+df[,features:=factor(features,levels=c("phi0",paste0("VV",1:12)))]
+
+# Create the bar plot using ggplot
+p <- ggplot(df, aes(x = features, y = abs_bias, fill = approach)) +
+  geom_col(position = "dodge")
+ggsave(paste0(sim_results_folder, "bias_comparison.png"), plot = p, width = 10, height = 5)
+
+
+# Number of samples used
+runcomps_list
+
+df = data.frame(matrix(0, length(runcomps_list), 1))
+colnames(df) <- c("n_rows")
+df$n_rows <- as.numeric(runcomps_list)
+
+p <- ggplot(df, aes(n_rows)) +
+ geom_histogram()
+ggsave(paste0(sim_results_folder, "n_rows.png"), plot = p)
+
diff --git a/inst/scripts/devel/simtest_reweighting_strategies.R b/inst/scripts/devel/simtest_reweighting_strategies.R
new file mode 100644
index 000000000..b1356faa6
--- /dev/null
+++ b/inst/scripts/devel/simtest_reweighting_strategies.R
@@ -0,0 +1,263 @@
+### Upcoming generalization:
+
+#1. Use non-linear truth (xgboost or so)
+#2. Even more features
+
+
+library(data.table)
+library(MASS)
+library(Matrix)
+library(shapr)
+library(future)
+library(xgboost)
+
+m <- 12
+n_train <- 5000
+n_explain <- 5
+rho_1 <- 0.9
+rho_2 <- 0.6
+rho_3 <- 0.3
+rho_4 <- 0.1
+Sigma_1 <- matrix(rho_1, m/4, m/4) + diag(m/4) * (1 - rho_1)
+Sigma_2 <- matrix(rho_2, m/4, m/4) + diag(m/4) * (1 - rho_2)
+Sigma_3 <- matrix(rho_3, m/4, m/4) + diag(m/4) * (1 - rho_3)
+Sigma_4 <- matrix(rho_4, m/4, m/4) + diag(m/4) * (1 - rho_4)
+
+Sigma <- as.matrix(bdiag(Sigma_1, Sigma_2, Sigma_3, Sigma_4))
+mu <- rep(0,m)
+
+set.seed(123)
+
+
+x_train <- as.data.table(MASS::mvrnorm(n_train,mu,Sigma))
+x_explain <- as.data.table(MASS::mvrnorm(n_explain,mu,Sigma))
+
+names(x_train) <- paste0("VV",1:m)
+names(x_explain) <- paste0("VV",1:m)
+
+
+beta <- rnorm(m)
+alpha <- 1
+y_train <- as.vector(alpha + as.matrix(x_train) %*% beta + rnorm(n_train, 0, 1))
+y_explain <- alpha + as.matrix(x_explain) %*% beta + rnorm(n_explain, 0, 1)
+
+xy_train <- cbind(y_train, x_train)
+
+set.seed(123)
+
+model <- lm(y_train ~ .,data = xy_train)
+
+p0 <- mean(y_train)
+
+
+### First run proper shapr call on this
+
+shapley_reweighting_strategy = "none"
+
+set.seed(465132)
+progressr::handlers(global = TRUE)
+expl <- shapr::explain(model = model,
+ x_explain= x_explain,
+ x_train = x_train,
+ approach = "gaussian",
+ n_batches=100,n_samples = 10000,
+ prediction_zero = p0,Sigma=Sigma,mu=mu)
+
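+# Map each coalition (by its string representation) to its exact v(S), so the sampled coalition subsets below can be matched against the exact values.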
+dt_vS_map <- merge(expl$internal$iter_list[[1]]$coalition_map,expl$internal$output$dt_vS,by="id_coalition")[,-"id_coalition"]
+
+
+shapley_reweighting_strategy_vec <- c("none","on_N","on_coal_size","on_all","on_all_cond")
+
+n_coalitions_vec <- c(50,100,200,400,800,1200,1600,2000,2400,2800,3200,3600,4000)
+
+reps <- 100
+
+paired_shap_sampling_vec <- c(FALSE,TRUE)
+
+res_list <- list()
+
+for(i0 in seq_along(paired_shap_sampling_vec)){
+
+ for(i in seq_len(reps)){
+
+ for(ii in seq_along(n_coalitions_vec)){
+
+ this_seed <- 1+i
+ this_n_coalitions <- n_coalitions_vec[ii]
+ this_paired_shap_sampling <- paired_shap_sampling_vec[i0]
+
+ this <- shapr::explain(model = model,
+ x_explain= x_explain,
+ x_train = x_train,
+ approach = "gaussian",
+ n_samples = 10, # Never used
+ n_batches=10,
+ prediction_zero = p0,
+ Sigma=Sigma,
+ mu=mu,
+ seed = this_seed,
+ max_n_coalitions = this_n_coalitions,
+ shapley_reweighting = "none",
+ paired_shap_sampling = this_paired_shap_sampling)
+
+ this0_X <- this$internal$objects$X
+
+
+ exact_dt_vS <- merge(this$internal$iter_list[[1]]$coalition_map,dt_vS_map,by="coalitions_str")
+ setorder(exact_dt_vS,id_coalition)
+
+
+ for(iii in seq_along(shapley_reweighting_strategy_vec)){
+ this_shapley_reweighting_strategy <- shapley_reweighting_strategy_vec[iii]
+
+ this_X <- copy(this0_X)
+
+ shapr:::shapley_reweighting(this_X,reweight=this_shapley_reweighting_strategy)
+
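+ # weight_matrix() returns the kernelSHAP projection matrix W under the reweighted
+ # sampling frequencies, so the Shapley values below are simply phi = W %*% v(S).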
+ this_W <- weight_matrix(
+ X = this_X,
+ normalize_W_weights = TRUE
+ )
+
+ shap_dt0 <- as.data.table(cbind(seq_len(n_explain),t(this_W%*%as.matrix(exact_dt_vS[,-c("coalitions_str","id_coalition")]))))
+ names(shap_dt0) <- names(this$shapley_values)
+
+ this_diff <- unlist(shap_dt0[,-c(1,2)]-expl$shapley_values[,-c(1,2)])
+ this_bias <- mean(this_diff)
+ this_var <- var(this_diff)
+ this_MAE <- mean(abs(this_diff))
+ this_RMSE <- sqrt(mean(this_diff^2))
+
+ res_vec <- data.table(n_coalitions = this_n_coalitions,
+ paired_shap_sampling = this_paired_shap_sampling,
+ shapley_reweighting_strategy = this_shapley_reweighting_strategy,
+ seed = this_seed,
+ bias=this_bias,
+ var = this_var,
+ MAE = this_MAE,
+ RMSE = this_RMSE)
+
+ res_list[[length(res_list)+1]] <- copy(res_vec)
+
+ }
+
+ }
+
+ print(i)
+
+ }
+
+}
+
+
+res_dt <- rbindlist(res_list)
+
+fwrite(res_dt,file = "../../Div/extra_shapr_scripts_etc/res_dt_reweighting_sims_lingaus.csv")
+
+resres <- res_dt[,lapply(.SD,mean),.SDcols=c("bias","var","MAE","RMSE"),by=.(paired_shap_sampling,n_coalitions,shapley_reweighting_strategy)]
+
+library(ggplot2)
+
+ggplot(resres[paired_shap_sampling==TRUE],aes(x=n_coalitions,y=MAE,col=shapley_reweighting_strategy,linetype= paired_shap_sampling))+
+ geom_line()
+
+
+
+#### OLD ####
+
+### Need to create an lm analogue of pred_mod_xgb here
+
+
+set.seed(123)
+
+
+
+# These are the parameters for iterative_kshap_func
+n_samples <- 1000
+approach = "gaussian"
+
+# Reduce if < 10% prob of shapval > 0.2
+
+source("inst/scripts/devel/iterative_kernelshap_sourcefuncs.R")
+
+testObs_computed_vec <- inds# seq_len(n_explain)
+
+# Using threshold: 0.1
+runres_list <- runcomps_list <- list()
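+# Loop over the explicands: run the iterative kernelSHAP prototype for each, storing the final
+# Shapley estimates and the number of coalitions actually evaluated.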
+for(kk in testObs_computed_vec){
+ testObs_computed <- testObs_computed_vec[kk]
+ full_pred <- predict(model,x_explain)[testObs_computed]
+ shapsum_other_features <- 0
+
+
+ run <- iterative_kshap_func(model,x_explain,x_train,
+ testObs_computed = testObs_computed,
+ cutoff_feats = cutoff_feats,
+ initial_n_combinations = 50,
+ full_pred = full_pred,
+ shapsum_other_features = shapsum_other_features,
+ p0 = p0,
+ predict_model = predict.lm,
+ shapley_threshold_val = shapley_threshold_val,
+ shapley_threshold_prob = shapley_threshold_prob,
+ approach = approach,
+ n_samples = n_samples,
+ gaussian.mu = mu,
+ gaussian.cov_mat = Sigma,
+ shapley_reweighting_strategy = shapley_reweighting_strategy)
+ runres_list[[kk]] <- run$kshap_final
+ runcomps_list[[kk]] <- sum(sapply(run$keep_list,"[[","no_computed_combinations"))
+ print(kk)
+}
+
+est <- rbindlist(runres_list)
+est[,other_features:=NULL]
+fwrite(est,paste0(sim_results_saving_folder,"iterative_shapley_values_",shapley_threshold_val,"_",shapley_reweighting_strategy, ".csv"))
+
+
+
+
+truth <- expl$shapley_values
+
+expl_approx <- matrix(0, nrow = length(inds), ncol = m+1)
+expl_approx_obj_list <- list()
+for (i in testObs_computed_vec){
+ expl_approx_obj <- shapr::explain(model = model,
+ x_explain= x_explain[inds[i],],
+ x_train = x_train,
+ approach = "gaussian",
+ prediction_zero = p0,
+ n_combinations = runcomps_list[[i]],
+ Sigma=Sigma,mu=mu)
+ expl_approx[i,] = unlist(expl_approx_obj$shapley_values)
+ expl_approx_obj_list[[i]] <- expl_approx_obj
+}
+expl_approx <- as.data.table(expl_approx)
+colnames(expl_approx) <- colnames(truth)
+fwrite(expl_approx,paste0(sim_results_saving_folder,"approx_shapley_values_",shapley_threshold_val,"_",shapley_reweighting_strategy, ".csv"))
+
+bias_vec <- colMeans(est-truth)
+rmse_vec <- sqrt(colMeans((est-truth)^2))
+mae_vec <- colMeans(abs(est-truth))
+
+bias_vec_approx <- colMeans(expl_approx-truth)
+rmse_vec_approx <- sqrt(colMeans((expl_approx-truth)^2))
+mae_vec_approx <- colMeans(abs(expl_approx-truth))
+
+save.image(paste0(sim_results_saving_folder, "iterative_kernelshap_",shapley_threshold_val,"_",shapley_reweighting_strategy, ".RData"))
+
+hist(unlist(runcomps_list),breaks = 20)
+
+summary(unlist(runcomps_list))
+
+
+run$kshap_final
+sum(unlist(run$kshap_final))
+full_pred
+
+
+
+
+
+
+
diff --git a/inst/scripts/devel/simtest_reweighting_strategies_nonlinear.R b/inst/scripts/devel/simtest_reweighting_strategies_nonlinear.R
new file mode 100644
index 000000000..5e4d09522
--- /dev/null
+++ b/inst/scripts/devel/simtest_reweighting_strategies_nonlinear.R
@@ -0,0 +1,182 @@
+### Upcoming generalization:
+
+#1. Use non-linear truth (xgboost or so)
+#2. Even more features
+
+
+library(data.table)
+library(MASS)
+library(Matrix)
+library(shapr)
+library(future)
+library(xgboost)
+
+m <- 12
+n_train <- 5000
+n_explain <- 5
+rho_1 <- 0.9
+rho_2 <- 0.6
+rho_3 <- 0.3
+rho_4 <- 0.1
+Sigma_1 <- matrix(rho_1, m/4, m/4) + diag(m/4) * (1 - rho_1)
+Sigma_2 <- matrix(rho_2, m/4, m/4) + diag(m/4) * (1 - rho_2)
+Sigma_3 <- matrix(rho_3, m/4, m/4) + diag(m/4) * (1 - rho_3)
+Sigma_4 <- matrix(rho_4, m/4, m/4) + diag(m/4) * (1 - rho_4)
+
+Sigma <- as.matrix(bdiag(Sigma_1, Sigma_2, Sigma_3, Sigma_4))
+mu <- rep(0,m)
+
+set.seed(123)
+
+
+x_train <- as.data.table(MASS::mvrnorm(n_train,mu,Sigma))
+x_explain <- as.data.table(MASS::mvrnorm(n_explain,mu,Sigma))
+
+names(x_train) <- paste0("VV",1:m)
+names(x_explain) <- paste0("VV",1:m)
+
+
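+# Pairwise interaction term used to make the true regression surface non-linear.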
+g <- function(a,b){
+ a*b+a*b^2+a^2*b
+}
+
+beta <- c(0.2, -0.8, 1.0, 0.5, -0.8, rep(0, m - 5))
+gamma <- c(0.8,-1)
+alpha <- 1
+y_train <- alpha +
+  as.vector(as.matrix(cos(x_train))%*%beta) +
+  unlist(gamma[1]*g(x_train[,1],x_train[,2])) +
+  unlist(gamma[2]*g(x_train[,3],x_train[,4])) +
+  rnorm(n_train, 0, 1)
+y_explain <- alpha +
+  as.vector(as.matrix(cos(x_explain))%*%beta) +
+  unlist(gamma[1]*g(x_explain[,1],x_explain[,2])) +
+  unlist(gamma[2]*g(x_explain[,3],x_explain[,4])) +
+  rnorm(n_explain, 0, 1)
+
+xy_train <- cbind(y_train, x_train)
+
+set.seed(123)
+model <- xgboost(
+ data = as.matrix(x_train),
+ label = y_train,
+  nrounds = 50,
+ verbose = FALSE
+)
+
+p0 <- mean(y_train)
+
+
+### First run proper shapr call on this
+
+shapley_reweighting_strategy = "none"
+
+set.seed(465132)
+progressr::handlers(global = TRUE)
+expl <- shapr::explain(model = model,
+ x_explain= x_explain,
+ x_train = x_train,
+ approach = "gaussian",
+ n_batches=100,n_samples = 10000,
+ prediction_zero = p0,Sigma=Sigma,mu=mu)
+
+dt_vS_map <- merge(expl$internal$iter_list[[1]]$coalition_map,expl$internal$output$dt_vS,by="id_coalition")[,-"id_coalition"]
+
+
+shapley_reweighting_strategy_vec <- c("none","on_N","on_coal_size","on_all","on_all_cond")
+
+n_coalitions_vec <- c(50,100,200,400,800,1200,1600,2000,2400,2800,3200,3600,4000)
+
+reps <- 100
+
+paired_shap_sampling_vec <- c(FALSE,TRUE)
+
+res_list <- list()
+
+for(i0 in seq_along(paired_shap_sampling_vec)){
+
+ for(i in seq_len(reps)){
+
+ for(ii in seq_along(n_coalitions_vec)){
+
+ this_seed <- 1+i
+ this_n_coalitions <- n_coalitions_vec[ii]
+ this_paired_shap_sampling <- paired_shap_sampling_vec[i0]
+
+ this <- shapr::explain(model = model,
+ x_explain= x_explain,
+ x_train = x_train,
+ approach = "gaussian",
+ n_samples = 10, # Never used
+ n_batches=10,
+ prediction_zero = p0,
+ Sigma=Sigma,
+ mu=mu,
+ seed = this_seed,
+ max_n_coalitions = this_n_coalitions,
+ shapley_reweighting = "none",
+ paired_shap_sampling = this_paired_shap_sampling)
+
+ this0_X <- this$internal$objects$X
+
+
+ exact_dt_vS <- merge(this$internal$iter_list[[1]]$coalition_map,dt_vS_map,by="coalitions_str")
+ setorder(exact_dt_vS,id_coalition)
+
+
+ for(iii in seq_along(shapley_reweighting_strategy_vec)){
+ this_shapley_reweighting_strategy <- shapley_reweighting_strategy_vec[iii]
+
+ this_X <- copy(this0_X)
+
+ shapr:::shapley_reweighting(this_X,reweight=this_shapley_reweighting_strategy)
+
+ this_W <- weight_matrix(
+ X = this_X,
+ normalize_W_weights = TRUE
+ )
+
+ shap_dt0 <- as.data.table(cbind(seq_len(n_explain),t(this_W%*%as.matrix(exact_dt_vS[,-c("coalitions_str","id_coalition")]))))
+ names(shap_dt0) <- names(this$shapley_values)
+
+ this_diff <- unlist(shap_dt0[,-c(1,2)]-expl$shapley_values[,-c(1,2)])
+ this_bias <- mean(this_diff)
+ this_var <- var(this_diff)
+ this_MAE <- mean(abs(this_diff))
+ this_RMSE <- sqrt(mean(this_diff^2))
+
+ res_vec <- data.table(n_coalitions = this_n_coalitions,
+ paired_shap_sampling = this_paired_shap_sampling,
+ shapley_reweighting_strategy = this_shapley_reweighting_strategy,
+ seed = this_seed,
+ bias=this_bias,
+ var = this_var,
+ MAE = this_MAE,
+ RMSE = this_RMSE)
+
+ res_list[[length(res_list)+1]] <- copy(res_vec)
+
+ }
+
+ }
+
+ print(i)
+
+ }
+
+}
+
+
+res_dt <- rbindlist(res_list)
+
+fwrite(res_dt,file = "../../Div/extra_shapr_scripts_etc/res_dt_reweighting_sims_nonlingaus.csv")
+
+resres <- res_dt[,lapply(.SD,mean),.SDcols=c("bias","var","MAE","RMSE"),by=.(paired_shap_sampling,n_coalitions,shapley_reweighting_strategy)]
+
+library(ggplot2)
+
+ggplot(resres[paired_shap_sampling==TRUE],aes(x=n_coalitions,y=MAE,col=shapley_reweighting_strategy,linetype= paired_shap_sampling))+
+ geom_line()
+
+ggplot(resres[paired_shap_sampling==FALSE],aes(x=n_coalitions,y=MAE,col=shapley_reweighting_strategy,linetype= paired_shap_sampling))+
+ geom_line()
diff --git a/inst/scripts/devel/simtest_reweighting_strategies_nonlinear_nonunique_sampling.R b/inst/scripts/devel/simtest_reweighting_strategies_nonlinear_nonunique_sampling.R
new file mode 100644
index 000000000..82c573e90
--- /dev/null
+++ b/inst/scripts/devel/simtest_reweighting_strategies_nonlinear_nonunique_sampling.R
@@ -0,0 +1,217 @@
+### Upcoming generalization:
+
+#1. Use non-linear truth (xgboost or so)
+#2. Even more features
+
+
+library(data.table)
+library(MASS)
+library(Matrix)
+library(shapr)
+library(future)
+library(xgboost)
+
+m <- 12
+n_train <- 5000
+n_explain <- 5
+rho_1 <- 0.9
+rho_2 <- 0.6
+rho_3 <- 0.3
+rho_4 <- 0.1
+Sigma_1 <- matrix(rho_1, m/4, m/4) + diag(m/4) * (1 - rho_1)
+Sigma_2 <- matrix(rho_2, m/4, m/4) + diag(m/4) * (1 - rho_2)
+Sigma_3 <- matrix(rho_3, m/4, m/4) + diag(m/4) * (1 - rho_3)
+Sigma_4 <- matrix(rho_4, m/4, m/4) + diag(m/4) * (1 - rho_4)
+
+Sigma <- as.matrix(bdiag(Sigma_1, Sigma_2, Sigma_3, Sigma_4))
+mu <- rep(0,m)
+
+set.seed(123)
+
+
+x_train <- as.data.table(MASS::mvrnorm(n_train,mu,Sigma))
+x_explain <- as.data.table(MASS::mvrnorm(n_explain,mu,Sigma))
+
+names(x_train) <- paste0("VV",1:m)
+names(x_explain) <- paste0("VV",1:m)
+
+
+g <- function(a,b){
+ a*b+a*b^2+a^2*b
+}
+
+beta <- c(0.2, -0.8, 1.0, 0.5, -0.8, rep(0, m - 5))
+gamma <- c(0.8,-1)
+alpha <- 1
+y_train <- alpha +
+  as.vector(as.matrix(cos(x_train))%*%beta) +
+  unlist(gamma[1]*g(x_train[,1],x_train[,2])) +
+  unlist(gamma[2]*g(x_train[,3],x_train[,4])) +
+  rnorm(n_train, 0, 1)
+y_explain <- alpha +
+  as.vector(as.matrix(cos(x_explain))%*%beta) +
+  unlist(gamma[1]*g(x_explain[,1],x_explain[,2])) +
+  unlist(gamma[2]*g(x_explain[,3],x_explain[,4])) +
+  rnorm(n_explain, 0, 1)
+
+xy_train <- cbind(y_train, x_train)
+
+set.seed(123)
+model <- xgboost(
+ data = as.matrix(x_train),
+ label = y_train,
+  nrounds = 50,
+ verbose = FALSE
+)
+
+p0 <- mean(y_train)
+
+
+### First run proper shapr call on this
+
+shapley_reweighting_strategy = "none"
+
+set.seed(465132)
+progressr::handlers(global = TRUE)
+expl <- shapr::explain(model = model,
+ x_explain= x_explain,
+ x_train = x_train,
+ approach = "gaussian",
+ n_batches=100,n_samples = 10000,
+ prediction_zero = p0,Sigma=Sigma,mu=mu)
+
+dt_vS_map <- merge(expl$internal$iter_list[[1]]$coalition_map,expl$internal$output$dt_vS,by="id_coalition")[,-"id_coalition"]
+
+
+shapley_reweighting_strategy_vec <- c("none","on_N","on_coal_size","on_all","on_all_cond","on_all_cond_paired","comb")
+
+n_coalitions_vec <- c(50,100,200,400,800,1200,1600,2000,2400,2800,3200,3600,4000)
+
+reps <- 200
+
+paired_shap_sampling_vec <- c(FALSE,TRUE)
+
+res_list <- weight_list <- list()
+
+for(ii in seq_along(n_coalitions_vec)){
+
+ for(i0 in seq_along(paired_shap_sampling_vec)){
+
+ for(i in seq_len(reps)){
+
+ this_seed <- 10000+1+i
+ this_n_coalitions <- n_coalitions_vec[ii]
+ this_paired_shap_sampling <- paired_shap_sampling_vec[i0]
+
+ this <- shapr::explain(model = model,
+ x_explain= x_explain,
+ x_train = x_train,
+ approach = "gaussian",
+ n_samples = 10, # Never used
+ n_batches=10,
+ prediction_zero = p0,
+ Sigma=Sigma,
+ mu=mu,
+ seed = this_seed,
+ max_n_coalitions = this_n_coalitions,
+ shapley_reweighting = "none",
+ unique_sampling = TRUE,
+ paired_shap_sampling = this_paired_shap_sampling)
+
+ this0_X <- this$internal$objects$X
+
+
+ exact_dt_vS <- merge(this$internal$iter_list[[1]]$coalition_map,dt_vS_map,by="coalitions_str")
+ setorder(exact_dt_vS,id_coalition)
+
+
+ for(iii in seq_along(shapley_reweighting_strategy_vec)){
+ this_shapley_reweighting_strategy <- shapley_reweighting_strategy_vec[iii]
+
+ this_X <- copy(this0_X)
+
+ shapr:::shapley_reweighting(this_X,reweight=this_shapley_reweighting_strategy)
+
+ this_W <- weight_matrix(
+ X = this_X,
+ normalize_W_weights = TRUE
+ )
+
+ shap_dt0 <- as.data.table(cbind(seq_len(n_explain),t(this_W%*%as.matrix(exact_dt_vS[,-c("coalitions_str","id_coalition")]))))
+ names(shap_dt0) <- names(this$shapley_values)
+
+ this_diff <- unlist(shap_dt0[,-c(1,2)]-expl$shapley_values[,-c(1,2)])
+ this_bias <- mean(this_diff)
+ this_var <- var(this_diff)
+ this_MAE <- mean(abs(this_diff))
+ this_RMSE <- sqrt(mean(this_diff^2))
+
+ res_vec <- data.table(n_coalitions = this_n_coalitions,
+ paired_shap_sampling = this_paired_shap_sampling,
+ shapley_reweighting_strategy = this_shapley_reweighting_strategy,
+ seed = this_seed,
+ bias=this_bias,
+ var = this_var,
+ MAE = this_MAE,
+ RMSE = this_RMSE)
+
+ res_list[[length(res_list)+1]] <- copy(res_vec)
+
+ # weight_dt <- unique(this_X[,.(coalition_size,shapley_weight)][,shapley_weight:=mean(shapley_weight),by=coalition_size][])
+ weight_dt <- this_X[,.(coalition_size,shapley_weight)][,head(.SD,1),by=coalition_size]
+
+ weight_dt[,n_coalitions:=this_n_coalitions]
+ weight_dt[,paired_shap_sampling:=this_paired_shap_sampling]
+ weight_dt[,shapley_reweighting_strategy:=this_shapley_reweighting_strategy]
+ weight_dt[,seed:=this_seed]
+
+ weight_list[[length(weight_list)+1]] <- copy(weight_dt)
+
+
+ }
+
+ }
+
+ print(i)
+
+ }
+
+ print(n_coalitions_vec[ii])
+}
+
+
+res_dt <- rbindlist(res_list)
+
+fwrite(res_dt,file = "../../Div/extra_shapr_scripts_etc/res_dt_reweighting_sims_nonlingaus_nonunique_sampling_new.csv")
+
+resres <- res_dt[,lapply(.SD,mean),.SDcols=c("bias","var","MAE","RMSE"),by=.(paired_shap_sampling,n_coalitions,shapley_reweighting_strategy)]
+resres_sd <- res_dt[,lapply(.SD,sd),.SDcols=c("bias","var","MAE","RMSE"),by=.(paired_shap_sampling,n_coalitions,shapley_reweighting_strategy)]
+
+
+library(ggplot2)
+
+ggplot(resres,aes(x=n_coalitions,y=MAE,col=shapley_reweighting_strategy,linetype= paired_shap_sampling))+
+ geom_line()
+
+ggplot(resres[paired_shap_sampling==FALSE],aes(x=n_coalitions,y=MAE,col=shapley_reweighting_strategy,linetype= paired_shap_sampling))+
+ geom_line()+scale_y_log10()
+
+ggplot(resres[paired_shap_sampling==TRUE],aes(x=n_coalitions,y=MAE,col=shapley_reweighting_strategy,linetype= paired_shap_sampling))+
+ geom_line()+scale_y_log10()
+
+
+
+
+weight_dt <- rbindlist(weight_list)
+
+
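+# Normalize the Shapley weights within each run (excluding the empty and grand coalitions), so the
+# mean weight per coalition size can be compared across reweighting strategies below.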
+weight_dt[!(coalition_size%in%c(0,12)),sum_shapley_weight:=sum(shapley_weight),by=.(seed,paired_shap_sampling,n_coalitions,shapley_reweighting_strategy)]
+
+weight_dt[!(coalition_size%in%c(0,12)),shapley_weight:=shapley_weight/sum_shapley_weight]
+weight_dt[!(coalition_size%in%c(0,12)),mean(shapley_weight),by=.(seed,paired_shap_sampling,n_coalitions,shapley_reweighting_strategy)]
+
+
+ww_dt <- weight_dt[!(coalition_size%in%c(0,12)),list(mean_weight=mean(shapley_weight)),by=.(coalition_size,paired_shap_sampling,n_coalitions,shapley_reweighting_strategy)]
+
+ggplot(ww_dt[paired_shap_sampling==TRUE & shapley_reweighting_strategy %in% c("none","on_all_cond_paired","on_N")],aes(x=coalition_size,y=mean_weight,col=shapley_reweighting_strategy))+
+ geom_point()+facet_grid(~n_coalitions)
diff --git a/inst/scripts/devel/simtest_timing_to_Frida.R b/inst/scripts/devel/simtest_timing_to_Frida.R
new file mode 100644
index 000000000..d6366ce16
--- /dev/null
+++ b/inst/scripts/devel/simtest_timing_to_Frida.R
@@ -0,0 +1,107 @@
+library(data.table)
+library(MASS)
+library(Matrix)
+library(shapr)
+library(future)
+library(xgboost)
+
+shapley_threshold_prob <- 0.2
+shapley_threshold_val <- 0.1
+
+m <- 12
+n_train <- 5000
+n_explain <- 100
+rho_1 <- 0.5
+rho_2 <- 0.5
+rho_3 <- 0.5
+rho_4 <- 0
+Sigma_1 <- matrix(rho_1, m/4, m/4) + diag(m/4) * (1 - rho_1)
+Sigma_2 <- matrix(rho_2, m/4, m/4) + diag(m/4) * (1 - rho_2)
+Sigma_3 <- matrix(rho_3, m/4, m/4) + diag(m/4) * (1 - rho_3)
+Sigma_4 <- matrix(rho_4, m/4, m/4) + diag(m/4) * (1 - rho_4)
+
+Sigma <- as.matrix(bdiag(Sigma_1, Sigma_2, Sigma_3, Sigma_4))
+mu <- rep(0,m)
+
+set.seed(123)
+
+
+x_train <- as.data.table(MASS::mvrnorm(n_train,mu,Sigma))
+x_explain <- as.data.table(MASS::mvrnorm(n_explain,mu,Sigma))
+
+names(x_train) <- paste0("VV",1:m)
+names(x_explain) <- paste0("VV",1:m)
+
+
+g <- function(a,b){
+ a*b+a*b^2+a^2*b
+}
+
+beta <- c(0.2, -0.8, 1.0, 0.5, -0.8, rep(0, m - 5))
+gamma <- c(0.8,-1)
+alpha <- 1
+y_train <- alpha +
+  as.vector(as.matrix(cos(x_train))%*%beta) +
+  unlist(gamma[1]*g(x_train[,1],x_train[,2])) +
+  unlist(gamma[2]*g(x_train[,3],x_train[,4])) +
+  rnorm(n_train, 0, 1)
+y_explain <- alpha +
+  as.vector(as.matrix(cos(x_explain))%*%beta) +
+  unlist(gamma[1]*g(x_explain[,1],x_explain[,2])) +
+  unlist(gamma[2]*g(x_explain[,3],x_explain[,4])) +
+  rnorm(n_explain, 0, 1)
+
+xy_train <- cbind(y_train, x_train)
+
+set.seed(123)
+model <- xgboost(
+ data = as.matrix(x_train),
+ label = y_train,
+  nrounds = 50,
+ verbose = FALSE
+)
+
+pred_train <- predict(model, as.matrix(x_train))
+
+this_order <- order(unlist(x_train[,1]))
+
+plot(unlist(x_train[this_order,1]),pred_train[this_order],type="l")
+
+p0 <- mean(y_train)
+
+
+### First run proper shapr call on this
+
+
+set.seed(465132)
+inds <- 1:5 # 1:n_explain
+
+expl <- explain(
+ model = model,
+ x_explain= x_explain[inds,],
+ x_train = x_train,
+ approach = "gaussian",
+ prediction_zero = p0,
+ n_coalitions = 100,
+ Sigma=Sigma,
+ mu=mu,
+ adaptive = TRUE,
+ unique_sampling = FALSE,
+ adaptive_arguments = list(initial_n_coalitions = 50,
+ fixed_n_coalitions_per_iter = 50,
+ max_iter = 10,
+ convergence_tolerance = 10^(-10),
+ compute_sd = TRUE),
+ shapley_reweighting = "none",
+ print_iter_info = TRUE
+)
+
+# Number of (non-unique) coalitions per iteration
+sapply(expl$internal$iter_list,function(dt) dt$X[,sum(sample_freq)])
+
+# Timing of main function call
+expl$timing$main_timing_secs
+
+# Timings per iteration
+expl$timing$iter_timing_secs_dt[]
+
diff --git a/inst/scripts/devel/testing_explain_forevast_n_comb.R b/inst/scripts/devel/testing_explain_forevast_n_comb.R
index 48784a6cf..23517f2c9 100644
--- a/inst/scripts/devel/testing_explain_forevast_n_comb.R
+++ b/inst/scripts/devel/testing_explain_forevast_n_comb.R
@@ -14,7 +14,7 @@ h3test <- explain_forecast(model = model_arima_temp,
n_batches = 1,
timing = FALSE,
seed = i,
- n_combinations = 300
+ n_coalitions = 300
)
h2test <- explain_forecast(model = model_arima_temp,
@@ -31,7 +31,7 @@ h2test <- explain_forecast(model = model_arima_temp,
n_batches = 1,
timing = FALSE,
seed = i,
- n_combinations = 10^7
+ n_coalitions = 10^7
)
h1test <- explain_forecast(model = model_arima_temp,
@@ -48,7 +48,7 @@ h1test <- explain_forecast(model = model_arima_temp,
n_batches = 1,
timing = FALSE,
seed = i,
- n_combinations = 10^7
+ n_coalitions = 10^7
)
w <- h3test$internal$objects$X_list[[1]][["shapley_weight"]]
@@ -127,7 +127,7 @@ for (i in 1:reps){
n_batches = 1,
timing = FALSE,
seed = i,
- n_combinations = ncomb
+ n_coalitions = ncomb
)
h2list[[i]] <- explain_forecast(model = model_arima_temp,
@@ -144,7 +144,7 @@ for (i in 1:reps){
n_batches = 1,
timing = FALSE,
seed = i,
- n_combinations = ncomb
+ n_coalitions = ncomb
)
h1list[[i]] <- explain_forecast(model = model_arima_temp,
@@ -161,7 +161,7 @@ for (i in 1:reps){
n_batches = 1,
timing = FALSE,
seed = i,
- n_combinations = min(ncomb,31)
+ n_coalitions = min(ncomb,31)
)
print(i)
diff --git a/inst/scripts/devel/testing_for_valid_defualt_n_batches.R b/inst/scripts/devel/testing_for_valid_defualt_n_batches.R
index 2c5f3ef09..a9d5739fd 100644
--- a/inst/scripts/devel/testing_for_valid_defualt_n_batches.R
+++ b/inst/scripts/devel/testing_for_valid_defualt_n_batches.R
@@ -1,10 +1,10 @@
# In this code we demonstrate that (before the bugfix) the `explain()` function
-# does not enter the exact mode when n_combinations is larger than or equal to 2^m.
-# The mode is only changed if n_combinations is strictly larger than 2^m.
-# This means that we end up with using all coalitions when n_combinations is 2^m,
+# does not enter the exact mode when n_coalitions is larger than or equal to 2^m.
+# The mode is only changed if n_coalitions is strictly larger than 2^m.
+# This means that we end up using all coalitions when n_coalitions is 2^m,
# but do not use the exact Shapley kernel weights.
# Bugfix replaces `>` with `>=` in the places where the code tests if
-# n_combinations is larger than or equal to 2^m. Then the text/messages printed by
+# n_coalitions is larger than or equal to 2^m. Then the text/messages printed by
# shapr and the code correspond.
library(xgboost)
@@ -34,13 +34,13 @@ model <- xgboost::xgboost(
p0 <- mean(y_train)
# Shapr sets the default number of batches to be 10 for this dataset for the
-# "ctree", "gaussian", and "copula" approaches. Thus, setting `n_combinations`
+# "ctree", "gaussian", and "copula" approaches. Thus, setting `n_coalitions`
+# to any value lower than or equal to 10 causes the error.
any_number_equal_or_below_10 = 8
# Before the bugfix, shapr:::check_n_batches() throws the error:
# Error in check_n_batches(internal) :
-# `n_batches` (10) must be smaller than the number feature combinations/`n_combinations` (8)
+# `n_batches` (10) must be smaller than the number feature combinations/`n_coalitions` (8)
# Bug only occurs for "ctree", "gaussian", and "copula" as they are treated differently in
# `get_default_n_batches()`, I am not certain why. Ask Martin about the logic behind that.
explanation <- explain(
@@ -50,5 +50,5 @@ explanation <- explain(
n_samples = 2, # Low value for fast computations
approach = "gaussian",
prediction_zero = p0,
- n_combinations = any_number_equal_or_below_10
+ n_coalitions = any_number_equal_or_below_10
)
diff --git a/inst/scripts/devel/testing_intermediate_saving.R b/inst/scripts/devel/testing_intermediate_saving.R
new file mode 100644
index 000000000..81bb9149b
--- /dev/null
+++ b/inst/scripts/devel/testing_intermediate_saving.R
@@ -0,0 +1,132 @@
+
+
+aa = explain(
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "independence",
+ prediction_zero = p0,
+ adaptive_arguments = list(
+ initial_n_coalitions = 10,
+ convergence_tolerance = 0.01,
+ reduction_factor_vec = rep(10^(-5), 10),
+ max_iter = 30
+ ),
+ adaptive = TRUE,
+ print_shapleyres = TRUE,
+  print_iter_info = TRUE, shapley_reweighting = "on_N"
+)
+
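+# Continue the adaptive estimation from the previous shapr object (aa), now with a stricter convergence tolerance.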
+bb = explain(
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "independence",
+ prediction_zero = p0,
+ adaptive_arguments = list(
+ initial_n_coalitions = 10,
+ convergence_tolerance = 0.001,
+ reduction_factor_vec = rep(10^(-5), 10),
+ max_iter = 30
+ ),
+ adaptive = TRUE,
+ print_shapleyres = TRUE,
+  print_iter_info = TRUE, shapley_reweighting = "on_N", prev_shapr_object = aa
+)
+
+
+
+
+##### Reproducible results: set the seed outside explain(), not inside (+ a seed-independent approach)
+# Add something like this
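+# Scheme: with the RNG seeded outside and seed = NULL inside explain(), a full run with max_iter = 7
+# should match a run stopped at max_iter = 4 and then continued via prev_shapr_object.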
+
+
+set.seed(123)
+full = explain(
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "independence",
+ prediction_zero = p0,
+ adaptive_arguments = list(
+ initial_n_coalitions = 10,
+ convergence_tolerance = 0.001,
+ reduction_factor_vec = rep(10^(-5), 10),
+ max_iter = 7
+ ),
+ adaptive = TRUE,
+ print_shapleyres = TRUE,
+ print_iter_info = TRUE,
+ shapley_reweighting = "on_N",
+ seed=NULL
+)
+
+set.seed(123)
+first = explain(
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "independence",
+ prediction_zero = p0,
+ adaptive_arguments = list(
+ initial_n_coalitions = 10,
+ convergence_tolerance = 0.001,
+ reduction_factor_vec = rep(10^(-5), 10),
+ max_iter = 4
+ ),
+ adaptive = TRUE,
+ print_shapleyres = TRUE,
+ print_iter_info = TRUE,
+ shapley_reweighting = "on_N",
+ seed=NULL
+)
+
+
+second = explain(
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "independence",
+ prediction_zero = p0,
+ adaptive_arguments = list(
+ initial_n_coalitions = 10,
+ convergence_tolerance = 0.001,
+ reduction_factor_vec = rep(10^(-5), 10),
+ max_iter = 7
+ ),
+ adaptive = TRUE,
+ print_shapleyres = TRUE,
+ print_iter_info = TRUE,
+ shapley_reweighting = "on_N",
+ seed=NULL,
+ prev_shapr_object = first
+)
+
+
+
+# This cannot be tested, I think.
+second_path = explain(
+ model = model_lm_numeric,
+ x_explain = x_explain_numeric,
+ x_train = x_train_numeric,
+ approach = "independence",
+ prediction_zero = p0,
+ adaptive_arguments = list(
+ initial_n_coalitions = 10,
+ convergence_tolerance = 0.001,
+ reduction_factor_vec = rep(10^(-5), 10),
+ max_iter = 5
+ ),
+ adaptive = TRUE,
+ print_shapleyres = TRUE,
+ print_iter_info = TRUE,
+ shapley_reweighting = "on_N",
+ seed=NULL,
+ prev_shapr_object = first$internal$parameters$adaptive_arguments$saving_path
+)
+
+
+# Identical results
+all.equal(full$shapley_values,second$shapley_values) # TRUE
+all.equal(full$shapley_values,second_path$shapley_values) # TRUE
diff --git a/inst/scripts/devel/testing_memory_monitoring.R b/inst/scripts/devel/testing_memory_monitoring.R
index a372c6cf3..90af5c872 100644
--- a/inst/scripts/devel/testing_memory_monitoring.R
+++ b/inst/scripts/devel/testing_memory_monitoring.R
@@ -44,7 +44,7 @@ xy_train <- cbind(x_train,y=y_train)
model <- lm(formula = y~.,data=xy_train)
-explainer <- shapr(x_train, model,n_combinations = 1000)
+explainer <- shapr(x_train, model,n_coalitions = 1000)
p <- mean(y_train)
diff --git a/inst/scripts/devel/testing_n_cobinations_equal_2_power_m.R b/inst/scripts/devel/testing_n_cobinations_equal_2_power_m.R
index 56e447dee..980597b6f 100644
--- a/inst/scripts/devel/testing_n_cobinations_equal_2_power_m.R
+++ b/inst/scripts/devel/testing_n_cobinations_equal_2_power_m.R
@@ -1,10 +1,10 @@
# In this code we demonstrate that (before the bugfix) the `explain()` function
-# does not enter the exact mode when n_combinations is larger than or equal to 2^m.
-# The mode is only changed if n_combinations is strictly larger than 2^m.
-# This means that we end up with using all coalitions when n_combinations is 2^m,
+# does not enter the exact mode when n_coalitions is larger than or equal to 2^m.
+# The mode is only changed if n_coalitions is strictly larger than 2^m.
+# This means that we end up using all coalitions when n_coalitions is 2^m,
# but do not use the exact Shapley kernel weights.
# Bugfix replaces `>` with `>=` in the places where the code tests if
-# n_combinations is larger than or equal to 2^m. Then the text/messages printed by
+# n_coalitions is larger than or equal to 2^m. Then the text/messages printed by
# shapr and the code correspond.
library(xgboost)
@@ -42,7 +42,7 @@ explanation_exact <- explain(
n_batches = 1, # Not related to the bug
approach = "gaussian",
prediction_zero = p0,
- n_combinations = NULL
+ n_coalitions = NULL
)
# Computing the conditional Shapley values using the gaussian approach
@@ -54,12 +54,12 @@ explanation_should_also_be_exact <- explain(
n_batches = 1, # Not related to the bug
approach = "gaussian",
prediction_zero = p0,
- n_combinations = 2^ncol(x_explain)
+ n_coalitions = 2^ncol(x_explain)
)
# see that both `explain()` objects have the same number of combinations
-explanation_exact$internal$parameters$n_combinations
-explanation_should_also_be_exact$internal$parameters$n_combinations
+explanation_exact$internal$parameters$n_coalitions
+explanation_should_also_be_exact$internal$parameters$n_coalitions
# But the first one of them is exact and the other not.
explanation_exact$internal$parameters$exact
diff --git a/inst/scripts/devel/testing_verification_ar_model.R b/inst/scripts/devel/testing_verification_ar_model.R
index ab5c43d6a..0271bb62f 100644
--- a/inst/scripts/devel/testing_verification_ar_model.R
+++ b/inst/scripts/devel/testing_verification_ar_model.R
@@ -32,7 +32,7 @@ exp <- explain_forecast(model = model_arima_temp,
group_lags = FALSE,
n_batches = 1,
timing = FALSE,
- n_combinations = 50
+ n_coalitions = 50
)
diff --git a/inst/scripts/empirical_memory_testing2.R b/inst/scripts/empirical_memory_testing2.R
index ca57a8d5f..678883b4e 100644
--- a/inst/scripts/empirical_memory_testing2.R
+++ b/inst/scripts/empirical_memory_testing2.R
@@ -100,7 +100,7 @@ internal <- setup(
x_explain = x_explain,
approach = approach,
prediction_zero = prediction_zero,
- n_combinations = 2^p,
+ n_coalitions = 2^p,
group = NULL,
n_samples = 1e3,
n_batches = n_batches_use,
diff --git a/inst/scripts/example_plot_MSEv.R b/inst/scripts/example_plot_MSEv.R
index 42587ccbd..05381b580 100644
--- a/inst/scripts/example_plot_MSEv.R
+++ b/inst/scripts/example_plot_MSEv.R
@@ -228,7 +228,7 @@ plot_MSEv_eval_crit(explanation_list_named,
)$MSEv_explicand_bar
plot_MSEv_eval_crit(explanation_list_named,
plot_type = "comb",
- id_combination = c(3, 4, 9, 13:15)
+ id_coalition = c(3, 4, 9, 13:15)
)$MSEv_combination_bar
@@ -236,11 +236,11 @@ plot_MSEv_eval_crit(explanation_list_named,
MSEv_combination <- plot_MSEv_eval_crit(
explanation_list_named,
plot_type = "comb",
- id_combination = c(3, 4, 9, 13:15)
+ id_coalition = c(3, 4, 9, 13:15)
)$MSEv_combination_bar
MSEv_combination$data$Method <- factor(MSEv_combination$data$Method, levels = rev(levels(MSEv_combination$data$Method)))
MSEv_combination +
- ggplot2::scale_x_discrete(limits = rev(unique(MSEv_combination$data$id_combination))) +
+ ggplot2::scale_x_discrete(limits = rev(unique(MSEv_combination$data$id_coalition))) +
ggplot2::scale_fill_discrete(breaks = rev(levels(MSEv_combination$data$Method)), direction = -1) +
ggplot2::coord_flip()
@@ -249,14 +249,14 @@ MSEv_combination +
MSEv_combination_wo_CI <- plot_MSEv_eval_crit(
explanation_list_named,
plot_type = "comb",
- id_combination = c(3, 4, 9, 13:15),
+ id_coalition = c(3, 4, 9, 13:15),
CI_level = NULL
)$MSEv_combination_bar
MSEv_combination_wo_CI$data$Method <- factor(MSEv_combination_wo_CI$data$Method,
levels = rev(levels(MSEv_combination_wo_CI$data$Method))
)
MSEv_combination_wo_CI +
- ggplot2::scale_x_discrete(limits = rev(unique(MSEv_combination_wo_CI$data$id_combination))) +
+ ggplot2::scale_x_discrete(limits = rev(unique(MSEv_combination_wo_CI$data$id_coalition))) +
ggplot2::scale_fill_brewer(
breaks = rev(levels(MSEv_combination_wo_CI$data$Method)),
palette = "Paired",
@@ -292,7 +292,7 @@ explanation_gaussian_seed_1 <- explain(
approach = "gaussian",
prediction_zero = prediction_zero,
n_samples = 10,
- n_combinations = 10,
+ n_coalitions = 10,
seed = 1
)
@@ -303,7 +303,7 @@ explanation_gaussian_seed_1_V2 <- explain(
approach = "gaussian",
prediction_zero = prediction_zero,
n_samples = 10,
- n_combinations = 10,
+ n_coalitions = 10,
seed = 1
)
@@ -314,7 +314,7 @@ explanation_gaussian_seed_2 <- explain(
approach = "gaussian",
prediction_zero = prediction_zero,
n_samples = 10,
- n_combinations = 10,
+ n_coalitions = 10,
seed = 2
)
@@ -325,7 +325,7 @@ explanation_gaussian_seed_3 <- explain(
approach = "gaussian",
prediction_zero = prediction_zero,
n_samples = 10,
- n_combinations = 10,
+ n_coalitions = 10,
seed = 3
)
diff --git a/inst/scripts/testing_samling_ncombinations.R b/inst/scripts/testing_samling_ncombinations.R
index 65e066d98..1d4fdb36f 100644
--- a/inst/scripts/testing_samling_ncombinations.R
+++ b/inst/scripts/testing_samling_ncombinations.R
@@ -5,12 +5,12 @@ library(shapr)
library(data.table)
n = c(100, 1000, 2000)
p = c(5, 10, 10)
-n_combinations = c(20, 800, 800)
+n_coalitions = c(20, 800, 800)
res = list()
for (i in seq_along(n)) {
set.seed(123)
- cat("n =", n[i], "p =", p[i], "n_combinations =", n_combinations[i], "\n")
+ cat("n =", n[i], "p =", p[i], "n_coalitions =", n_coalitions[i], "\n")
x_train = data.table(matrix(rnorm(n[i]*p[i]), nrow = n[i], ncol = p[i]))
x_test = data.table(matrix(rnorm(10*p[i]), nrow = 10, ncol = p[i]))
beta = rnorm(p[i])
@@ -26,7 +26,7 @@ for (i in seq_along(n)) {
model = model,
approach = "empirical",
prediction_zero = p_mean,
- n_combinations = n_combinations[i]
+ n_coalitions = n_coalitions[i]
)
)
}
@@ -37,7 +37,7 @@ for (i in seq_along(n)) {
set.seed(123)
- cat("n =", n[i], "p =", p[i], "n_combinations =", n_combinations[i], "\n")
+ cat("n =", n[i], "p =", p[i], "n_coalitions =", n_coalitions[i], "\n")
x_train = data.table(matrix(rnorm(n[i] * p[i]), nrow = n[i], ncol = p[i]))
x_test = data.table(matrix(rnorm(10 * p[i]), nrow = 10, ncol = p[i]))
beta = rnorm(p[i])
@@ -53,7 +53,7 @@ for (i in seq_along(n)) {
model = model,
approach = "empirical",
prediction_zero = p_mean,
- n_combinations = n_combinations[i]
+ n_coalitions = n_coalitions[i]
)
)
}
@@ -65,7 +65,7 @@ saveRDS(res2, "inst/scripts/testing_samling_ncombinations2.rds")
i = 2
set.seed(123)
-cat("n =", n[i], "p =", p[i], "n_combinations =", n_combinations[i], "\n")
+cat("n =", n[i], "p =", p[i], "n_coalitions =", n_coalitions[i], "\n")
x_train = data.table(matrix(rnorm(n[i] * p[i]), nrow = n[i], ncol = p[i]))
x_test = data.table(matrix(rnorm(10 * p[i]), nrow = 10, ncol = p[i]))
beta = rnorm(p[i])
@@ -80,7 +80,7 @@ system.time({res = explain(
model = model,
approach = "empirical",
prediction_zero = p_mean,
- n_combinations = 1000
+ n_coalitions = 1000
)})
devtools::load_all()
@@ -90,7 +90,7 @@ system.time({res2 = explain(
model = model,
approach = "empirical",
prediction_zero = p_mean,
- n_combinations = 800
+ n_coalitions = 800
)})
@@ -101,7 +101,7 @@ system.time({res3 = explain(
model = model,
approach = "empirical",
prediction_zero = p_mean,
- n_combinations = NULL
+ n_coalitions = NULL
)})
x2 = Sys.time()
@@ -118,7 +118,7 @@ res = profvis({res = explain(
model = model,
approach = "empirical",
prediction_zero = p_mean,
- n_combinations = n_combinations[i]
+ n_coalitions = n_coalitions[i]
)})
res
diff --git a/inst/scripts/timing_script_2023.R b/inst/scripts/timing_script_2023.R
index d43db74f6..71a25c1c7 100644
--- a/inst/scripts/timing_script_2023.R
+++ b/inst/scripts/timing_script_2023.R
@@ -73,7 +73,7 @@ explanation <- explain(
approach = approach,
n_batches = n_batches_use,
prediction_zero = prediction_zero,
- n_combinations = 10^4
+ n_coalitions = 10^4
)
sys_time_end_explain <- Sys.time()
@@ -89,7 +89,7 @@ timing <- list(p = p,
n_batches = n_batches,
n_cores = n_cores,
approach = approach,
- n_combinations = explanation$internal$parameters$used_n_combinations,
+ n_coalitions = explanation$internal$parameters$used_n_coalitions,
sys_time_initial = as.character(sys_time_initial),
sys_time_start_explain = as.character(sys_time_start_explain),
sys_time_end_explain = as.character(sys_time_end_explain),
diff --git a/man/additional_regression_setup.Rd b/man/additional_regression_setup.Rd
new file mode 100644
index 000000000..9aebdd035
--- /dev/null
+++ b/man/additional_regression_setup.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/setup.R
+\name{additional_regression_setup}
+\alias{additional_regression_setup}
+\title{Additional setup for regression-based methods}
+\usage{
+additional_regression_setup(internal, model, predict_model)
+}
+\arguments{
+\item{internal}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
+}
+\description{
+Additional setup for regression-based methods
+}
+\keyword{internal}
diff --git a/man/check_convergence.Rd b/man/check_convergence.Rd
new file mode 100644
index 000000000..8d727207a
--- /dev/null
+++ b/man/check_convergence.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/check_convergence.R
+\name{check_convergence}
+\alias{check_convergence}
+\title{Checks the convergence according to the convergence threshold}
+\usage{
+check_convergence(internal)
+}
+\arguments{
+\item{internal}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
+}
+\description{
+Checks the convergence according to the convergence threshold
+}
+\keyword{internal}
diff --git a/man/check_verbose.Rd b/man/check_verbose.Rd
new file mode 100644
index 000000000..6300f3210
--- /dev/null
+++ b/man/check_verbose.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/setup.R
+\name{check_verbose}
+\alias{check_verbose}
+\title{Function that checks the verbose parameter}
+\usage{
+check_verbose(verbose)
+}
+\arguments{
+\item{verbose}{String vector or NULL.
+Specifies the verbosity (printout detail level) through one or more of the strings \code{"basic"}, \code{"progress"},
+\code{"convergence"}, \code{"shapley"} and \code{"vS_details"}.
+\code{"basic"} (default) displays basic information about the computation which is being performed.
+\code{"progress"} displays information about where in the calculation process the function currently is.
+\code{"convergence"} displays information on how close to convergence the Shapley value estimates are
+(only when \code{adaptive = TRUE}).
+\code{"shapley"} displays intermediate Shapley value estimates and standard deviations (only when \code{adaptive = TRUE}),
+as well as the final estimates.
+\code{"vS_details"} displays information about the v_S estimates.
+This is most relevant for \code{approach \%in\% c("regression_separate", "regression_surrogate", "vaeac")}.
+\code{NULL} means no printout.
+Note that any combination of these strings can be used.
+E.g. \code{verbose = c("basic", "vS_details")} will display basic information + details about the vS estimation process.}
+}
+\value{
+The function does not return anything.
+}
+\description{
+Function that checks the verbose parameter
+}
+\author{
+Lars Henry Berge Olsen, Martin Jullum
+}
+\keyword{internal}
diff --git a/man/coalition_matrix_cpp.Rd b/man/coalition_matrix_cpp.Rd
new file mode 100644
index 000000000..5f5956e11
--- /dev/null
+++ b/man/coalition_matrix_cpp.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{coalition_matrix_cpp}
+\alias{coalition_matrix_cpp}
+\title{Get coalition matrix}
+\usage{
+coalition_matrix_cpp(coalitions, m)
+}
+\arguments{
+\item{coalitions}{List}
+
+\item{m}{Positive integer. Total number of features}
+}
+\value{
+Matrix
+}
+\description{
+Get coalition matrix
+}
+\author{
+Nikolai Sellereite, Martin Jullum
+}
+\keyword{internal}
diff --git a/man/compute_MSEv_eval_crit.Rd b/man/compute_MSEv_eval_crit.Rd
index c6e3e0549..309f50e56 100644
--- a/man/compute_MSEv_eval_crit.Rd
+++ b/man/compute_MSEv_eval_crit.Rd
@@ -14,38 +14,40 @@ compute_MSEv_eval_crit(
\arguments{
\item{internal}{List.
Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
-The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
+The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{iter_list}, \code{timing_list},
+\code{main_timing_list}, \code{output}, \code{iter_timing_list} and \code{iter_results}.}
-\item{dt_vS}{Data.table of dimension \code{n_combinations} times \code{n_explain + 1} containing the contribution function
-estimates. The first column is assumed to be named \code{id_combination} and containing the ids of the combinations.
-The last row is assumed to be the full combination, i.e., it contains the predicted responses for the observations
+\item{dt_vS}{Data.table of dimension \code{n_coalitions} times \code{n_explain + 1} containing the contribution function
+estimates. The first column is assumed to be named \code{id_coalition} and containing the ids of the coalitions.
+The last row is assumed to be the full coalition, i.e., it contains the predicted responses for the observations
which are to be explained.}
-\item{MSEv_uniform_comb_weights}{Logical. If \code{TRUE} (default), then the function weights the combinations
-uniformly when computing the MSEv criterion. If \code{FALSE}, then the function use the Shapley kernel weights to
-weight the combinations when computing the MSEv criterion. Note that the Shapley kernel weights are replaced by the
-sampling frequency when not all combinations are considered.}
+\item{MSEv_uniform_comb_weights}{Logical.
+If \code{TRUE} (default), then the function weights the coalitions uniformly when computing the MSEv criterion.
+If \code{FALSE}, then the function uses the Shapley kernel weights to weight the coalitions when computing the MSEv
+criterion.
+Note that the Shapley kernel weights are replaced by the sampling frequency when not all coalitions are considered.}
\item{MSEv_skip_empty_full_comb}{Logical. If \code{TRUE} (default), we exclude the empty and grand
-combinations/coalitions when computing the MSEv evaluation criterion. This is reasonable as they are identical
+coalitions when computing the MSEv evaluation criterion. This is reasonable as they are identical
for all methods, i.e., their contribution function is independent of the used method as they are special cases not
-effected by the used method. If \code{FALSE}, we include the empty and grand combinations/coalitions. In this situation,
+affected by the used method. If \code{FALSE}, we include the empty and grand coalitions. In this situation,
we also recommend setting \code{MSEv_uniform_comb_weights = TRUE}, as otherwise the large weights for the empty and
-grand combinations/coalitions will outweigh all other combinations and make the MSEv criterion uninformative.}
+grand coalitions will outweigh all other coalitions and make the MSEv criterion uninformative.}
}
\value{
List containing:
\describe{
\item{\code{MSEv}}{A \code{\link[data.table]{data.table}} with the overall MSEv evaluation criterion averaged
-over both the combinations/coalitions and observations/explicands. The \code{\link[data.table]{data.table}}
-also contains the standard deviation of the MSEv values for each explicand (only averaged over the combinations)
+over both the coalitions and observations/explicands. The \code{\link[data.table]{data.table}}
+also contains the standard deviation of the MSEv values for each explicand (only averaged over the coalitions)
divided by the square root of the number of explicands.}
\item{\code{MSEv_explicand}}{A \code{\link[data.table]{data.table}} with the mean squared error for each
-explicand, i.e., only averaged over the combinations/coalitions.}
-\item{\code{MSEv_combination}}{A \code{\link[data.table]{data.table}} with the mean squared error for each
-combination/coalition, i.e., only averaged over the explicands/observations.
+explicand, i.e., only averaged over the coalitions.}
+\item{\code{MSEv_coalition}}{A \code{\link[data.table]{data.table}} with the mean squared error for each
+coalition, i.e., only averaged over the explicands/observations.
The \code{\link[data.table]{data.table}} also contains the standard deviation of the MSEv values for
-each combination divided by the square root of the number of explicands.}
+each coalition divided by the square root of the number of explicands.}
}
}
\description{
diff --git a/man/compute_estimates.Rd b/man/compute_estimates.Rd
new file mode 100644
index 000000000..9d708f738
--- /dev/null
+++ b/man/compute_estimates.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute_estimates.R
+\name{compute_estimates}
+\alias{compute_estimates}
+\title{Computes the Shapley values and their standard deviation given the \code{v(S)}}
+\usage{
+compute_estimates(internal, vS_list)
+}
+\arguments{
+\item{internal}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
+
+\item{vS_list}{List
+Output from \code{\link[=compute_vS]{compute_vS()}}}
+}
+\description{
+Computes the Shapley values and their standard deviation given the \code{v(S)}
+}
+\keyword{internal}
diff --git a/man/compute_shapley_new.Rd b/man/compute_shapley_new.Rd
index 14e77306d..6e3a5ce6f 100644
--- a/man/compute_shapley_new.Rd
+++ b/man/compute_shapley_new.Rd
@@ -1,5 +1,5 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/finalize_explanation.R
+% Please edit documentation in R/compute_estimates.R
\name{compute_shapley_new}
\alias{compute_shapley_new}
\title{Compute shapley values}
@@ -9,7 +9,8 @@ compute_shapley_new(internal, dt_vS)
\arguments{
\item{internal}{List.
Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
-The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
+The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{iter_list}, \code{timing_list},
+\code{main_timing_list}, \code{output}, \code{iter_timing_list} and \code{iter_results}.}
\item{dt_vS}{The contribution matrix.}
}
diff --git a/man/compute_time.Rd b/man/compute_time.Rd
new file mode 100644
index 000000000..e6539e5e4
--- /dev/null
+++ b/man/compute_time.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/timing.R
+\name{compute_time}
+\alias{compute_time}
+\title{Gathers and computes the timing of the different parts of the explain function.}
+\usage{
+compute_time(internal)
+}
+\arguments{
+\item{internal}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
+}
+\description{
+Gathers and computes the timing of the different parts of the explain function.
+}
+\keyword{internal}
diff --git a/man/compute_vS.Rd b/man/compute_vS.Rd
index 1988ef5c5..1f8a69e0d 100644
--- a/man/compute_vS.Rd
+++ b/man/compute_vS.Rd
@@ -8,8 +8,7 @@ compute_vS(internal, model, predict_model, method = "future")
}
\arguments{
\item{internal}{List.
-Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
-The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
\item{model}{Objects.
The model object that ought to be explained.
@@ -20,8 +19,10 @@ The prediction function used when \code{model} is not natively supported.
See the documentation of \code{\link[=explain]{explain()}} for details.}
\item{method}{Character
-Indicates whether the lappy method (default) or loop method should be used.}
+Indicates whether the lapply method (default) or loop method should be used.
+This is only used for testing purposes.}
}
\description{
Computes \code{v(S)} for all feature subsets \code{S}.
}
+\keyword{internal}
diff --git a/man/compute_vS_forecast.Rd b/man/compute_vS_forecast.Rd
new file mode 100644
index 000000000..0b6d25633
--- /dev/null
+++ b/man/compute_vS_forecast.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute_vS.R
+\name{compute_vS_forecast}
+\alias{compute_vS_forecast}
+\title{Computes \code{v(S)} for all feature subsets \code{S}.}
+\usage{
+compute_vS_forecast(internal, model, predict_model, method = "future")
+}
+\arguments{
+\item{internal}{List.
+Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
+The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{iter_list}, \code{timing_list},
+\code{main_timing_list}, \code{output}, \code{iter_timing_list} and \code{iter_results}.}
+
+\item{model}{Objects.
+The model object that ought to be explained.
+See the documentation of \code{\link[=explain]{explain()}} for details.}
+
+\item{predict_model}{Function.
+The prediction function used when \code{model} is not natively supported.
+See the documentation of \code{\link[=explain]{explain()}} for details.}
+
+\item{method}{Character
+Indicates whether the lapply method (default) or loop method should be used.}
+}
+\description{
+Computes \code{v(S)} for all feature subsets \code{S}.
+}
diff --git a/man/create_coalition_table.Rd b/man/create_coalition_table.Rd
new file mode 100644
index 000000000..dd0b12723
--- /dev/null
+++ b/man/create_coalition_table.Rd
@@ -0,0 +1,73 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/shapley_setup.R
+\name{create_coalition_table}
+\alias{create_coalition_table}
+\title{Define coalitions, and fetch additional information about each unique coalition}
+\usage{
+create_coalition_table(
+ m,
+ exact = TRUE,
+ n_coalitions = 200,
+ weight_zero_m = 10^6,
+ paired_shap_sampling = TRUE,
+ prev_coal_samples = NULL,
+ coal_feature_list = as.list(seq_len(m)),
+ approach0 = "gaussian",
+ shapley_reweighting = "none"
+)
+}
+\arguments{
+\item{m}{Positive integer.
+Total number of features/groups.}
+
+\item{exact}{Logical.
+If \code{TRUE} all \code{2^m} coalitions are generated, otherwise a subsample of the coalitions is used.}
+
+\item{n_coalitions}{Positive integer.
+Note that if \code{exact = TRUE}, \code{n_coalitions} is ignored.}
+
+\item{weight_zero_m}{Numeric.
+The value to use as a replacement for infinite coalition weights when doing numerical operations.}
+
+\item{paired_shap_sampling}{Logical.
+Whether to do paired sampling of coalitions.}
+
+\item{prev_coal_samples}{List.
+A list of previously sampled coalitions.}
+
+\item{coal_feature_list}{List.
+A list mapping each coalition to the features it contains.}
+
+\item{approach0}{Character vector.
+Contains the approach to be used for estimation of each coalition size. Same as \code{approach} in \code{explain()}.}
+
+\item{shapley_reweighting}{String.
+How to reweight the sampling frequency weights in the kernelSHAP solution after sampling, with the aim of reducing
+the randomness and thereby the variance of the Shapley value estimates.
+One of \code{'none'}, \code{'on_coal_size'}, \code{'on_N'}, \code{'on_all'}, \code{'on_all_cond'} (default).
+\code{'none'} means no reweighting, i.e. the sampling frequency weights are used as is.
+\code{'on_coal_size'} means the sampling frequencies are averaged over all coalitions of the same size.
+\code{'on_N'} means the sampling frequencies are averaged over all coalitions with the same original sampling
+probabilities.
+\code{'on_all'} means the original sampling probabilities are used for all coalitions.
+\code{'on_all_cond'} means the original sampling probabilities are used for all coalitions, while adjusting for the
+probability that they are sampled at least once.
+\code{'on_all_cond'} is preferred as it has performed the best in simulation studies.}
+}
+\value{
+A data.table with information about each unique coalition (one row per coalition).
+}
+\description{
+Define coalitions, and fetch additional information about each unique coalition
+}
+\examples{
+# All coalitions
+x <- create_coalition_table(m = 3)
+nrow(x) # Equals 2^3 = 8
+
+# Subsample of coalitions
+x <- create_coalition_table(exact = FALSE, m = 10, n_coalitions = 1e2)
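+
+# Paired sampling (paired_shap_sampling = TRUE, the default) also includes the
+# complement of each sampled coalition. E.g., among m = 5 features, the
+# complement of coalition c(1, 3, 5) is:
+setdiff(seq_len(5), c(1, 3, 5)) # c(2, 4)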
+}
+\author{
+Nikolai Sellereite, Martin Jullum
+}
diff --git a/man/create_ctree.Rd b/man/create_ctree.Rd
index 3c3db21f6..a85d8871f 100644
--- a/man/create_ctree.Rd
+++ b/man/create_ctree.Rd
@@ -27,7 +27,7 @@ Determines minimum value that the sum of the left and right daughter nodes requi
\item{minbucket}{Numeric scalar. (default = 7)
Determines the minimum sum of weights in a terminal node required for a split}
-\item{use_partykit}{String. In some semi-rare cases \code{partyk::ctree} runs into an error related to the LINPACK
+\item{use_partykit}{String. In some semi-rare cases \code{party::ctree} runs into an error related to the LINPACK
used by R. To get around this problem, one may fall back to using the newer (but slower) \code{partykit::ctree}
function, which is a reimplementation of the same method. Setting this parameter to \code{"on_error"} (default)
falls back to \code{partykit::ctree}, if \code{party::ctree} fails. Other options are \code{"never"}, which always
diff --git a/man/default_doc.Rd b/man/default_doc.Rd
index cca1358e3..7ee25632a 100644
--- a/man/default_doc.Rd
+++ b/man/default_doc.Rd
@@ -9,7 +9,8 @@ default_doc(internal, model, predict_model, output_size, extra, ...)
\arguments{
\item{internal}{List.
Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
-The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
+The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{iter_list}, \code{timing_list},
+\code{main_timing_list}, \code{output}, \code{iter_timing_list} and \code{iter_results}.}
\item{model}{Objects.
The model object that ought to be explained.
diff --git a/man/default_doc_explain.Rd b/man/default_doc_explain.Rd
index 6adafa2d7..9fcc39a30 100644
--- a/man/default_doc_explain.Rd
+++ b/man/default_doc_explain.Rd
@@ -7,10 +7,11 @@
default_doc_explain(internal, index_features)
}
\arguments{
-\item{internal}{Not used.}
+\item{internal}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
-\item{index_features}{Positive integer vector. Specifies the indices of combinations to
-apply to the present method. \code{NULL} means all combinations. Only used internally.}
+\item{index_features}{Positive integer vector. Specifies the coalitions (by \code{id_coalition}) to
+apply to the present method. \code{NULL} means all coalitions. Only used internally.}
}
\description{
Exported documentation helper function.
diff --git a/man/explain.Rd b/man/explain.Rd
index 2b121b12d..dd0e18125 100644
--- a/man/explain.Rd
+++ b/man/explain.Rd
@@ -9,18 +9,21 @@ explain(
x_explain,
x_train,
approach,
+ paired_shap_sampling = TRUE,
prediction_zero,
- n_combinations = NULL,
+ max_n_coalitions = NULL,
+ adaptive = NULL,
group = NULL,
- n_samples = 1000,
- n_batches = NULL,
+ n_MC_samples = 1000,
seed = 1,
keep_samp_for_vS = FALSE,
predict_model = NULL,
get_model_specs = NULL,
MSEv_uniform_comb_weights = TRUE,
- timing = TRUE,
- verbose = 0,
+ verbose = "basic",
+ adaptive_arguments = list(),
+ shapley_reweighting = "on_all_cond",
+ prev_shapr_object = NULL,
...
)
}
@@ -43,17 +46,36 @@ All elements should, either be \code{"gaussian"}, \code{"copula"}, \code{"empiri
\code{"categorical"}, \code{"timeseries"}, \code{"independence"}, \code{"regression_separate"}, or \code{"regression_surrogate"}.
The two regression approaches can not be combined with any other approach. See details for more information.}
+\item{paired_shap_sampling}{Logical.
+If \code{TRUE} (default), paired versions of all sampled coalitions are also included in the computation.
+That is, if there are 5 features and e.g. coalition (1,3,5) is sampled, then coalition (2,4) is also used for
+computing the Shapley values. This is done to reduce the variance of the Shapley value estimates.}
+
\item{prediction_zero}{Numeric.
The prediction value for unseen data, i.e. an estimate of the expected prediction without conditioning on any
features.
Typically we set this value equal to the mean of the response variable in our training data, but other choices
such as the mean of the predictions in the training data are also reasonable.}
-\item{n_combinations}{Integer.
-If \code{group = NULL}, \code{n_combinations} represents the number of unique feature combinations to sample.
-If \code{group != NULL}, \code{n_combinations} represents the number of unique group combinations to sample.
-If \code{n_combinations = NULL}, the exact method is used and all combinations are considered.
-The maximum number of combinations equals \code{2^m}, where \code{m} is the number of features.}
+\item{max_n_coalitions}{Integer.
+The upper limit on the number of unique feature/group coalitions to use in the adaptive procedure
+(if \code{adaptive = TRUE}).
+If \code{adaptive = FALSE} it represents the number of feature/group coalitions to use directly.
+The quantity refers to the number of unique feature coalitions if \code{group = NULL},
+and group coalitions if \code{group != NULL}.
+\code{max_n_coalitions = NULL} corresponds to \code{max_n_coalitions = 2^n_features}.}
+
+\item{adaptive}{Logical or NULL.
+If \code{NULL} (default), the argument is set to \code{TRUE} if there are more than 5 features/groups, and \code{FALSE} otherwise.
+If \code{TRUE} (set directly or via the \code{NULL} default), the Shapley values are estimated adaptively in an
+iterative manner.
+This provides sufficiently accurate Shapley value estimates faster.
+First an initial number of coalitions is sampled, then bootstrapping is used to estimate the variance of the Shapley
+values.
+A convergence criterion is used to determine if the variances of the Shapley values are sufficiently small.
+If the variances are too high, we estimate the number of required samples to reach convergence, and thereby add more
+coalitions.
+The process is repeated until the variances are below the threshold.
+Specifics related to the adaptive process and convergence criterion are set through \code{adaptive_arguments}.}
\item{group}{List.
If \code{NULL} regular feature wise Shapley values are computed.
@@ -61,39 +83,34 @@ If provided, group wise Shapley values are computed. \code{group} then has lengt
the number of groups. The list element contains character vectors with the features included
in each of the different groups.}
-\item{n_samples}{Positive integer.
-Indicating the maximum number of samples to use in the
-Monte Carlo integration for every conditional expectation. See also details.}
-
-\item{n_batches}{Positive integer (or NULL).
-Specifies how many batches the total number of feature combinations should be split into when calculating the
-contribution function for each test observation.
-The default value is NULL which uses a reasonable trade-off between RAM allocation and computation speed,
-which depends on \code{approach} and \code{n_combinations}.
-For models with many features, increasing the number of batches reduces the RAM allocation significantly.
-This typically comes with a small increase in computation time.}
+\item{n_MC_samples}{Positive integer.
+Indicating the maximum number of samples to use in the Monte Carlo integration for every conditional expectation.
+For \code{approach="ctree"}, \code{n_MC_samples} corresponds to the number of samples
+from the leaf node (see an exception related to the \code{ctree.sample} argument in \code{\link[=setup_approach.ctree]{setup_approach.ctree()}}).
+For \code{approach="empirical"}, \code{n_MC_samples} is the \eqn{K} parameter in equations (14-15) of
+Aas et al. (2021), i.e. the maximum number of observations (with largest weights) that is used, see also the
+\code{empirical.eta} argument in \code{\link[=setup_approach.empirical]{setup_approach.empirical()}}.}
\item{seed}{Positive integer.
Specifies the seed before any randomness based code is being run.
-If \code{NULL} the seed will be inherited from the calling environment.}
+If \code{NULL}, no seed is set in the calling environment.}
\item{keep_samp_for_vS}{Logical.
-Indicates whether the samples used in the Monte Carlo estimation of v_S should be returned
-(in \code{internal$output})}
+Indicates whether the samples used in the Monte Carlo estimation of v_S should be returned (in \code{internal$output}).
+Not used for \code{approach="regression_separate"} or \code{approach="regression_surrogate"}.}
\item{predict_model}{Function.
The prediction function used when \code{model} is not natively supported.
-(Run \code{\link[=get_supported_models]{get_supported_models()}} for a list of natively supported
-models.)
+(Run \code{\link[=get_supported_models]{get_supported_models()}} for a list of natively supported models.)
The function must have two arguments, \code{model} and \code{newdata} which specify, respectively, the model
-and a data.frame/data.table to compute predictions for. The function must give the prediction as a numeric vector.
+and a data.frame/data.table to compute predictions for.
+The function must give the prediction as a numeric vector.
\code{NULL} (the default) uses functions specified internally.
Can also be used to override the default function for natively supported model classes.}
\item{get_model_specs}{Function.
An optional function for checking model/data consistency when \code{model} is not natively supported.
-(Run \code{\link[=get_supported_models]{get_supported_models()}} for a list of natively supported
-models.)
+(Run \code{\link[=get_supported_models]{get_supported_models()}} for a list of natively supported models.)
The function takes \code{model} as argument and provides a list with 3 elements:
\describe{
\item{labels}{Character vector with the names of each feature.}
@@ -104,18 +121,51 @@ If \code{NULL} (the default) internal functions are used for natively supported
disabled for unsupported model classes.
Can also be used to override the default function for natively supported model classes.}
-\item{MSEv_uniform_comb_weights}{Logical. If \code{TRUE} (default), then the function weights the combinations
-uniformly when computing the MSEv criterion. If \code{FALSE}, then the function use the Shapley kernel weights to
-weight the combinations when computing the MSEv criterion. Note that the Shapley kernel weights are replaced by the
-sampling frequency when not all combinations are considered.}
-
-\item{timing}{Logical.
-Whether the timing of the different parts of the \code{explain()} should saved in the model object.}
-
-\item{verbose}{An integer specifying the level of verbosity. If \code{0}, \code{shapr} will stay silent.
-If \code{1}, it will print information about performance. If \code{2}, some additional information will be printed out.
-Use \code{0} (default) for no verbosity, \code{1} for low verbose, and \code{2} for high verbose.
-TODO: Make this clearer when we end up fixing this and if they should force a progressr bar.}
+\item{MSEv_uniform_comb_weights}{Logical.
+If \code{TRUE} (default), then the function weights the coalitions uniformly when computing the MSEv criterion.
+If \code{FALSE}, then the function uses the Shapley kernel weights to weight the coalitions when computing the MSEv
+criterion.
+Note that the Shapley kernel weights are replaced by the sampling frequency when not all coalitions are considered.}
+
+\item{verbose}{String vector or NULL.
+Specifies the verbosity (printout detail level) through one or more of the strings \code{"basic"}, \code{"progress"},
+\code{"convergence"}, \code{"shapley"} and \code{"vS_details"}.
+\code{"basic"} (default) displays basic information about the computation which is being performed.
+\code{"progress"} displays information about where in the calculation process the function currently is.
+\code{"convergence"} displays information on how close to convergence the Shapley value estimates are
+(only when \code{adaptive = TRUE}).
+\code{"shapley"} displays intermediate Shapley value estimates and standard deviations (only when \code{adaptive = TRUE}),
+as well as the final estimates.
+\code{"vS_details"} displays information about the v_S estimates.
+This is most relevant for \code{approach \%in\% c("regression_separate", "regression_surrogate", "vaeac")}.
+\code{NULL} means no printout.
+Note that any combination of the five strings can be used.
+E.g. \code{verbose = c("basic", "vS_details")} will display basic information plus details about the v(S) estimation process.}
+
+\item{adaptive_arguments}{Named list.
+Specifies the arguments for the adaptive procedure.
+See \code{\link[=get_adaptive_arguments_default]{get_adaptive_arguments_default()}} for a description of the arguments and their default values.}
+
+\item{shapley_reweighting}{String.
+How to reweight the sampling frequency weights in the kernelSHAP solution after sampling, with the aim of reducing
+the randomness and thereby the variance of the Shapley value estimates.
+One of \code{'none'}, \code{'on_coal_size'}, \code{'on_N'}, \code{'on_all'}, \code{'on_all_cond'} (default).
+\code{'none'} means no reweighting, i.e. the sampling frequency weights are used as is.
+\code{'on_coal_size'} means the sampling frequencies are averaged over all coalitions of the same size.
+\code{'on_N'} means the sampling frequencies are averaged over all coalitions with the same original sampling
+probabilities.
+\code{'on_all'} means the original sampling probabilities are used for all coalitions.
+\code{'on_all_cond'} means the original sampling probabilities are used for all coalitions, while adjusting for the
+probability that they are sampled at least once.
+\code{'on_all_cond'} is preferred as it has performed the best in simulation studies; see the examples below.}
+
+\item{prev_shapr_object}{\code{shapr} object or string.
+If an object of class \code{shapr} is provided, or a string with a path to where intermediate results are stored,
+then the function will use the previous object to continue the computation.
+This is useful if the computation is interrupted or you want higher accuracy than already obtained, and therefore
+want to continue the adaptive estimation. See the vignette for examples.}
\item{...}{
Arguments passed on to \code{\link[=setup_approach.empirical]{setup_approach.empirical}}, \code{\link[=setup_approach.independence]{setup_approach.independence}}, \code{\link[=setup_approach.gaussian]{setup_approach.gaussian}}, \code{\link[=setup_approach.copula]{setup_approach.copula}}, \code{\link[=setup_approach.ctree]{setup_approach.ctree}}, \code{\link[=setup_approach.vaeac]{setup_approach.vaeac}}, \code{\link[=setup_approach.categorical]{setup_approach.categorical}}, \code{\link[=setup_approach.regression_separate]{setup_approach.regression_separate}}, \code{\link[=setup_approach.regression_surrogate]{setup_approach.regression_surrogate}}, \code{\link[=setup_approach.timeseries]{setup_approach.timeseries}}
@@ -130,7 +180,7 @@ If e.g. \code{eta = .8} we will choose the \code{K} samples with the largest wei
accounts for 80\\% of the total weight.
\code{eta} is the \eqn{\eta} parameter in equation (15) of Aas et al (2021).}
\item{\code{empirical.fixed_sigma}}{Positive numeric scalar. (default = 0.1)
-Represents the kernel bandwidth in the distance computation used when conditioning on all different combinations.
+Represents the kernel bandwidth in the distance computation used when conditioning on all different coalitions.
Only used when \code{empirical.type = "fixed_sigma"}}
\item{\code{empirical.n_samples_aicc}}{Positive integer. (default = 1000)
Number of samples to consider in AICc optimization.
@@ -144,7 +194,8 @@ Only used for \code{empirical.type} is either \code{"AICc_each_k"} or \code{"AIC
\item{\code{empirical.cov_mat}}{Numeric matrix. (Optional, default = NULL)
Containing the covariance matrix of the data generating distribution used to define the Mahalanobis distance.
\code{NULL} means it is estimated from \code{x_train}.}
- \item{\code{internal}}{Not used.}
+ \item{\code{internal}}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
\item{\code{gaussian.mu}}{Numeric vector. (Optional)
Containing the mean of the data generating distribution.
\code{NULL} means it is estimated from the \code{x_train}.}
@@ -160,13 +211,13 @@ Determines minimum value that the sum of the left and right daughter nodes requi
\item{\code{ctree.minbucket}}{Numeric scalar. (default = 7)
Determines the minimum sum of weights in a terminal node required for a split}
\item{\code{ctree.sample}}{Boolean. (default = TRUE)
-If TRUE, then the method always samples \code{n_samples} observations from the leaf nodes (with replacement).
-If FALSE and the number of observations in the leaf node is less than \code{n_samples},
+If TRUE, then the method always samples \code{n_MC_samples} observations from the leaf nodes (with replacement).
+If FALSE and the number of observations in the leaf node is less than \code{n_MC_samples},
the method will take all observations in the leaf.
-If FALSE and the number of observations in the leaf node is more than \code{n_samples},
-the method will sample \code{n_samples} observations (with replacement).
+If FALSE and the number of observations in the leaf node is more than \code{n_MC_samples},
+the method will sample \code{n_MC_samples} observations (with replacement).
This means that there will always be sampling in the leaf unless
-\code{sample} = FALSE AND the number of obs in the node is less than \code{n_samples}.}
+\code{sample} = FALSE AND the number of obs in the node is less than \code{n_MC_samples}.}
\item{\code{vaeac.depth}}{Positive integer (default is \code{3}). The number of hidden layers
in the neural networks of the masked encoder, full encoder, and decoder.}
\item{\code{vaeac.width}}{Positive integer (default is \code{32}). The number of neurons in each
@@ -201,8 +252,8 @@ is also a valid input. It is essential to include the package prefix if the pack
The data.frame must contain the possible hyperparameter value combinations to try.
The column names must match the names of the tuneable parameters specified in \code{regression.model}.
If \code{regression.tune_values} is a function, then it should take one argument \code{x} which is the training data
-for the current combination/coalition and returns a data.frame/data.table/tibble with the properties described above.
-Using a function allows the hyperparameter values to change based on the size of the combination. See the regression
+for the current coalition and returns a data.frame/data.table/tibble with the properties described above.
+Using a function allows the hyperparameter values to change based on the size of the coalition. See the regression
vignette for several examples.
Note, to make it easier to call \code{explain()} from Python, the \code{regression.tune_values} can also be a string
containing an R function. For example,
@@ -218,13 +269,17 @@ Note, to make it easier to call \code{explain()} from Python, the \code{regressi
containing an R function. For example,
\code{"function(recipe) return(recipes::step_ns(recipe, recipes::all_numeric_predictors(), deg_free = 2))"} is also
a valid input. It is essential to include the package prefix if the package is not loaded.}
- \item{\code{regression.surrogate_n_comb}}{Integer (default is \code{internal$parameters$used_n_combinations}) specifying the
-number of unique combinations/coalitions to apply to each training observation. Maximum allowed value is
-"\code{internal$parameters$used_n_combinations} - 2". By default, we use all coalitions, but this can take a lot of memory
-in larger dimensions. Note that by "all", we mean all coalitions chosen by \code{shapr} to be used. This will be all
-\eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if \code{shapr} is in the exact mode. If the
-user sets a lower value than \code{internal$parameters$used_n_combinations}, then we sample this amount of unique
-coalitions separately for each training observations. That is, on average, all coalitions should be equally trained.}
+ \item{\code{regression.surrogate_n_comb}}{Integer.
+(default is \code{internal$iter_list[[length(internal$iter_list)]]$n_coalitions}) specifying the
+number of unique coalitions to apply to each training observation. Maximum allowed value is
+"\code{internal$iter_list[[length(internal$iter_list)]]$n_coalitions} - 2".
+By default, we use all coalitions, but this can take a lot of memory in larger dimensions.
+Note that by "all", we mean all coalitions chosen by \code{shapr} to be used.
+This will be all \eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if \code{shapr} is in
+the exact mode.
+If the user sets a lower value than \code{internal$iter_list[[length(internal$iter_list)]]$n_coalitions},
+then we sample this amount of unique coalitions separately for each training observation.
+That is, on average, all coalitions should be equally trained.}
\item{\code{timeseries.fixed_sigma_vec}}{Numeric. (Default = 2)
Represents the kernel bandwidth in the distance computation. TODO: What length should it have? 1?}
\item{\code{timeseries.bounds}}{Numeric vector of length two. (Default = c(NULL, NULL))
@@ -236,58 +291,45 @@ This is useful if the underlying time series are scaled between 0 and 1, for exa
\value{
Object of class \code{c("shapr", "list")}. Contains the following items:
\describe{
-\item{shapley_values}{data.table with the estimated Shapley values}
-\item{internal}{List with the different parameters, data and functions used internally}
+\item{shapley_values}{data.table with the estimated Shapley values, with the explained observations in the rows and
+the features along the columns.
+The column \code{none} is the part of the prediction not attributed to any of the features (given by the argument \code{prediction_zero}).}
+\item{shapley_values_sd}{data.table with the standard deviations of the Shapley values, reflecting the uncertainty.
+Note that these only reflect the coalition sampling part of the kernelSHAP procedure, and are therefore by
+definition 0 when all coalitions are used.
+Only present when \code{adaptive = TRUE} and \code{adaptive_arguments$compute_sd = TRUE}.}
+\item{internal}{List with the different parameters, data, functions and other output used internally.}
\item{pred_explain}{Numeric vector with the predictions for the explained observations}
-\item{MSEv}{List with the values of the MSEv evaluation criterion for the approach.}
+\item{MSEv}{List with the values of the MSEv evaluation criterion for the approach. See the
+\href{https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html#msev-evaluation-criterion}{MSEv evaluation section in the vignette} for details.}
+\item{timing}{List containing timing information for the different parts of the computation.
+\code{init_time} and \code{end_time} give the time stamps for the start and end of the computation.
+\code{total_time_secs} gives the total time in seconds for the complete execution of \code{explain()}.
+\code{main_timing_secs} gives the time in seconds for the main computations.
+\code{iter_timing_secs} gives, for each iteration of the adaptive estimation, the time spent on the different parts of
+the adaptive estimation routine.}
}
-
-\code{shapley_values} is a data.table where the number of rows equals
-the number of observations you'd like to explain, and the number of columns equals \code{m +1},
-where \code{m} equals the total number of features in your model.
-
-If \code{shapley_values[i, j + 1] > 0} it indicates that the j-th feature increased the prediction for
-the i-th observation. Likewise, if \code{shapley_values[i, j + 1] < 0} it indicates that the j-th feature
-decreased the prediction for the i-th observation.
-The magnitude of the value is also important to notice. E.g. if \code{shapley_values[i, k + 1]} and
-\code{shapley_values[i, j + 1]} are greater than \code{0}, where \code{j != k}, and
-\code{shapley_values[i, k + 1]} > \code{shapley_values[i, j + 1]} this indicates that feature
-\code{j} and \code{k} both increased the value of the prediction, but that the effect of the k-th
-feature was larger than the j-th feature.
-
-The first column in \code{dt}, called \code{none}, is the prediction value not assigned to any of the features
-(\ifelse{html}{\eqn{\phi}\out{0}}{\eqn{\phi_0}}).
-It's equal for all observations and set by the user through the argument \code{prediction_zero}.
-The difference between the prediction and \code{none} is distributed among the other features.
-In theory this value should be the expected prediction without conditioning on any features.
-Typically we set this value equal to the mean of the response variable in our training data, but other choices
-such as the mean of the predictions in the training data are also reasonable.
}
\description{
Computes dependence-aware Shapley values for observations in \code{x_explain} from the specified
\code{model} by using the method specified in \code{approach} to estimate the conditional expectation.
}
\details{
-The most important thing to notice is that \code{shapr} has implemented eight different
-Monte Carlo-based approaches for estimating the conditional distributions of the data, namely \code{"empirical"},
-\code{"gaussian"}, \code{"copula"}, \code{"ctree"}, \code{"vaeac"}, \code{"categorical"}, \code{"timeseries"}, and \code{"independence"}.
-\code{shapr} has also implemented two regression-based approaches \code{"regression_separate"} and \code{"regression_surrogate"},
-and see the separate vignette on the regression-based approaches for more information.
-In addition, the user also has the option of combining the different Monte Carlo-based approaches.
-E.g., if you're in a situation where you have trained a model that consists of 10 features,
-and you'd like to use the \code{"gaussian"} approach when you condition on a single feature,
-the \code{"empirical"} approach if you condition on 2-5 features, and \code{"copula"} version
-if you condition on more than 5 features this can be done by simply passing
-\code{approach = c("gaussian", rep("empirical", 4), rep("copula", 4))}. If
-\code{"approach[i]" = "gaussian"} means that you'd like to use the \code{"gaussian"} approach
-when conditioning on \code{i} features. Conditioning on all features needs no approach as that is given
-by the complete prediction itself, and should thus not be part of the vector.
-
-For \code{approach="ctree"}, \code{n_samples} corresponds to the number of samples
-from the leaf node (see an exception related to the \code{sample} argument).
-For \code{approach="empirical"}, \code{n_samples} is the \eqn{K} parameter in equations (14-15) of
-Aas et al. (2021), i.e. the maximum number of observations (with largest weights) that is used, see also the
-\code{empirical.eta} argument.
+The \code{shapr} package implements kernelSHAP estimation of dependence-aware Shapley values with
+eight different Monte Carlo-based approaches for estimating the conditional distributions of the data, namely
+\code{"empirical"}, \code{"gaussian"}, \code{"copula"}, \code{"ctree"}, \code{"vaeac"}, \code{"categorical"}, \code{"timeseries"}, and \code{"independence"}.
+\code{shapr} has also implemented two regression-based approaches \code{"regression_separate"} and \code{"regression_surrogate"}.
+It is also possible to combine the different approaches, see the vignettes for more information.
+
+The package allows for parallelized computation with progress updates through the tightly connected
+\link[future:future]{future::future} and \link[progressr:progressr]{progressr::progressr} packages. See the examples below.
+For adaptive estimation (\code{adaptive = TRUE}), intermediate results may also be printed to the console
+(according to the \code{verbose} argument), and they are also written to disk.
+This, combined with batch computation of the v(S) values, enables fast and accurate estimation of the Shapley values
+in a memory friendly manner.
}
\examples{
@@ -311,6 +353,18 @@ model <- lm(lm_formula, data = data_train)
# Explain predictions
p <- mean(data_train[, y_var])
+\dontrun{
+# (Optionally) enable parallelization via the future package
+if (requireNamespace("future", quietly = TRUE)) {
+ future::plan("multisession", workers = 2)
+}
+}
+
+# (Optionally) enable progress updates within every iteration via the progressr package
+if (requireNamespace("progressr", quietly = TRUE)) {
+ progressr::handlers(global = TRUE)
+}
+
# Empirical approach
explain1 <- explain(
model = model,
@@ -318,7 +372,7 @@ explain1 <- explain(
x_train = x_train,
approach = "empirical",
prediction_zero = p,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
# Gaussian approach
@@ -328,7 +382,7 @@ explain2 <- explain(
x_train = x_train,
approach = "gaussian",
prediction_zero = p,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
# Gaussian copula approach
@@ -338,7 +392,7 @@ explain3 <- explain(
x_train = x_train,
approach = "copula",
prediction_zero = p,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
# ctree approach
@@ -348,7 +402,7 @@ explain4 <- explain(
x_train = x_train,
approach = "ctree",
prediction_zero = p,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
# Combined approach
@@ -359,7 +413,7 @@ explain5 <- explain(
x_train = x_train,
approach = approach,
prediction_zero = p,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
# Print the Shapley values
@@ -381,7 +435,7 @@ explain_groups <- explain(
group = group_list,
approach = "empirical",
prediction_zero = p,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
print(explain_groups$shapley_values)
@@ -409,6 +463,21 @@ explain_surrogate_lm <- explain(
regression.model = parsnip::linear_reg()
)
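+
+## Shapley value reweighting
+# A sketch of the shapley_reweighting argument documented above; the
+# max_n_coalitions limit forces coalition sampling, which is what the
+# reweighting applies to
+explain_reweight <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  approach = "gaussian",
+  prediction_zero = p,
+  n_MC_samples = 1e2,
+  max_n_coalitions = 10,
+  shapley_reweighting = "on_N"
+)
+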
+## Adaptive estimation
+# For illustration purposes only. By default not used for such small dimensions as here
+
+# Gaussian approach
+explain_adaptive <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ approach = "gaussian",
+ prediction_zero = p,
+ n_MC_samples = 1e2,
+ adaptive = TRUE,
+ adaptive_arguments = list(initial_n_coalitions = 10)
+)
+
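+# Continue the adaptive estimation from a previous shapr object to obtain
+# higher accuracy (a sketch of the prev_shapr_object argument documented above;
+# verbose = c("basic", "convergence") additionally prints convergence info)
+explain_adaptive_cont <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  approach = "gaussian",
+  prediction_zero = p,
+  n_MC_samples = 1e2,
+  adaptive = TRUE,
+  verbose = c("basic", "convergence"),
+  prev_shapr_object = explain_adaptive
+)
+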
}
\references{
 Aas, K., Jullum, M., & Løland, A. (2021). Explaining individual predictions when features are dependent:
diff --git a/man/explain_forecast.Rd b/man/explain_forecast.Rd
index 91565d96d..45eca933b 100644
--- a/man/explain_forecast.Rd
+++ b/man/explain_forecast.Rd
@@ -15,17 +15,15 @@ explain_forecast(
horizon,
approach,
prediction_zero,
- n_combinations = NULL,
+ max_n_coalitions = NULL,
group_lags = TRUE,
group = NULL,
- n_samples = 1000,
- n_batches = NULL,
+ n_MC_samples = 1000,
seed = 1,
keep_samp_for_vS = FALSE,
predict_model = NULL,
get_model_specs = NULL,
- timing = TRUE,
- verbose = 0,
+ verbose = "basic",
...
)
}
@@ -76,11 +74,13 @@ features.
Typically we set this value equal to the mean of the response variable in our training data, but other choices
such as the mean of the predictions in the training data are also reasonable.}
-\item{n_combinations}{Integer.
-If \code{group = NULL}, \code{n_combinations} represents the number of unique feature combinations to sample.
-If \code{group != NULL}, \code{n_combinations} represents the number of unique group combinations to sample.
-If \code{n_combinations = NULL}, the exact method is used and all combinations are considered.
-The maximum number of combinations equals \code{2^m}, where \code{m} is the number of features.}
+\item{max_n_coalitions}{Integer.
+The upper limit on the number of unique feature/group coalitions to use in the adaptive procedure
+(if \code{adaptive = TRUE}).
+If \code{adaptive = FALSE} it represents the number of feature/group coalitions to use directly.
+The quantity refers to the number of unique feature coalitions if \code{group = NULL},
+and group coalitions if \code{group != NULL}.
+\code{max_n_coalitions = NULL} corresponds to \code{max_n_coalitions = 2^n_features}.}
\item{group_lags}{Logical.
If \code{TRUE} all lags of each variable are grouped together and explained as a group.
@@ -92,39 +92,34 @@ If provided, group wise Shapley values are computed. \code{group} then has lengt
the number of groups. The list element contains character vectors with the features included
in each of the different groups.}
-\item{n_samples}{Positive integer.
-Indicating the maximum number of samples to use in the
-Monte Carlo integration for every conditional expectation. See also details.}
-
-\item{n_batches}{Positive integer (or NULL).
-Specifies how many batches the total number of feature combinations should be split into when calculating the
-contribution function for each test observation.
-The default value is NULL which uses a reasonable trade-off between RAM allocation and computation speed,
-which depends on \code{approach} and \code{n_combinations}.
-For models with many features, increasing the number of batches reduces the RAM allocation significantly.
-This typically comes with a small increase in computation time.}
+\item{n_MC_samples}{Positive integer.
+Indicating the maximum number of samples to use in the Monte Carlo integration for every conditional expectation.
+For \code{approach="ctree"}, \code{n_MC_samples} corresponds to the number of samples
+from the leaf node (see an exception related to the \code{ctree.sample} argument in \code{\link[=setup_approach.ctree]{setup_approach.ctree()}}).
+For \code{approach="empirical"}, \code{n_MC_samples} is the \eqn{K} parameter in equations (14-15) of
+Aas et al. (2021), i.e. the maximum number of observations (with largest weights) that is used, see also the
+\code{empirical.eta} argument in \code{\link[=setup_approach.empirical]{setup_approach.empirical()}}.}
\item{seed}{Positive integer.
Specifies the seed before any randomness based code is being run.
-If \code{NULL} the seed will be inherited from the calling environment.}
+If \code{NULL}, no seed is set in the calling environment.}
\item{keep_samp_for_vS}{Logical.
-Indicates whether the samples used in the Monte Carlo estimation of v_S should be returned
-(in \code{internal$output})}
+Indicates whether the samples used in the Monte Carlo estimation of v_S should be returned (in \code{internal$output}).
+Not used for \code{approach="regression_separate"} or \code{approach="regression_surrogate"}.}
\item{predict_model}{Function.
The prediction function used when \code{model} is not natively supported.
-(Run \code{\link[=get_supported_models]{get_supported_models()}} for a list of natively supported
-models.)
+(Run \code{\link[=get_supported_models]{get_supported_models()}} for a list of natively supported models.)
The function must have two arguments, \code{model} and \code{newdata} which specify, respectively, the model
-and a data.frame/data.table to compute predictions for. The function must give the prediction as a numeric vector.
+and a data.frame/data.table to compute predictions for.
+The function must give the prediction as a numeric vector.
\code{NULL} (the default) uses functions specified internally.
Can also be used to override the default function for natively supported model classes.}
\item{get_model_specs}{Function.
An optional function for checking model/data consistency when \code{model} is not natively supported.
-(Run \code{\link[=get_supported_models]{get_supported_models()}} for a list of natively supported
-models.)
+(Run \code{\link[=get_supported_models]{get_supported_models()}} for a list of natively supported models.)
The function takes \code{model} as argument and provides a list with 3 elements:
\describe{
\item{labels}{Character vector with the names of each feature.}
@@ -135,13 +130,22 @@ If \code{NULL} (the default) internal functions are used for natively supported
disabled for unsupported model classes.
Can also be used to override the default function for natively supported model classes.}
-\item{timing}{Logical.
-Whether the timing of the different parts of the \code{explain()} should saved in the model object.}
-
-\item{verbose}{An integer specifying the level of verbosity. If \code{0}, \code{shapr} will stay silent.
-If \code{1}, it will print information about performance. If \code{2}, some additional information will be printed out.
-Use \code{0} (default) for no verbosity, \code{1} for low verbose, and \code{2} for high verbose.
-TODO: Make this clearer when we end up fixing this and if they should force a progressr bar.}
+\item{verbose}{String vector or NULL.
+Specifies the verbosity (printout detail level) through one or more of the strings \code{"basic"}, \code{"progress"},
+\code{"convergence"}, \code{"shapley"} and \code{"vS_details"}.
+\code{"basic"} (default) displays basic information about the computation which is being performed.
+\code{"progress"} displays information about where in the calculation process the function currently is.
+\code{"convergence"} displays information on how close to convergence the Shapley value estimates are
+(only when \code{adaptive = TRUE}).
+\code{"shapley"} displays intermediate Shapley value estimates and standard deviations (only when \code{adaptive = TRUE}),
+as well as the final estimates.
+\code{"vS_details"} displays information about the v_S estimates.
+This is most relevant for \code{approach \%in\% c("regression_separate", "regression_surrogate", "vaeac")}.
+\code{NULL} means no printout.
+Note that any combination of the five strings can be used.
+E.g. \code{verbose = c("basic", "vS_details")} will display basic information plus details about the v(S) estimation process.}
\item{...}{
Arguments passed on to \code{\link[=setup_approach.empirical]{setup_approach.empirical}}, \code{\link[=setup_approach.independence]{setup_approach.independence}}, \code{\link[=setup_approach.gaussian]{setup_approach.gaussian}}, \code{\link[=setup_approach.copula]{setup_approach.copula}}, \code{\link[=setup_approach.ctree]{setup_approach.ctree}}, \code{\link[=setup_approach.vaeac]{setup_approach.vaeac}}, \code{\link[=setup_approach.categorical]{setup_approach.categorical}}, \code{\link[=setup_approach.timeseries]{setup_approach.timeseries}}
@@ -156,7 +160,7 @@ If e.g. \code{eta = .8} we will choose the \code{K} samples with the largest wei
accounts for 80\\% of the total weight.
\code{eta} is the \eqn{\eta} parameter in equation (15) of Aas et al (2021).}
\item{\code{empirical.fixed_sigma}}{Positive numeric scalar. (default = 0.1)
-Represents the kernel bandwidth in the distance computation used when conditioning on all different combinations.
+Represents the kernel bandwidth in the distance computation used when conditioning on all different coalitions.
Only used when \code{empirical.type = "fixed_sigma"}}
\item{\code{empirical.n_samples_aicc}}{Positive integer. (default = 1000)
Number of samples to consider in AICc optimization.
@@ -170,7 +174,8 @@ Only used for \code{empirical.type} is either \code{"AICc_each_k"} or \code{"AIC
\item{\code{empirical.cov_mat}}{Numeric matrix. (Optional, default = NULL)
Containing the covariance matrix of the data generating distribution used to define the Mahalanobis distance.
\code{NULL} means it is estimated from \code{x_train}.}
- \item{\code{internal}}{Not used.}
+ \item{\code{internal}}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
\item{\code{gaussian.mu}}{Numeric vector. (Optional)
Containing the mean of the data generating distribution.
\code{NULL} means it is estimated from the \code{x_train}.}
@@ -186,13 +191,13 @@ Determines minimum value that the sum of the left and right daughter nodes requi
\item{\code{ctree.minbucket}}{Numeric scalar. (default = 7)
Determines the minimum sum of weights in a terminal node required for a split}
\item{\code{ctree.sample}}{Boolean. (default = TRUE)
-If TRUE, then the method always samples \code{n_samples} observations from the leaf nodes (with replacement).
-If FALSE and the number of observations in the leaf node is less than \code{n_samples},
+If TRUE, then the method always samples \code{n_MC_samples} observations from the leaf nodes (with replacement).
+If FALSE and the number of observations in the leaf node is less than \code{n_MC_samples},
the method will take all observations in the leaf.
-If FALSE and the number of observations in the leaf node is more than \code{n_samples},
-the method will sample \code{n_samples} observations (with replacement).
+If FALSE and the number of observations in the leaf node is more than \code{n_MC_samples},
+the method will sample \code{n_MC_samples} observations (with replacement).
This means that there will always be sampling in the leaf unless
-\code{sample} = FALSE AND the number of obs in the node is less than \code{n_samples}.}
+\code{sample} = FALSE AND the number of obs in the node is less than \code{n_MC_samples}.}
\item{\code{vaeac.depth}}{Positive integer (default is \code{3}). The number of hidden layers
in the neural networks of the masked encoder, full encoder, and decoder.}
\item{\code{vaeac.width}}{Positive integer (default is \code{32}). The number of neurons in each
@@ -228,32 +233,25 @@ This is useful if the underlying time series are scaled between 0 and 1, for exa
\value{
Object of class \code{c("shapr", "list")}. Contains the following items:
\describe{
-\item{shapley_values}{data.table with the estimated Shapley values}
-\item{internal}{List with the different parameters, data and functions used internally}
+\item{shapley_values}{data.table with the estimated Shapley values, with the explained observations in the rows and
+the features along the columns.
+The column \code{none} is the part of the prediction not attributed to any of the features (given by the argument \code{prediction_zero}).}
+\item{shapley_values_sd}{data.table with the standard deviations of the Shapley values, reflecting the uncertainty.
+Note that these only reflect the coalition sampling part of the kernelSHAP procedure, and are therefore by
+definition 0 when all coalitions are used.
+Only present when \code{adaptive = TRUE} and \code{adaptive_arguments$compute_sd = TRUE}.}
+\item{internal}{List with the different parameters, data, functions and other output used internally.}
\item{pred_explain}{Numeric vector with the predictions for the explained observations}
-\item{MSEv}{List with the values of the MSEv evaluation criterion for the approach.}
+\item{MSEv}{List with the values of the MSEv evaluation criterion for the approach. See the
+\href{https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html#msev-evaluation-criterion}{MSEv evaluation section in the vignette} for details.}
+\item{timing}{List containing timing information for the different parts of the computation.
+\code{init_time} and \code{end_time} give the time stamps for the start and end of the computation.
+\code{total_time_secs} gives the total time in seconds for the complete execution of \code{explain()}.
+\code{main_timing_secs} gives the time in seconds for the main computations.
+\code{iter_timing_secs} gives, for each iteration of the adaptive estimation, the time spent on the different parts of
+the adaptive estimation routine.}
}
-
-\code{shapley_values} is a data.table where the number of rows equals
-the number of observations you'd like to explain, and the number of columns equals \code{m +1},
-where \code{m} equals the total number of features in your model.
-
-If \code{shapley_values[i, j + 1] > 0} it indicates that the j-th feature increased the prediction for
-the i-th observation. Likewise, if \code{shapley_values[i, j + 1] < 0} it indicates that the j-th feature
-decreased the prediction for the i-th observation.
-The magnitude of the value is also important to notice. E.g. if \code{shapley_values[i, k + 1]} and
-\code{shapley_values[i, j + 1]} are greater than \code{0}, where \code{j != k}, and
-\code{shapley_values[i, k + 1]} > \code{shapley_values[i, j + 1]} this indicates that feature
-\code{j} and \code{k} both increased the value of the prediction, but that the effect of the k-th
-feature was larger than the j-th feature.
-
-The first column in \code{dt}, called \code{none}, is the prediction value not assigned to any of the features
-(\ifelse{html}{\eqn{\phi}\out{0}}{\eqn{\phi_0}}).
-It's equal for all observations and set by the user through the argument \code{prediction_zero}.
-The difference between the prediction and \code{none} is distributed among the other features.
-In theory this value should be the expected prediction without conditioning on any features.
-Typically we set this value equal to the mean of the response variable in our training data, but other choices
-such as the mean of the predictions in the training data are also reasonable.
}
\description{
Computes dependence-aware Shapley values for observations in \code{explain_idx} from the specified
diff --git a/man/explain_tripledot_docs.Rd b/man/explain_tripledot_docs.Rd
index a739b97b5..ef2409009 100644
--- a/man/explain_tripledot_docs.Rd
+++ b/man/explain_tripledot_docs.Rd
@@ -20,7 +20,7 @@ If e.g. \code{eta = .8} we will choose the \code{K} samples with the largest wei
accounts for 80\\% of the total weight.
\code{eta} is the \eqn{\eta} parameter in equation (15) of Aas et al (2021).}
\item{\code{empirical.fixed_sigma}}{Positive numeric scalar. (default = 0.1)
-Represents the kernel bandwidth in the distance computation used when conditioning on all different combinations.
+Represents the kernel bandwidth in the distance computation used when conditioning on all different coalitions.
Only used when \code{empirical.type = "fixed_sigma"}}
\item{\code{empirical.n_samples_aicc}}{Positive integer. (default = 1000)
Number of samples to consider in AICc optimization.
@@ -52,13 +52,13 @@ Determines minimum value that the sum of the left and right daughter nodes requi
\item{\code{ctree.minbucket}}{Numeric scalar. (default = 7)
Determines the minimum sum of weights in a terminal node required for a split}
\item{\code{ctree.sample}}{Boolean. (default = TRUE)
-If TRUE, then the method always samples \code{n_samples} observations from the leaf nodes (with replacement).
-If FALSE and the number of observations in the leaf node is less than \code{n_samples},
+If TRUE, then the method always samples \code{n_MC_samples} observations from the leaf nodes (with replacement).
+If FALSE and the number of observations in the leaf node is less than \code{n_MC_samples},
the method will take all observations in the leaf.
-If FALSE and the number of observations in the leaf node is more than \code{n_samples},
-the method will sample \code{n_samples} observations (with replacement).
+If FALSE and the number of observations in the leaf node is more than \code{n_MC_samples},
+the method will sample \code{n_MC_samples} observations (with replacement).
This means that there will always be sampling in the leaf unless
-\code{sample} = FALSE AND the number of obs in the node is less than \code{n_samples}.}
+\code{sample} = FALSE AND the number of obs in the node is less than \code{n_MC_samples}.}
\item{\code{gaussian.mu}}{Numeric vector. (Optional)
Containing the mean of the data generating distribution.
\code{NULL} means it is estimated from the \code{x_train}.}
@@ -75,8 +75,8 @@ is also a valid input. It is essential to include the package prefix if the pack
The data.frame must contain the possible hyperparameter value combinations to try.
The column names must match the names of the tuneable parameters specified in \code{regression.model}.
If \code{regression.tune_values} is a function, then it should take one argument \code{x} which is the training data
-for the current combination/coalition and returns a data.frame/data.table/tibble with the properties described above.
-Using a function allows the hyperparameter values to change based on the size of the combination. See the regression
+for the current coalition and returns a data.frame/data.table/tibble with the properties described above.
+Using a function allows the hyperparameter values to change based on the size of the coalition. See the regression
vignette for several examples.
Note, to make it easier to call \code{explain()} from Python, the \code{regression.tune_values} can also be a string
containing an R function. For example,
@@ -92,13 +92,17 @@ Note, to make it easier to call \code{explain()} from Python, the \code{regressi
containing an R function. For example,
\code{"function(recipe) return(recipes::step_ns(recipe, recipes::all_numeric_predictors(), deg_free = 2))"} is also
a valid input. It is essential to include the package prefix if the package is not loaded.}
- \item{\code{regression.surrogate_n_comb}}{Integer (default is \code{internal$parameters$used_n_combinations}) specifying the
-number of unique combinations/coalitions to apply to each training observation. Maximum allowed value is
-"\code{internal$parameters$used_n_combinations} - 2". By default, we use all coalitions, but this can take a lot of memory
-in larger dimensions. Note that by "all", we mean all coalitions chosen by \code{shapr} to be used. This will be all
-\eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if \code{shapr} is in the exact mode. If the
-user sets a lower value than \code{internal$parameters$used_n_combinations}, then we sample this amount of unique
-coalitions separately for each training observations. That is, on average, all coalitions should be equally trained.}
+ \item{\code{regression.surrogate_n_comb}}{Integer.
+(default is \code{internal$iter_list[[length(internal$iter_list)]]$n_coalitions}) specifying the
+number of unique coalitions to apply to each training observation. Maximum allowed value is
+"\code{internal$iter_list[[length(internal$iter_list)]]$n_coalitions} - 2".
+By default, we use all coalitions, but this can take a lot of memory in larger dimensions.
+Note that by "all", we mean all coalitions chosen by \code{shapr} to be used.
+This will be all \eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if \code{shapr} is in
+the exact mode.
+If the user sets a lower value than \code{internal$iter_list[[length(internal$iter_list)]]$n_coalitions},
+then we sample this amount of unique coalitions separately for each training observation.
+That is, on average, all coalitions should be equally trained.}
\item{\code{timeseries.fixed_sigma_vec}}{Numeric. (Default = 2)
Represents the kernel bandwidth in the distance computation. TODO: What length should it have? 1?}
\item{\code{timeseries.bounds}}{Numeric vector of length two. (Default = c(NULL, NULL))
@@ -125,7 +129,7 @@ This includes \code{vaeac.extra_parameters$epochs_initiation_phase}, where the d
\description{
This helper function displays the specific arguments applicable to the different
approaches. Note that when calling \code{\link[=explain]{explain()}} from Python, the parameters
-are renamed from the form \code{approach.parameter_name} to \code{approach_parameter_name}.
+are renamed from the form \code{approach.parameter_name} to \code{approach_parameter_name}.
That is, an underscore has replaced the dot as the dot is reserved in Python.
}
\author{
diff --git a/man/feature_combinations.Rd b/man/feature_combinations.Rd
deleted file mode 100644
index f6b6c4220..000000000
--- a/man/feature_combinations.Rd
+++ /dev/null
@@ -1,58 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/setup_computation.R
-\name{feature_combinations}
-\alias{feature_combinations}
-\title{Define feature combinations, and fetch additional information about each unique combination}
-\usage{
-feature_combinations(
- m,
- exact = TRUE,
- n_combinations = 200,
- weight_zero_m = 10^6,
- group_num = NULL
-)
-}
-\arguments{
-\item{m}{Positive integer. Total number of features.}
-
-\item{exact}{Logical. If \code{TRUE} all \code{2^m} combinations are generated, otherwise a
-subsample of the combinations is used.}
-
-\item{n_combinations}{Positive integer. Note that if \code{exact = TRUE},
-\code{n_combinations} is ignored. However, if \code{m > 12} you'll need to add a positive integer
-value for \code{n_combinations}.}
-
-\item{weight_zero_m}{Numeric. The value to use as a replacement for infinite combination
-weights when doing numerical operations.}
-
-\item{group_num}{List. Contains vector of integers indicating the feature numbers for the
-different groups.}
-}
-\value{
-A data.table that contains the following columns:
-\describe{
-\item{id_combination}{Positive integer. Represents a unique key for each combination. Note that the table
-is sorted by \code{id_combination}, so that is always equal to \code{x[["id_combination"]] = 1:nrow(x)}.}
-\item{features}{List. Each item of the list is an integer vector where \code{features[[i]]}
-represents the indices of the features included in combination \code{i}. Note that all the items
-are sorted such that \code{features[[i]] == sort(features[[i]])} is always true.}
-\item{n_features}{Vector of positive integers. \code{n_features[i]} equals the number of features in combination
-\code{i}, i.e. \code{n_features[i] = length(features[[i]])}.}.
-\item{N}{Positive integer. The number of unique ways to sample \code{n_features[i]} features
-from \code{m} different features, without replacement.}
-}
-}
-\description{
-Define feature combinations, and fetch additional information about each unique combination
-}
-\examples{
-# All combinations
-x <- feature_combinations(m = 3)
-nrow(x) # Equals 2^3 = 8
-
-# Subsample of combinations
-x <- feature_combinations(exact = FALSE, m = 10, n_combinations = 1e2)
-}
-\author{
-Nikolai Sellereite, Martin Jullum
-}
diff --git a/man/feature_group.Rd b/man/feature_group.Rd
deleted file mode 100644
index ce6775245..000000000
--- a/man/feature_group.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/setup_computation.R
-\name{feature_group}
-\alias{feature_group}
-\title{Analogue to feature_exact, but for groups instead.}
-\usage{
-feature_group(group_num, weight_zero_m = 10^6)
-}
-\arguments{
-\item{group_num}{List. Contains vector of integers indicating the feature numbers for the
-different groups.}
-
-\item{weight_zero_m}{Positive integer. Represents the Shapley weight for two special
-cases, i.e. the case where you have either \code{0} or \code{m} features/feature groups.}
-}
-\value{
-data.table with all feature group combinations, shapley weights etc.
-}
-\description{
-Analogue to feature_exact, but for groups instead.
-}
-\keyword{internal}
diff --git a/man/feature_group_not_exact.Rd b/man/feature_group_not_exact.Rd
deleted file mode 100644
index da4d90d66..000000000
--- a/man/feature_group_not_exact.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/setup_computation.R
-\name{feature_group_not_exact}
-\alias{feature_group_not_exact}
-\title{Analogue to feature_not_exact, but for groups instead.}
-\usage{
-feature_group_not_exact(group_num, n_combinations = 200, weight_zero_m = 10^6)
-}
-\arguments{
-\item{group_num}{List. Contains vector of integers indicating the feature numbers for the
-different groups.}
-
-\item{weight_zero_m}{Positive integer. Represents the Shapley weight for two special
-cases, i.e. the case where you have either \code{0} or \code{m} features/feature groups.}
-}
-\value{
-data.table with all feature group combinations, shapley weights etc.
-}
-\description{
-Analogue to feature_not_exact, but for groups instead.
-}
-\keyword{internal}
diff --git a/man/feature_matrix_cpp.Rd b/man/feature_matrix_cpp.Rd
deleted file mode 100644
index 8282cf1f2..000000000
--- a/man/feature_matrix_cpp.Rd
+++ /dev/null
@@ -1,23 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/RcppExports.R
-\name{feature_matrix_cpp}
-\alias{feature_matrix_cpp}
-\title{Get feature matrix}
-\usage{
-feature_matrix_cpp(features, m)
-}
-\arguments{
-\item{features}{List}
-
-\item{m}{Positive integer. Total number of features}
-}
-\value{
-Matrix
-}
-\description{
-Get feature matrix
-}
-\author{
-Nikolai Sellereite
-}
-\keyword{internal}
diff --git a/man/finalize_explanation.Rd b/man/finalize_explanation.Rd
index ee74c8903..cb92dcfdd 100644
--- a/man/finalize_explanation.Rd
+++ b/man/finalize_explanation.Rd
@@ -2,199 +2,14 @@
% Please edit documentation in R/finalize_explanation.R
\name{finalize_explanation}
\alias{finalize_explanation}
-\title{Computes the Shapley values given \code{v(S)}}
+\title{Gathers the final output to create the explanation object}
\usage{
-finalize_explanation(vS_list, internal)
+finalize_explanation(internal)
}
\arguments{
-\item{vS_list}{List
-Output from \code{\link[=compute_vS]{compute_vS()}}}
-
\item{internal}{List.
-Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
-The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
-}
-\value{
-Object of class \code{c("shapr", "list")}. Contains the following items:
-\describe{
-\item{shapley_values}{data.table with the estimated Shapley values}
-\item{internal}{List with the different parameters, data and functions used internally}
-\item{pred_explain}{Numeric vector with the predictions for the explained observations}
-\item{MSEv}{List with the values of the MSEv evaluation criterion for the approach.}
-}
-
-\code{shapley_values} is a data.table where the number of rows equals
-the number of observations you'd like to explain, and the number of columns equals \code{m +1},
-where \code{m} equals the total number of features in your model.
-
-If \code{shapley_values[i, j + 1] > 0} it indicates that the j-th feature increased the prediction for
-the i-th observation. Likewise, if \code{shapley_values[i, j + 1] < 0} it indicates that the j-th feature
-decreased the prediction for the i-th observation.
-The magnitude of the value is also important to notice. E.g. if \code{shapley_values[i, k + 1]} and
-\code{shapley_values[i, j + 1]} are greater than \code{0}, where \code{j != k}, and
-\code{shapley_values[i, k + 1]} > \code{shapley_values[i, j + 1]} this indicates that feature
-\code{j} and \code{k} both increased the value of the prediction, but that the effect of the k-th
-feature was larger than the j-th feature.
-
-The first column in \code{dt}, called \code{none}, is the prediction value not assigned to any of the features
-(\ifelse{html}{\eqn{\phi}\out{0}}{\eqn{\phi_0}}).
-It's equal for all observations and set by the user through the argument \code{prediction_zero}.
-The difference between the prediction and \code{none} is distributed among the other features.
-In theory this value should be the expected prediction without conditioning on any features.
-Typically we set this value equal to the mean of the response variable in our training data, but other choices
-such as the mean of the predictions in the training data are also reasonable.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
}
\description{
-Computes dependence-aware Shapley values for observations in \code{x_explain} from the specified
-\code{model} by using the method specified in \code{approach} to estimate the conditional expectation.
-}
-\details{
-The most important thing to notice is that \code{shapr} has implemented eight different
-Monte Carlo-based approaches for estimating the conditional distributions of the data, namely \code{"empirical"},
-\code{"gaussian"}, \code{"copula"}, \code{"ctree"}, \code{"vaeac"}, \code{"categorical"}, \code{"timeseries"}, and \code{"independence"}.
-\code{shapr} has also implemented two regression-based approaches \code{"regression_separate"} and \code{"regression_surrogate"},
-and see the separate vignette on the regression-based approaches for more information.
-In addition, the user also has the option of combining the different Monte Carlo-based approaches.
-E.g., if you're in a situation where you have trained a model that consists of 10 features,
-and you'd like to use the \code{"gaussian"} approach when you condition on a single feature,
-the \code{"empirical"} approach if you condition on 2-5 features, and \code{"copula"} version
-if you condition on more than 5 features this can be done by simply passing
-\code{approach = c("gaussian", rep("empirical", 4), rep("copula", 4))}. If
-\code{"approach[i]" = "gaussian"} means that you'd like to use the \code{"gaussian"} approach
-when conditioning on \code{i} features. Conditioning on all features needs no approach as that is given
-by the complete prediction itself, and should thus not be part of the vector.
-
-For \code{approach="ctree"}, \code{n_samples} corresponds to the number of samples
-from the leaf node (see an exception related to the \code{sample} argument).
-For \code{approach="empirical"}, \code{n_samples} is the \eqn{K} parameter in equations (14-15) of
-Aas et al. (2021), i.e. the maximum number of observations (with largest weights) that is used, see also the
-\code{empirical.eta} argument.
-}
-\examples{
-
-# Load example data
-data("airquality")
-airquality <- airquality[complete.cases(airquality), ]
-x_var <- c("Solar.R", "Wind", "Temp", "Month")
-y_var <- "Ozone"
-
-# Split data into test- and training data
-data_train <- head(airquality, -3)
-data_explain <- tail(airquality, 3)
-
-x_train <- data_train[, x_var]
-x_explain <- data_explain[, x_var]
-
-# Fit a linear model
-lm_formula <- as.formula(paste0(y_var, " ~ ", paste0(x_var, collapse = " + ")))
-model <- lm(lm_formula, data = data_train)
-
-# Explain predictions
-p <- mean(data_train[, y_var])
-
-# Empirical approach
-explain1 <- explain(
- model = model,
- x_explain = x_explain,
- x_train = x_train,
- approach = "empirical",
- prediction_zero = p,
- n_samples = 1e2
-)
-
-# Gaussian approach
-explain2 <- explain(
- model = model,
- x_explain = x_explain,
- x_train = x_train,
- approach = "gaussian",
- prediction_zero = p,
- n_samples = 1e2
-)
-
-# Gaussian copula approach
-explain3 <- explain(
- model = model,
- x_explain = x_explain,
- x_train = x_train,
- approach = "copula",
- prediction_zero = p,
- n_samples = 1e2
-)
-
-# ctree approach
-explain4 <- explain(
- model = model,
- x_explain = x_explain,
- x_train = x_train,
- approach = "ctree",
- prediction_zero = p,
- n_samples = 1e2
-)
-
-# Combined approach
-approach <- c("gaussian", "gaussian", "empirical")
-explain5 <- explain(
- model = model,
- x_explain = x_explain,
- x_train = x_train,
- approach = approach,
- prediction_zero = p,
- n_samples = 1e2
-)
-
-# Print the Shapley values
-print(explain1$shapley_values)
-
-# Plot the results
-if (requireNamespace("ggplot2", quietly = TRUE)) {
- plot(explain1)
- plot(explain1, plot_type = "waterfall")
-}
-
-# Group-wise explanations
-group_list <- list(A = c("Temp", "Month"), B = c("Wind", "Solar.R"))
-
-explain_groups <- explain(
- model = model,
- x_explain = x_explain,
- x_train = x_train,
- group = group_list,
- approach = "empirical",
- prediction_zero = p,
- n_samples = 1e2
-)
-print(explain_groups$shapley_values)
-
-# Separate and surrogate regression approaches with linear regression models.
-# More complex regression models can be used, and we can use CV to
-# tune the hyperparameters of the regression models and preprocess
-# the data before sending it to the model. See the regression vignette
-# (Shapley value explanations using the regression paradigm) for more
-# details about the `regression_separate` and `regression_surrogate` approaches.
-explain_separate_lm <- explain(
- model = model,
- x_explain = x_explain,
- x_train = x_train,
- prediction_zero = p,
- approach = "regression_separate",
- regression.model = parsnip::linear_reg()
-)
-
-explain_surrogate_lm <- explain(
- model = model,
- x_explain = x_explain,
- x_train = x_train,
- prediction_zero = p,
- approach = "regression_surrogate",
- regression.model = parsnip::linear_reg()
-)
-
-}
-\references{
-Aas, K., Jullum, M., & Lland, A. (2021). Explaining individual predictions when features are dependent:
-More accurate approximations to Shapley values. Artificial Intelligence, 298, 103502.
-}
-\author{
-Martin Jullum, Lars Henry Berge Olsen
+Gathers the final output to create the explanation object
}
diff --git a/man/finalize_explanation_forecast.Rd b/man/finalize_explanation_forecast.Rd
new file mode 100644
index 000000000..924899fdd
--- /dev/null
+++ b/man/finalize_explanation_forecast.Rd
@@ -0,0 +1,215 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/finalize_explanation.R
+\name{finalize_explanation_forecast}
+\alias{finalize_explanation_forecast}
+\title{Computes the Shapley values given \code{v(S)}}
+\usage{
+finalize_explanation_forecast(vS_list, internal)
+}
+\arguments{
+\item{vS_list}{List
+Output from \code{\link[=compute_vS]{compute_vS()}}}
+
+\item{internal}{List.
+Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
+The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{iter_list}, \code{timing_list},
+\code{main_timing_list}, \code{output}, \code{iter_timing_list} and \code{iter_results}.}
+}
+\value{
+Object of class \code{c("shapr", "list")}. Contains the following items:
+\describe{
+\item{shapley_values}{data.table with the estimated Shapley values, with the explained observations in the rows and
+the features along the columns.
+The column \code{none} is the part of the prediction not attributed to any of the features (given by the argument
+\code{prediction_zero}).}
+\item{shapley_values_sd}{data.table with the standard deviation of the Shapley values, reflecting their uncertainty.
+Note that this only reflects the coalition sampling part of the kernelSHAP procedure, and is therefore by
+definition 0 when all coalitions are used.
+Only present when \code{adaptive = TRUE} and \code{adaptive_arguments$compute_sd = TRUE}.}
+\item{internal}{List with the different parameters, data, functions and other output used internally.}
+\item{pred_explain}{Numeric vector with the predictions for the explained observations}
+\item{MSEv}{List with the values of the MSEv evaluation criterion for the approach. See the
+\href{https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html#msev-evaluation-criterion
+}{MSEv evaluation section in the vignette for details}.}
+\item{timing}{List containing timing information for the different parts of the computation.
+\code{init_time} and \code{end_time} gives the time stamps for the start and end of the computation.
+\code{total_time_secs} gives the total time in seconds for the complete execution of \code{explain()}.
+\code{main_timing_secs} gives the time in seconds for the main computations.
+\code{iter_timing_secs} gives, for each iteration of the adaptive estimation, the time spent on the different parts
+of the adaptive estimation routine.}
+}
+}
+\description{
+Computes dependence-aware Shapley values for observations in \code{x_explain} from the specified
+\code{model} by using the method specified in \code{approach} to estimate the conditional expectation.
+}
+\details{
+The \code{shapr} package implements kernelSHAP estimation of dependence-aware Shapley values with
+eight different Monte Carlo-based approaches for estimating the conditional distributions of the data, namely
+\code{"empirical"}, \code{"gaussian"}, \code{"copula"}, \code{"ctree"}, \code{"vaeac"}, \code{"categorical"}, \code{"timeseries"}, and \code{"independence"}.
+\code{shapr} has also implemented two regression-based approaches \code{"regression_separate"} and \code{"regression_surrogate"}.
+It is also possible to combine the different approaches, see the vignettes for more information.
+
+The package allows for parallelized computation with progress updates through the tightly connected
+\link[future:future]{future::future} and \link[progressr:progressr]{progressr::progressr} packages. See the examples below.
+For adaptive estimation (\code{adaptive=TRUE}), intermediate results may also be printed to the console
+(according to the \code{verbose} argument), and are in addition written to disk.
+This, combined with batch computation of the v(S) values, enables fast and accurate estimation of the Shapley values
+in a memory-friendly manner.
+}
+\examples{
+
+# Load example data
+data("airquality")
+airquality <- airquality[complete.cases(airquality), ]
+x_var <- c("Solar.R", "Wind", "Temp", "Month")
+y_var <- "Ozone"
+
+# Split data into test- and training data
+data_train <- head(airquality, -3)
+data_explain <- tail(airquality, 3)
+
+x_train <- data_train[, x_var]
+x_explain <- data_explain[, x_var]
+
+# Fit a linear model
+lm_formula <- as.formula(paste0(y_var, " ~ ", paste0(x_var, collapse = " + ")))
+model <- lm(lm_formula, data = data_train)
+
+# Explain predictions
+p <- mean(data_train[, y_var])
+
+\dontrun{
+# (Optionally) enable parallelization via the future package
+if (requireNamespace("future", quietly = TRUE)) {
+ future::plan("multisession", workers = 2)
+}
+}
+
+# (Optionally) enable progress updates within every iteration via the progressr package
+if (requireNamespace("progressr", quietly = TRUE)) {
+ progressr::handlers(global = TRUE)
+}
+
+# Empirical approach
+explain1 <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ approach = "empirical",
+ prediction_zero = p,
+ n_MC_samples = 1e2
+)
+
+# Gaussian approach
+explain2 <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ approach = "gaussian",
+ prediction_zero = p,
+ n_MC_samples = 1e2
+)
+
+# Gaussian copula approach
+explain3 <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ approach = "copula",
+ prediction_zero = p,
+ n_MC_samples = 1e2
+)
+
+# ctree approach
+explain4 <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ approach = "ctree",
+ prediction_zero = p,
+ n_MC_samples = 1e2
+)
+
+# Combined approach
+approach <- c("gaussian", "gaussian", "empirical")
+explain5 <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ approach = approach,
+ prediction_zero = p,
+ n_MC_samples = 1e2
+)
+
+# Print the Shapley values
+print(explain1$shapley_values)
+
+# Plot the results
+if (requireNamespace("ggplot2", quietly = TRUE)) {
+ plot(explain1)
+ plot(explain1, plot_type = "waterfall")
+}
+
+# Group-wise explanations
+group_list <- list(A = c("Temp", "Month"), B = c("Wind", "Solar.R"))
+
+explain_groups <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ group = group_list,
+ approach = "empirical",
+ prediction_zero = p,
+ n_MC_samples = 1e2
+)
+print(explain_groups$shapley_values)
+
+# Separate and surrogate regression approaches with linear regression models.
+# More complex regression models can be used, and we can use CV to
+# tune the hyperparameters of the regression models and preprocess
+# the data before sending it to the model. See the regression vignette
+# (Shapley value explanations using the regression paradigm) for more
+# details about the `regression_separate` and `regression_surrogate` approaches.
+explain_separate_lm <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ prediction_zero = p,
+ approach = "regression_separate",
+ regression.model = parsnip::linear_reg()
+)
+
+explain_surrogate_lm <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ prediction_zero = p,
+ approach = "regression_surrogate",
+ regression.model = parsnip::linear_reg()
+)
+
+## Adaptive estimation
+# For illustration purposes only. By default not used for such small dimensions as here
+
+# Gaussian approach
+explain_adaptive <- explain(
+ model = model,
+ x_explain = x_explain,
+ x_train = x_train,
+ approach = "gaussian",
+ prediction_zero = p,
+ n_MC_samples = 1e2,
+ adaptive = TRUE,
+ adaptive_arguments = list(initial_n_coalitions = 10)
+)
+
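+# A minimal sketch: inspect the output fields documented in the value section
+# above (shapley_values_sd is only present when adaptive = TRUE)
+print(explain_adaptive$shapley_values)
+print(explain_adaptive$shapley_values_sd)
+print(explain_adaptive$timing$total_time_secs)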
+}
+\references{
+Aas, K., Jullum, M., & Lland, A. (2021). Explaining individual predictions when features are dependent:
+More accurate approximations to Shapley values. Artificial Intelligence, 298, 103502.
+}
+\author{
+Martin Jullum, Lars Henry Berge Olsen
+}
diff --git a/man/get_adaptive_arguments_default.Rd b/man/get_adaptive_arguments_default.Rd
new file mode 100644
index 000000000..2b815cadf
--- /dev/null
+++ b/man/get_adaptive_arguments_default.Rd
@@ -0,0 +1,70 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/setup.R
+\name{get_adaptive_arguments_default}
+\alias{get_adaptive_arguments_default}
+\title{Function to specify arguments of the adaptive estimation procedure}
+\usage{
+get_adaptive_arguments_default(
+ internal,
+ initial_n_coalitions = ceiling(min(200, max(5, internal$parameters$n_features,
+ (2^internal$parameters$n_features)/10))),
+ fixed_n_coalitions_per_iter = NULL,
+ max_iter = 20,
+ convergence_tolerance = 0.02,
+ reduction_factor_vec = c(seq(0.1, 1, by = 0.1), rep(1, max_iter - 10)),
+ n_boot_samps = 100,
+ compute_sd = isTRUE(internal$parameters$adaptive),
+ max_batch_size = 10,
+ min_n_batches = 10,
+ saving_path = tempfile("shapr_obj_", fileext = ".rds")
+)
+}
+\arguments{
+\item{internal}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
+
+\item{initial_n_coalitions}{Integer. Number of coalitions to use in the first estimation iteration.}
+
+\item{fixed_n_coalitions_per_iter}{Integer. Number of coalitions to use in each iteration.
+\code{NULL} (default) means it is set based on the estimated number of coalitions needed to reach the set
+convergence threshold.}
+
+\item{max_iter}{Integer. Maximum number of estimation iterations.}
+
+\item{convergence_tolerance}{Numeric. The t variable in the convergence threshold formula on page 6 in the paper
+Covert and Lee (2021), 'Improving KernelSHAP: Practical Shapley Value Estimation via Linear Regression'
+https://arxiv.org/pdf/2012.01536. Smaller values require more coalitions before convergence is reached.}
+
+\item{reduction_factor_vec}{Numeric vector. In each iteration, the number of coalitions needed to reach convergence
+in the next iteration is estimated.
+The number of coalitions actually used in the next iteration is this estimate multiplied by
+\code{reduction_factor_vec[i]} for iteration \code{i}.
+It is wise to start with smaller factors to avoid using too many coalitions due to uncertain estimates in
+the first iterations.}
+
+\item{n_boot_samps}{Integer. The number of bootstrapped samples (i.e. samples with replacement) from the set of all
+coalitions used to estimate the standard deviations of the Shapley value estimates.}
+
+\item{compute_sd}{Logical. Whether to estimate the standard deviations of the Shapley value estimates.}
+
+\item{max_batch_size}{Integer. The maximum number of coalitions to estimate simultaneously within each iteration.
+A larger number requires more memory, but may give a slight computational advantage.}
+
+\item{min_n_batches}{Integer. The minimum number of batches to split the computation into within each iteration.
+Larger numbers give more frequent progress updates. If parallelization is applied, this should be set no smaller
+than the number of parallel workers.}
+
+\item{saving_path}{String.
+The path to the file where the results of the adaptive estimation procedure should be saved.
+Defaults to a file in a temporary directory.}
+}
+\description{
+Function to specify arguments of the adaptive estimation procedure
+}
+\details{
+The function sets default values for the adaptive estimation procedure.
+If the argument \code{adaptive} of \code{\link[=explain]{explain()}} is \code{FALSE}, it sets parameters
+corresponding to the use of a non-adaptive estimation procedure.
+}
+\author{
+Martin Jullum
+}
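
A minimal usage sketch (reusing the model, x_explain, x_train and p objects from
the examples above; the values are illustrative only) showing how the arguments
documented here are passed to explain() via the adaptive_arguments list:

explain(
  model = model,
  x_explain = x_explain,
  x_train = x_train,
  approach = "gaussian",
  prediction_zero = p,
  adaptive = TRUE,
  adaptive_arguments = list(
    initial_n_coalitions = 20,    # coalitions in the first iteration
    max_iter = 10,                # cap on the number of iterations
    convergence_tolerance = 0.05, # looser tolerance => earlier convergence
    compute_sd = TRUE             # bootstrap SDs for the Shapley estimates
  )
)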
diff --git a/man/observation_impute.Rd b/man/observation_impute.Rd
index 813869b28..690879315 100644
--- a/man/observation_impute.Rd
+++ b/man/observation_impute.Rd
@@ -10,24 +10,28 @@ observation_impute(
x_train,
x_explain,
empirical.eta = 0.7,
- n_samples = 1000
+ n_MC_samples = 1000
)
}
\arguments{
\item{W_kernel}{Numeric matrix. Contains all nonscaled weights between training and test
-observations for all feature combinations. The dimension equals \verb{n_train x m}.}
+observations for all coalitions. The dimension equals \verb{n_train x m}.}
-\item{S}{Integer matrix of dimension \verb{n_combinations x m}, where \code{n_combinations}
-and \code{m} equals the total number of sampled/non-sampled feature combinations and
+\item{S}{Integer matrix of dimension \verb{n_coalitions x m}, where \code{n_coalitions}
+and \code{m} equals the total number of sampled/non-sampled coalitions and
the total number of unique features, respectively. Note that \code{m = ncol(x_train)}.}
\item{x_train}{Numeric matrix}
\item{x_explain}{Numeric matrix}
-\item{n_samples}{Positive integer.
-Indicating the maximum number of samples to use in the
-Monte Carlo integration for every conditional expectation. See also details.}
+\item{n_MC_samples}{Positive integer.
+Indicating the maximum number of samples to use in the Monte Carlo integration for every conditional expectation.
+For \code{approach="ctree"}, \code{n_MC_samples} corresponds to the number of samples
+from the leaf node (see an exception related to the \code{ctree.sample} argument \code{\link[=setup_approach.ctree]{setup_approach.ctree()}}).
+For \code{approach="empirical"}, \code{n_MC_samples} is the \eqn{K} parameter in equations (14-15) of
+Aas et al. (2021), i.e. the maximum number of observations (with largest weights) that is used, see also the
+\code{empirical.eta} argument \code{\link[=setup_approach.empirical]{setup_approach.empirical()}}.}
}
\value{
data.table
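
A short sketch of the renamed n_MC_samples argument in practice (reusing the
airquality objects from the examples above; the value is illustrative only).
For approach = "ctree" it caps the number of samples drawn per leaf node:

explain(
  model = model,
  x_explain = x_explain,
  x_train = x_train,
  approach = "ctree",
  prediction_zero = p,
  n_MC_samples = 500 # max Monte Carlo samples per conditional expectation
)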
diff --git a/man/observation_impute_cpp.Rd b/man/observation_impute_cpp.Rd
index 077b419ab..ffd4838d3 100644
--- a/man/observation_impute_cpp.Rd
+++ b/man/observation_impute_cpp.Rd
@@ -17,7 +17,7 @@ i.e. \code{min(index_s) >= 1} and \code{max(index_s) <= nrow(S)}.}
\item{xtest}{Numeric matrix. Represents a single test observation.}
-\item{S}{Integer matrix of dimension \code{n_combinations x m}, where \code{n_combinations} equals
+\item{S}{Integer matrix of dimension \code{n_coalitions x m}, where \code{n_coalitions} equals
the total number of sampled/non-sampled feature combinations and \code{m} equals
the total number of unique features. Note that \code{m = ncol(xtrain)}. See details
for more information.}
diff --git a/man/plot.shapr.Rd b/man/plot.shapr.Rd
index f45485d4e..3c93a2697 100644
--- a/man/plot.shapr.Rd
+++ b/man/plot.shapr.Rd
@@ -129,7 +129,7 @@ x <- explain(
x_train = x_train,
approach = "empirical",
prediction_zero = p,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
if (requireNamespace("ggplot2", quietly = TRUE)) {
@@ -179,7 +179,7 @@ x <- explain(
x_train = x_train,
approach = "ctree",
prediction_zero = p,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
if (requireNamespace("ggplot2", quietly = TRUE)) {
diff --git a/man/plot_MSEv_eval_crit.Rd b/man/plot_MSEv_eval_crit.Rd
index 24c3fc2d0..cd7792070 100644
--- a/man/plot_MSEv_eval_crit.Rd
+++ b/man/plot_MSEv_eval_crit.Rd
@@ -7,7 +7,7 @@
plot_MSEv_eval_crit(
explanation_list,
index_x_explain = NULL,
- id_combination = NULL,
+ id_coalition = NULL,
CI_level = if (length(explanation_list[[1]]$pred_explain) < 20) NULL else 0.95,
geom_col_width = 0.9,
plot_type = "overall"
@@ -23,29 +23,29 @@ Which of the test observations to plot. E.g. if you have
explained 10 observations using \code{\link[=explain]{explain()}}, you can generate a plot for the first 5
observations by setting \code{index_x_explain = 1:5}.}
-\item{id_combination}{Integer vector. Which of the combinations (coalitions) to plot.
-E.g. if you used \code{n_combinations = 16} in \code{\link[=explain]{explain()}}, you can generate a plot for the
-first 5 combinations and the 10th by setting \code{id_combination = c(1:5, 10)}.}
+\item{id_coalition}{Integer vector. Which of the coalitions to plot.
+E.g. if you used \code{n_coalitions = 16} in \code{\link[=explain]{explain()}}, you can generate a plot for the
+first 5 coalitions and the 10th by setting \code{id_coalition = c(1:5, 10)}.}
\item{CI_level}{Positive numeric between zero and one. Default is \code{0.95} if the number of observations to explain is
larger than 20, otherwise \code{CI_level = NULL}, which removes the confidence intervals. The level of the approximate
-confidence intervals for the overall MSEv and the MSEv_combination. The confidence intervals are based on that
+confidence intervals for the overall MSEv and the MSEv_coalition. The confidence intervals are based on that
the MSEv scores are means over the observations/explicands, and that means are approximation normal. Since the
standard deviations are estimated, we use the quantile t from the T distribution with N_explicands - 1 degrees of
freedom corresponding to the provided level. Here, N_explicands is the number of observations/explicands.
-MSEv ± t\emph{SD(MSEv)/sqrt(N_explicands). Note that the \code{explain()} function already scales the standard deviation by
-sqrt(N_explicands), thus, the CI are MSEv ± t}MSEv_sd, where the values MSEv and MSEv_sd are extracted from the
+MSEv +/- t*SD(MSEv)/sqrt(N_explicands). Note that the \code{explain()} function already scales the standard deviation by
+sqrt(N_explicands); thus, the CIs are MSEv +/- t*MSEv_sd, where the values MSEv and MSEv_sd are extracted from the
MSEv data.tables in the objects in the \code{explanation_list}.}
\item{geom_col_width}{Numeric. Bar width. By default, set to 90\% of the \code{\link[ggplot2:resolution]{ggplot2::resolution()}} of the data.}
\item{plot_type}{Character vector. The possible options are "overall" (default), "comb", and "explicand".
If \code{plot_type = "overall"}, then the plot (one bar plot) associated with the overall MSEv evaluation criterion
-for each method is created, i.e., when averaging over both the combinations/coalitions and observations/explicands.
+for each method is created, i.e., when averaging over both the coalitions and observations/explicands.
If \code{plot_type = "comb"}, then the plots (one line plot and one bar plot) associated with the MSEv evaluation
-criterion for each combination/coalition are created, i.e., when we only average over the observations/explicands.
+criterion for each coalition are created, i.e., when we only average over the observations/explicands.
If \code{plot_type = "explicand"}, then the plots (one line plot and one bar plot) associated with the MSEv evaluation
-criterion for each observations/explicands are created, i.e., when we only average over the combinations/coalitions.
+criterion for each observations/explicands are created, i.e., when we only average over the coalitions.
If \code{plot_type} is a vector of one or several of "overall", "comb", and "explicand", then the associated plots are
created.}
}
@@ -57,8 +57,8 @@ of \code{\link[ggplot2:ggplot]{ggplot2::ggplot()}} objects based on the \code{pl
Make plots to visualize and compare the MSEv evaluation criterion for a list of
\code{\link[=explain]{explain()}} objects applied to the same data and model. The function creates
bar plots and line plots with points to illustrate the overall MSEv evaluation
-criterion, but also for each observation/explicand and combination by only averaging over
-the combinations and observations/explicands, respectively.
+criterion, but also for each observation/explicand and coalition by only averaging over
+the coalitions and observations/explicands, respectively.
}
\examples{
# Load necessary libraries
library(xgboost)
@@ -99,7 +99,7 @@ explanation_independence <- explain(
x_train = x_train,
approach = "independence",
prediction_zero = prediction_zero,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
# Gaussian 1e1 approach
@@ -109,7 +109,7 @@ explanation_gaussian_1e1 <- explain(
x_train = x_train,
approach = "gaussian",
prediction_zero = prediction_zero,
- n_samples = 1e1
+ n_MC_samples = 1e1
)
# Gaussian 1e2 approach
@@ -119,7 +119,7 @@ explanation_gaussian_1e2 <- explain(
x_train = x_train,
approach = "gaussian",
prediction_zero = prediction_zero,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
# ctree approach
@@ -129,7 +129,7 @@ explanation_ctree <- explain(
x_train = x_train,
approach = "ctree",
prediction_zero = prediction_zero,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
# Combined approach
@@ -139,7 +139,7 @@ explanation_combined <- explain(
x_train = x_train,
approach = c("gaussian", "independence", "ctree"),
prediction_zero = prediction_zero,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
# Create a list of explanations with names
@@ -152,24 +152,24 @@ explanation_list_named <- list(
)
if (requireNamespace("ggplot2", quietly = TRUE)) {
- # Create the default MSEv plot where we average over both the combinations and observations
+ # Create the default MSEv plot where we average over both the coalitions and observations
# with approximate 95\% confidence intervals
plot_MSEv_eval_crit(explanation_list_named, CI_level = 0.95, plot_type = "overall")
- # Can also create plots of the MSEv criterion averaged only over the combinations or observations.
+ # Can also create plots of the MSEv criterion averaged only over the coalitions or observations.
MSEv_figures <- plot_MSEv_eval_crit(explanation_list_named,
CI_level = 0.95,
plot_type = c("overall", "comb", "explicand")
)
MSEv_figures$MSEv_bar
- MSEv_figures$MSEv_combination_bar
+ MSEv_figures$MSEv_coalition_bar
MSEv_figures$MSEv_explicand_bar
- # When there are many combinations or observations, then it can be easier to look at line plots
- MSEv_figures$MSEv_combination_line_point
+ # When there are many coalitions or observations, then it can be easier to look at line plots
+ MSEv_figures$MSEv_coalition_line_point
MSEv_figures$MSEv_explicand_line_point
- # We can specify which observations or combinations to plot
+ # We can specify which observations or coalitions to plot
plot_MSEv_eval_crit(explanation_list_named,
plot_type = "explicand",
index_x_explain = c(1, 3:4, 6),
@@ -177,9 +177,9 @@ if (requireNamespace("ggplot2", quietly = TRUE)) {
)$MSEv_explicand_bar
plot_MSEv_eval_crit(explanation_list_named,
plot_type = "comb",
- id_combination = c(3, 4, 9, 13:15),
+ id_coalition = c(3, 4, 9, 13:15),
CI_level = 0.95
- )$MSEv_combination_bar
+ )$MSEv_coalition_bar
# We can alter the figures if other palette schemes or design is wanted
bar_text_n_decimals <- 1
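
# A hedged sketch (assuming the documented MSEv and MSEv_sd columns): manual
# computation of the approximate CI described above, for the first object
MSEv_dt <- explanation_list_named[[1]]$MSEv$MSEv
n_explicands <- length(explanation_list_named[[1]]$pred_explain)
t_quant <- qt(0.975, df = n_explicands - 1)
CI_lower <- MSEv_dt$MSEv - t_quant * MSEv_dt$MSEv_sd
CI_upper <- MSEv_dt$MSEv + t_quant * MSEv_dt$MSEv_sd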
diff --git a/man/plot_SV_several_approaches.Rd b/man/plot_SV_several_approaches.Rd
index 274b1a608..3ba1dc13a 100644
--- a/man/plot_SV_several_approaches.Rd
+++ b/man/plot_SV_several_approaches.Rd
@@ -111,7 +111,7 @@ explanation_independence <- explain(
x_train = x_train,
approach = "independence",
prediction_zero = prediction_zero,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
# Empirical approach
@@ -121,7 +121,7 @@ explanation_empirical <- explain(
x_train = x_train,
approach = "empirical",
prediction_zero = prediction_zero,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
# Gaussian 1e1 approach
@@ -131,7 +131,7 @@ explanation_gaussian_1e1 <- explain(
x_train = x_train,
approach = "gaussian",
prediction_zero = prediction_zero,
- n_samples = 1e1
+ n_MC_samples = 1e1
)
# Gaussian 1e2 approach
@@ -141,7 +141,7 @@ explanation_gaussian_1e2 <- explain(
x_train = x_train,
approach = "gaussian",
prediction_zero = prediction_zero,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
# Combined approach
@@ -151,7 +151,7 @@ explanation_combined <- explain(
x_train = x_train,
approach = c("gaussian", "ctree", "empirical"),
prediction_zero = prediction_zero,
- n_samples = 1e2
+ n_MC_samples = 1e2
)
# Create a list of explanations with names
diff --git a/man/prepare_data.Rd b/man/prepare_data.Rd
index d7d6d7f39..fd8dcac7a 100644
--- a/man/prepare_data.Rd
+++ b/man/prepare_data.Rd
@@ -41,10 +41,11 @@ prepare_data(internal, index_features = NULL, ...)
\method{prepare_data}{vaeac}(internal, index_features = NULL, ...)
}
\arguments{
-\item{internal}{Not used.}
+\item{internal}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
-\item{index_features}{Positive integer vector. Specifies the indices of combinations to
-apply to the present method. \code{NULL} means all combinations. Only used internally.}
+\item{index_features}{Positive integer vector. Specifies the coalitions (\code{id_coalition}) to
+apply to the present method. \code{NULL} means all coalitions. Only used internally.}
\item{...}{Currently not used.}
}
diff --git a/man/prepare_data_copula_cpp.Rd b/man/prepare_data_copula_cpp.Rd
index ca901031d..ce3aafeb3 100644
--- a/man/prepare_data_copula_cpp.Rd
+++ b/man/prepare_data_copula_cpp.Rd
@@ -15,7 +15,7 @@ prepare_data_copula_cpp(
)
}
\arguments{
-\item{MC_samples_mat}{arma::mat. Matrix of dimension (\code{n_samples}, \code{n_features}) containing samples from the
+\item{MC_samples_mat}{arma::mat. Matrix of dimension (\code{n_MC_samples}, \code{n_features}) containing samples from the
univariate standard normal.}
\item{x_explain_mat}{arma::mat. Matrix of dimension (\code{n_explain}, \code{n_features}) containing the observations
@@ -27,7 +27,7 @@ transformed to a standardized normal distribution.}
\item{x_train_mat}{arma::mat. Matrix of dimension (\code{n_train}, \code{n_features}) containing the training observations.}
-\item{S}{arma::mat. Matrix of dimension (\code{n_combinations}, \code{n_features}) containing binary representations of
+\item{S}{arma::mat. Matrix of dimension (\code{n_coalitions}, \code{n_features}) containing binary representations of
the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
This is not a problem internally in shapr as the empty and grand coalitions treated differently.}
@@ -39,8 +39,8 @@ between all pairs of features after being transformed using the Gaussian transfo
transformed to a standardized normal distribution.}
}
\value{
-An arma::cube/3D array of dimension (\code{n_samples}, \code{n_explain} * \code{n_coalitions}, \code{n_features}), where
-the columns (\emph{,j,}) are matrices of dimension (\code{n_samples}, \code{n_features}) containing the conditional Gaussian
+An arma::cube/3D array of dimension (\code{n_MC_samples}, \code{n_explain} * \code{n_coalitions}, \code{n_features}), where
+the columns (\emph{,j,}) are matrices of dimension (\code{n_MC_samples}, \code{n_features}) containing the conditional Gaussian
copula MC samples for each explicand and coalition on the original scale.
}
\description{
diff --git a/man/prepare_data_gaussian_cpp.Rd b/man/prepare_data_gaussian_cpp.Rd
index b24b431e6..095769cf0 100644
--- a/man/prepare_data_gaussian_cpp.Rd
+++ b/man/prepare_data_gaussian_cpp.Rd
@@ -7,13 +7,13 @@
prepare_data_gaussian_cpp(MC_samples_mat, x_explain_mat, S, mu, cov_mat)
}
\arguments{
-\item{MC_samples_mat}{arma::mat. Matrix of dimension (\code{n_samples}, \code{n_features}) containing samples from the
+\item{MC_samples_mat}{arma::mat. Matrix of dimension (\code{n_MC_samples}, \code{n_features}) containing samples from the
univariate standard normal.}
\item{x_explain_mat}{arma::mat. Matrix of dimension (\code{n_explain}, \code{n_features}) containing the observations
to explain.}
-\item{S}{arma::mat. Matrix of dimension (\code{n_combinations}, \code{n_features}) containing binary representations of
+\item{S}{arma::mat. Matrix of dimension (\code{n_coalitions}, \code{n_features}) containing binary representations of
the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
This is not a problem internally in shapr as the empty and grand coalitions treated differently.}
@@ -23,8 +23,8 @@ This is not a problem internally in shapr as the empty and grand coalitions trea
between all pairs of features.}
}
\value{
-An arma::cube/3D array of dimension (\code{n_samples}, \code{n_explain} * \code{n_coalitions}, \code{n_features}), where
-the columns (\emph{,j,}) are matrices of dimension (\code{n_samples}, \code{n_features}) containing the conditional Gaussian
+An arma::cube/3D array of dimension (\code{n_MC_samples}, \code{n_explain} * \code{n_coalitions}, \code{n_features}), where
+the columns (\emph{,j,}) are matrices of dimension (\code{n_MC_samples}, \code{n_features}) containing the conditional Gaussian
MC samples for each explicand and coalition.
}
\description{
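
For concreteness, a worked dimension check of the returned cube (the values are
illustrative only):

# n_MC_samples = 100, n_explain = 3, n_coalitions = 14, n_features = 4
# => cube of dimension (100, 3 * 14, 4), i.e. 100 x 42 x 4, where each slice
#    (, j, ) is the 100 x 4 matrix of conditional MC samples for pair j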
diff --git a/man/prepare_next_iteration.Rd b/man/prepare_next_iteration.Rd
new file mode 100644
index 000000000..9a23bd2d3
--- /dev/null
+++ b/man/prepare_next_iteration.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/prepare_next_iteration.R
+\name{prepare_next_iteration}
+\alias{prepare_next_iteration}
+\title{Prepares the next iteration of the adaptive sampling algorithm}
+\usage{
+prepare_next_iteration(internal)
+}
+\arguments{
+\item{internal}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
+}
+\description{
+Prepares the next iteration of the adaptive sampling algorithm
+}
+\keyword{internal}
diff --git a/man/print_iter.Rd b/man/print_iter.Rd
new file mode 100644
index 000000000..abab85a3b
--- /dev/null
+++ b/man/print_iter.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/print_iter.R
+\name{print_iter}
+\alias{print_iter}
+\title{Prints iterative information}
+\usage{
+print_iter(internal)
+}
+\arguments{
+\item{internal}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
+}
+\description{
+Prints iterative information
+}
+\keyword{internal}
diff --git a/man/regression.check_parameters.Rd b/man/regression.check_parameters.Rd
index fbe747374..73930ae26 100644
--- a/man/regression.check_parameters.Rd
+++ b/man/regression.check_parameters.Rd
@@ -9,7 +9,8 @@ regression.check_parameters(internal)
\arguments{
\item{internal}{List.
Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
-The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
+The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{iter_list}, \code{timing_list},
+\code{main_timing_list}, \code{output}, \code{iter_timing_list} and \code{iter_results}.}
}
\value{
The same \code{internal} list, but added logical indicator \code{internal$parameters$regression.tune}
diff --git a/man/regression.check_sur_n_comb.Rd b/man/regression.check_sur_n_comb.Rd
index 1ede6d346..3160bdae9 100644
--- a/man/regression.check_sur_n_comb.Rd
+++ b/man/regression.check_sur_n_comb.Rd
@@ -4,18 +4,22 @@
\alias{regression.check_sur_n_comb}
\title{Check the \code{regression.surrogate_n_comb} parameter}
\usage{
-regression.check_sur_n_comb(regression.surrogate_n_comb, used_n_combinations)
+regression.check_sur_n_comb(regression.surrogate_n_comb, n_coalitions)
}
\arguments{
-\item{regression.surrogate_n_comb}{Integer (default is \code{internal$parameters$used_n_combinations}) specifying the
-number of unique combinations/coalitions to apply to each training observation. Maximum allowed value is
-"\code{internal$parameters$used_n_combinations} - 2". By default, we use all coalitions, but this can take a lot of memory
-in larger dimensions. Note that by "all", we mean all coalitions chosen by \code{shapr} to be used. This will be all
-\eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if \code{shapr} is in the exact mode. If the
-user sets a lower value than \code{internal$parameters$used_n_combinations}, then we sample this amount of unique
-coalitions separately for each training observations. That is, on average, all coalitions should be equally trained.}
+\item{regression.surrogate_n_comb}{Integer.
+(default is \code{internal$iter_list[[length(internal$iter_list)]]$n_coalitions}) specifying the
+number of unique coalitions to apply to each training observation. Maximum allowed value is
+"\code{internal$iter_list[[length(internal$iter_list)]]$n_coalitions} - 2".
+By default, we use all coalitions, but this can take a lot of memory in larger dimensions.
+Note that by "all", we mean all coalitions chosen by \code{shapr} to be used.
+This will be all \eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if \code{shapr} is in
+the exact mode.
+If the user sets a lower value than \code{internal$iter_list[[length(internal$iter_list)]]$n_coalitions},
+then we sample this number of unique coalitions separately for each training observation.
+That is, on average, all coalitions should be equally trained.}
-\item{used_n_combinations}{Integer. The number of used combinations (including the empty and grand coalitions).}
+\item{n_coalitions}{Integer. The number of used coalitions (including the empty and grand coalition).}
}
\description{
Check that \code{regression.surrogate_n_comb} is either NULL or a valid integer.
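
A hedged usage sketch of the parameter this check validates;
regression.surrogate_n_comb is supplied through explain() together with the
surrogate approach (objects and value are illustrative only):

explain(
  model = model,
  x_explain = x_explain,
  x_train = x_train,
  approach = "regression_surrogate",
  prediction_zero = p,
  regression.model = parsnip::linear_reg(),
  regression.surrogate_n_comb = 8 # unique coalitions per training observation
)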
diff --git a/man/regression.cv_message.Rd b/man/regression.cv_message.Rd
index 145e514a0..2826b11bb 100644
--- a/man/regression.cv_message.Rd
+++ b/man/regression.cv_message.Rd
@@ -4,7 +4,12 @@
\alias{regression.cv_message}
\title{Produce message about which batch prepare_data is working on}
\usage{
-regression.cv_message(regression.results, regression.grid, n_cv = 10)
+regression.cv_message(
+ regression.results,
+ regression.grid,
+ n_cv = 10,
+ current_comb
+)
}
\arguments{
\item{regression.results}{The results of the CV procedures.}
diff --git a/man/regression.get_tune.Rd b/man/regression.get_tune.Rd
index 7c5440741..148c36a93 100644
--- a/man/regression.get_tune.Rd
+++ b/man/regression.get_tune.Rd
@@ -18,8 +18,8 @@ is also a valid input. It is essential to include the package prefix if the pack
The data.frame must contain the possible hyperparameter value combinations to try.
The column names must match the names of the tuneable parameters specified in \code{regression.model}.
If \code{regression.tune_values} is a function, then it should take one argument \code{x} which is the training data
-for the current combination/coalition and returns a data.frame/data.table/tibble with the properties described above.
-Using a function allows the hyperparameter values to change based on the size of the combination. See the regression
+for the current coalition and returns a data.frame/data.table/tibble with the properties described above.
+Using a function allows the hyperparameter values to change based on the size of the coalition. See the regression
vignette for several examples.
Note, to make it easier to call \code{explain()} from Python, the \code{regression.tune_values} can also be a string
containing an R function. For example,
diff --git a/man/regression.get_y_hat.Rd b/man/regression.get_y_hat.Rd
index 6b03d3d49..ff02d4cb4 100644
--- a/man/regression.get_y_hat.Rd
+++ b/man/regression.get_y_hat.Rd
@@ -9,7 +9,8 @@ regression.get_y_hat(internal, model, predict_model)
\arguments{
\item{internal}{List.
Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
-The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
+The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{iter_list}, \code{timing_list},
+\code{main_timing_list}, \code{output}, \code{iter_timing_list} and \code{iter_results}.}
\item{model}{Objects.
The model object that ought to be explained.
diff --git a/man/regression.prep_message_batch.Rd b/man/regression.prep_message_batch.Rd
deleted file mode 100644
index 9b8a942e2..000000000
--- a/man/regression.prep_message_batch.Rd
+++ /dev/null
@@ -1,23 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/approach_regression_separate.R
-\name{regression.prep_message_batch}
-\alias{regression.prep_message_batch}
-\title{Produce message about which batch prepare_data is working on}
-\usage{
-regression.prep_message_batch(internal, index_features)
-}
-\arguments{
-\item{internal}{List.
-Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
-The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
-
-\item{index_features}{Positive integer vector. Specifies the indices of combinations to
-apply to the present method. \code{NULL} means all combinations. Only used internally.}
-}
-\description{
-Produce message about which batch prepare_data is working on
-}
-\author{
-Lars Henry Berge Olsen
-}
-\keyword{internal}
diff --git a/man/regression.prep_message_comb.Rd b/man/regression.prep_message_comb.Rd
deleted file mode 100644
index 84739b82a..000000000
--- a/man/regression.prep_message_comb.Rd
+++ /dev/null
@@ -1,25 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/approach_regression_separate.R
-\name{regression.prep_message_comb}
-\alias{regression.prep_message_comb}
-\title{Produce message about which combination prepare_data is working on}
-\usage{
-regression.prep_message_comb(internal, index_features, comb_idx)
-}
-\arguments{
-\item{internal}{List.
-Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
-The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
-
-\item{index_features}{Positive integer vector. Specifies the indices of combinations to
-apply to the present method. \code{NULL} means all combinations. Only used internally.}
-
-\item{comb_idx}{Integer. The index of the combination in a specific batch.}
-}
-\description{
-Produce message about which combination prepare_data is working on
-}
-\author{
-Lars Henry Berge Olsen
-}
-\keyword{internal}
diff --git a/man/regression.separate_time_mess.Rd b/man/regression.separate_time_mess.Rd
deleted file mode 100644
index cf0438000..000000000
--- a/man/regression.separate_time_mess.Rd
+++ /dev/null
@@ -1,15 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/approach_regression_separate.R
-\name{regression.separate_time_mess}
-\alias{regression.separate_time_mess}
-\title{Produce time message for separate regression}
-\usage{
-regression.separate_time_mess()
-}
-\description{
-Produce time message for separate regression
-}
-\author{
-Lars Henry Berge Olsen
-}
-\keyword{internal}
diff --git a/man/regression.surrogate_aug_data.Rd b/man/regression.surrogate_aug_data.Rd
index 8ebd0ccbd..14c97aabe 100644
--- a/man/regression.surrogate_aug_data.Rd
+++ b/man/regression.surrogate_aug_data.Rd
@@ -11,7 +11,7 @@ regression.surrogate_aug_data(
index_features = NULL,
augment_masks_as_factor = FALSE,
augment_include_grand = FALSE,
- augment_add_id_comb = FALSE,
+ augment_add_id_coal = FALSE,
augment_comb_prob = NULL,
augment_weights = NULL
)
@@ -19,7 +19,8 @@ regression.surrogate_aug_data(
\arguments{
\item{internal}{List.
Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
-The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
+The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{iter_list}, \code{timing_list},
+\code{main_timing_list}, \code{output}, \code{iter_timing_list} and \code{iter_results}.}
\item{x}{Data.table containing the data. Either the training data or the explicands. If \code{x} is the explicands,
then \code{index_features} must be provided.}
@@ -34,20 +35,20 @@ to factors. If \code{FALSE}, then the binary masks are numerics.}
\item{augment_include_grand}{Logical (default is \code{FALSE}). If \code{TRUE}, then the grand coalition is included.
If \code{index_features} are provided, then \code{augment_include_grand} has no effect. Note that if we sample the
-combinations then the grand coalition is equally likely to be samples as the other coalitions (or weighted if
+coalitions, then the grand coalition is equally likely to be sampled as the other coalitions (or weighted if
\code{augment_comb_prob} is provided).}
-\item{augment_add_id_comb}{Logical (default is \code{FALSE}). If \code{TRUE}, an additional column is adding containing
+\item{augment_add_id_coal}{Logical (default is \code{FALSE}). If \code{TRUE}, an additional column is added containing
which coalition was applied.}
\item{augment_comb_prob}{Array of numerics (default is \code{NULL}). The length of the array must match the number of
-combinations being considered, where each entry specifies the probability of sampling the corresponding coalition.
+coalitions being considered, where each entry specifies the probability of sampling the corresponding coalition.
This is useful if we want to generate more training data for some specific coalitions. One possible choice would be
-\code{augment_comb_prob = if (use_Shapley_weights) internal$objects$X$shapley_weight[2:actual_n_combinations] else NULL}.}
+\code{augment_comb_prob = if (use_Shapley_weights) internal$objects$X$shapley_weight[2:actual_n_coalitions] else NULL}.}
\item{augment_weights}{String (optional). Specifying which type of weights to add to the observations.
If \code{NULL} (default), then no weights are added. If \code{"Shapley"}, then the Shapley weights for the different
-combinations are added to corresponding observations where the coalitions was applied. If \code{uniform}, then
+coalitions are added to the corresponding observations where the coalition was applied. If \code{"uniform"}, then
all observations get an equal weight of one.}
}
\value{
diff --git a/man/regression.train_model.Rd b/man/regression.train_model.Rd
index 8ee6b669a..60f227175 100644
--- a/man/regression.train_model.Rd
+++ b/man/regression.train_model.Rd
@@ -7,14 +7,15 @@
regression.train_model(
x,
seed = 1,
- verbose = 0,
+ verbose = NULL,
regression.model = parsnip::linear_reg(),
regression.tune = FALSE,
regression.tune_values = NULL,
regression.vfold_cv_para = NULL,
regression.recipe_func = NULL,
regression.response_var = "y_hat",
- regression.surrogate_n_comb = NULL
+ regression.surrogate_n_comb = NULL,
+ current_comb = NULL
)
}
\arguments{
@@ -23,12 +24,24 @@ then \code{index_features} must be provided.}
\item{seed}{Positive integer.
Specifies the seed before any randomness based code is being run.
-If \code{NULL} the seed will be inherited from the calling environment.}
+If \code{NULL}, no seed is set in the calling environment.}
-\item{verbose}{An integer specifying the level of verbosity. If \code{0}, \code{shapr} will stay silent.
-If \code{1}, it will print information about performance. If \code{2}, some additional information will be printed out.
-Use \code{0} (default) for no verbosity, \code{1} for low verbose, and \code{2} for high verbose.
-TODO: Make this clearer when we end up fixing this and if they should force a progressr bar.}
+\item{verbose}{String vector or NULL.
+Specifies the verbosity (printout detail level) through one or more of the strings \code{"basic"}, \code{"progress"},
+\code{"convergence"}, \code{"shapley"} and \code{"vS_details"}.
+\code{"basic"} (default) displays basic information about the computation which is being performed.
+\code{"progress"} displays information about where in the calculation process the function currently is.
+\code{"convergence"} displays information on how close to convergence the Shapley value estimates are
+(only when \code{adaptive = TRUE}).
+\code{"shapley"} displays intermediate Shapley value estimates and standard deviations (only when \code{adaptive = TRUE})
+and the final estimates.
+\code{"vS_details"} displays information about the v_S estimates.
+This is most relevant for \verb{approach \%in\% c("regression_separate", "regression_surrogate", "vaeac")}.
+\code{NULL} means no printout.
+Note that any combination of the strings can be used.
+E.g. \code{verbose = c("basic", "vS_details")} will display basic information + details about the v_S estimation
+process.}
\item{regression.model}{A \code{tidymodels} object of class \code{model_specs}. Default is a linear regression model, i.e.,
\code{\link[parsnip:linear_reg]{parsnip::linear_reg()}}. See \href{https://www.tidymodels.org/find/parsnip/}{tidymodels} for all possible models,
@@ -45,8 +58,8 @@ the values provided in \code{regression.tune_values}. Note that no checks are co
The data.frame must contain the possible hyperparameter value combinations to try.
The column names must match the names of the tuneable parameters specified in \code{regression.model}.
If \code{regression.tune_values} is a function, then it should take one argument \code{x} which is the training data
-for the current combination/coalition and returns a data.frame/data.table/tibble with the properties described above.
-Using a function allows the hyperparameter values to change based on the size of the combination. See the regression
+for the current coalition and returns a data.frame/data.table/tibble with the properties described above.
+Using a function allows the hyperparameter values to change based on the size of the coalition. See the regression
vignette for several examples.
Note, to make it easier to call \code{explain()} from Python, the \code{regression.tune_values} can also be a string
containing an R function. For example,
diff --git a/man/sample_ctree.Rd b/man/sample_ctree.Rd
index f95f74383..4bd10b52d 100644
--- a/man/sample_ctree.Rd
+++ b/man/sample_ctree.Rd
@@ -4,13 +4,13 @@
\alias{sample_ctree}
\title{Sample ctree variables from a given conditional inference tree}
\usage{
-sample_ctree(tree, n_samples, x_explain, x_train, n_features, sample)
+sample_ctree(tree, n_MC_samples, x_explain, x_train, n_features, sample)
}
\arguments{
\item{tree}{List. Contains tree which is an object of type ctree built from the party package.
Also contains given_ind, the features to condition upon.}
-\item{n_samples}{Numeric. Indicates how many samples to use for MCMC.}
+\item{n_MC_samples}{Numeric. Indicates how many samples to use for MCMC.}
\item{x_explain}{Matrix, data.frame or data.table with the features of the observation whose
predictions ought to be explained (test data). Dimension \verb{1\\timesp} or \verb{p\\times1}.}
@@ -21,10 +21,10 @@ predictions ought to be explained (test data). Dimension \verb{1\\timesp} or \ve
\item{sample}{Boolean. True indicates that the method samples from the terminal node
of the tree whereas False indicates that the method takes all the observations if it is
-less than n_samples.}
+less than n_MC_samples.}
}
\value{
-data.table with \code{n_samples} (conditional) Gaussian samples
+data.table with \code{n_MC_samples} (conditional) Gaussian samples
}
\description{
Sample ctree variables from a given conditional inference tree
diff --git a/man/save_results.Rd b/man/save_results.Rd
new file mode 100644
index 000000000..fa1536172
--- /dev/null
+++ b/man/save_results.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/save_results.R
+\name{save_results}
+\alias{save_results}
+\title{Saves the intermediate results to disk}
+\usage{
+save_results(internal)
+}
+\arguments{
+\item{internal}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
+}
+\description{
+Saves the intermediate results to disk
+}
+\keyword{internal}
diff --git a/man/setup.Rd b/man/setup.Rd
index fce91a6b0..602a92dcb 100644
--- a/man/setup.Rd
+++ b/man/setup.Rd
@@ -8,12 +8,12 @@ setup(
x_train,
x_explain,
approach,
+ paired_shap_sampling = FALSE,
prediction_zero,
output_size = 1,
- n_combinations,
+ max_n_coalitions,
group,
- n_samples,
- n_batches,
+ n_MC_samples,
seed,
keep_samp_for_vS,
feature_specs,
@@ -27,9 +27,14 @@ setup(
explain_y_lags = NULL,
explain_xreg_lags = NULL,
group_lags = NULL,
- timing,
verbose,
+ adaptive = NULL,
+ adaptive_arguments = list(),
+ shapley_reweighting = "none",
is_python = FALSE,
+ testing = FALSE,
+ init_time = NULL,
+ prev_shapr_object = NULL,
...
)
}
@@ -46,6 +51,11 @@ All elements should, either be \code{"gaussian"}, \code{"copula"}, \code{"empiri
\code{"categorical"}, \code{"timeseries"}, \code{"independence"}, \code{"regression_separate"}, or \code{"regression_surrogate"}.
The two regression approaches can not be combined with any other approach. See details for more information.}
+\item{paired_shap_sampling}{Logical.
+If \code{TRUE}, paired versions of all sampled coalitions are also included in the computation.
+That is, if there are 5 features and e.g. the coalition (1,3,5) is sampled, then its complement (2,4) is also used
+for computing the Shapley values. This is done to reduce the variance of the Shapley value estimates.}
+
\item{prediction_zero}{Numeric.
The prediction value for unseen data, i.e. an estimate of the expected prediction without conditioning on any
features.
@@ -54,11 +64,13 @@ such as the mean of the predictions in the training data are also reasonable.}
\item{output_size}{TODO: Document}
-\item{n_combinations}{Integer.
-If \code{group = NULL}, \code{n_combinations} represents the number of unique feature combinations to sample.
-If \code{group != NULL}, \code{n_combinations} represents the number of unique group combinations to sample.
-If \code{n_combinations = NULL}, the exact method is used and all combinations are considered.
-The maximum number of combinations equals \code{2^m}, where \code{m} is the number of features.}
+\item{max_n_coalitions}{Integer.
+The upper limit on the number of unique feature/group coalitions to use in the adaptive procedure
+(if \code{adaptive = TRUE}).
+If \code{adaptive = FALSE} it represents the number of feature/group coalitions to use directly.
+The quantity refers to the number of unique feature coalitions if \code{group = NULL},
+and group coalitions if \code{group != NULL}.
+\code{max_n_coalitions = NULL} corresponds to \code{max_n_coalitions = 2^n_features}.
\item{group}{List.
If \code{NULL} regular feature wise Shapley values are computed.
@@ -66,25 +78,21 @@ If provided, group wise Shapley values are computed. \code{group} then has lengt
the number of groups. The list element contains character vectors with the features included
in each of the different groups.}
-\item{n_samples}{Positive integer.
-Indicating the maximum number of samples to use in the
-Monte Carlo integration for every conditional expectation. See also details.}
-
-\item{n_batches}{Positive integer (or NULL).
-Specifies how many batches the total number of feature combinations should be split into when calculating the
-contribution function for each test observation.
-The default value is NULL which uses a reasonable trade-off between RAM allocation and computation speed,
-which depends on \code{approach} and \code{n_combinations}.
-For models with many features, increasing the number of batches reduces the RAM allocation significantly.
-This typically comes with a small increase in computation time.}
+\item{n_MC_samples}{Positive integer.
+Indicating the maximum number of samples to use in the Monte Carlo integration for every conditional expectation.
+For \code{approach="ctree"}, \code{n_MC_samples} corresponds to the number of samples
+from the leaf node (see an exception related to the \code{ctree.sample} argument \code{\link[=setup_approach.ctree]{setup_approach.ctree()}}).
+For \code{approach="empirical"}, \code{n_MC_samples} is the \eqn{K} parameter in equations (14-15) of
+Aas et al. (2021), i.e. the maximum number of observations (with largest weights) that is used, see also the
+\code{empirical.eta} argument \code{\link[=setup_approach.empirical]{setup_approach.empirical()}}.}
\item{seed}{Positive integer.
Specifies the seed before any randomness based code is being run.
-If \code{NULL} the seed will be inherited from the calling environment.}
+If \code{NULL} no seed is set in the calling environment.}
\item{keep_samp_for_vS}{Logical.
-Indicates whether the samples used in the Monte Carlo estimation of v_S should be returned
-(in \code{internal$output})}
+Indicates whether the samples used in the Monte Carlo estimation of v_S should be returned (in \code{internal$output}).
+Not used for \code{approach="regression_separate"} or \code{approach="regression_surrogate"}.}
\item{feature_specs}{List. The output from \code{\link[=get_model_specs]{get_model_specs()}} or \code{\link[=get_data_specs]{get_data_specs()}}.
Contains the 3 elements:
@@ -94,10 +102,11 @@ Contains the 3 elements:
\item{factor_levels}{Character vector with the levels for any categorical features.}
}}
-\item{MSEv_uniform_comb_weights}{Logical. If \code{TRUE} (default), then the function weights the combinations
-uniformly when computing the MSEv criterion. If \code{FALSE}, then the function use the Shapley kernel weights to
-weight the combinations when computing the MSEv criterion. Note that the Shapley kernel weights are replaced by the
-sampling frequency when not all combinations are considered.}
+\item{MSEv_uniform_comb_weights}{Logical.
+If \code{TRUE} (default), then the function weights the coalitions uniformly when computing the MSEv criterion.
+If \code{FALSE}, then the function uses the Shapley kernel weights to weight the coalitions when computing the MSEv
+criterion.
+Note that the Shapley kernel weights are replaced by the sampling frequency when not all coalitions are considered.}
\item{type}{Character.
Either "normal" or "forecast" corresponding to function \code{setup()} is called from,
@@ -136,18 +145,70 @@ If \code{xreg != NULL}, denotes the number of lags that should be used for each
If \code{TRUE} all lags of each variable are grouped together and explained as a group.
If \code{FALSE} all lags of each variable are explained individually.}
-\item{timing}{Logical.
-Whether the timing of the different parts of the \code{explain()} should saved in the model object.}
+\item{verbose}{String vector or NULL.
+Specifies the verbosity (printout detail level) through one or more of the strings \code{"basic"}, \code{"progress"},
+\code{"convergence"}, \code{"shapley"} and \code{"vS_details"}.
+\code{"basic"} (default) displays basic information about the computation which is being performed.
+\code{"progress"} displays information about where in the calculation process the function currently is.
+\code{"convergence"} displays information on how close to convergence the Shapley value estimates are
+(only when \code{adaptive = TRUE}).
+\code{"shapley"} displays intermediate Shapley value estimates and standard deviations (only when
+\code{adaptive = TRUE}), as well as the final estimates.
+\code{"vS_details"} displays information about the v_S estimates.
+This is most relevant for \verb{approach \%in\% c("regression_separate", "regression_surrogate", "vaeac")}.
+\code{NULL} means no printout.
+Note that any combination of these strings can be used.
+E.g. \code{verbose = c("basic", "vS_details")} will display basic information as well as details about the
+vS estimation process.}
-\item{verbose}{An integer specifying the level of verbosity. If \code{0}, \code{shapr} will stay silent.
-If \code{1}, it will print information about performance. If \code{2}, some additional information will be printed out.
-Use \code{0} (default) for no verbosity, \code{1} for low verbose, and \code{2} for high verbose.
-TODO: Make this clearer when we end up fixing this and if they should force a progressr bar.}
+\item{adaptive}{Logical or NULL.
+If \code{NULL} (default), the argument is set to \code{TRUE} if there are more than 5 features/groups, and \code{FALSE} otherwise.
+If eventually \code{TRUE}, the Shapley values are estimated adaptively in an iterative manner.
+This provides sufficiently accurate Shapley value estimates faster.
+First an initial number of coalitions is sampled, then bootstrapping is used to estimate the variance of the Shapley
+values.
+A convergence criterion is used to determine if the variances of the Shapley values are sufficiently small.
+If the variances are too high, we estimate the number of required coalitions to reach convergence, and thereby add
+more coalitions.
+The process is repeated until the variances are below the threshold.
+Specifics related to the adaptive process and convergence criterion are set through \code{adaptive_arguments}.}
+
+\item{adaptive_arguments}{Named list.
+Specifies the arguments for the adaptive procedure.
+See \code{\link[=get_adaptive_arguments_default]{get_adaptive_arguments_default()}} for description of the arguments and their default values.}
+
+\item{shapley_reweighting}{String.
+How to reweight the sampling frequency weights in the kernelSHAP solution after sampling, with the aim of reducing
+the randomness and thereby the variance of the Shapley value estimates.
+One of \code{'none'}, \code{'on_N'}, \code{'on_coal_size'}, \code{'on_all'}, \code{'on_all_cond'} (default).
+\code{'none'} means no reweighting, i.e. the sampling frequency weights are used as is.
+\code{'on_coal_size'} means the sampling frequencies are averaged over all coalitions of the same size.
+\code{'on_N'} means the sampling frequencies are averaged over all coalitions with the same original sampling
+probabilities.
+\code{'on_all'} means the original sampling probabilities are used for all coalitions.
+\code{'on_all_cond'} means the original sampling probabilities are used for all coalitions, while adjusting for the
+probability that they are sampled at least once.
+This method is preferred as it has performed the best in simulation studies.}
\item{is_python}{Logical. Indicates whether the function is called from the Python wrapper. Default is FALSE which is
never changed when calling the function via \code{explain()} in R. The parameter is later used to disallow
running the AICc-versions of the empirical as that requires data based optimization.}
+\item{testing}{Logical.
+Only use to remove random components like timing from the object output when comparing output with testthat.
+Defaults to \code{FALSE}.}
+
+\item{init_time}{POSIXct object.
+The time when the \code{explain()} function was called, as returned by \code{Sys.time()}.
+Used to calculate the time it took to run the full \code{explain} call.}
+
+\item{prev_shapr_object}{\code{shapr} object or string.
+If an object of class \code{shapr} is provided, or a string with a path to where intermediate results are stored,
+then the function will use the previous object to continue the computation.
+This is useful if the computation is interrupted or you want higher accuracy than already obtained, and therefore
+want to continue the adaptive estimation. See the vignette for examples.}
+
\item{...}{Further arguments passed to specific approaches}
}
\description{
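A minimal R sketch tying the new setup() arguments above together, assuming explain() forwards them as documented in this diff; `model`, `x_explain`, `x_train` and `p0` are hypothetical placeholder objects:

exp_adaptive <- explain(
  model = model,
  x_explain = x_explain,
  x_train = x_train,
  approach = "gaussian",
  prediction_zero = p0,
  adaptive = TRUE,                  # iterate until the convergence criterion is met
  max_n_coalitions = 500,           # upper limit on the sampled coalitions
  paired_shap_sampling = TRUE,      # also use the complement of each sampled coalition
  shapley_reweighting = "on_all_cond",
  verbose = c("basic", "convergence")
)

# Continue an interrupted or not-yet-accurate-enough run from the previous object:
exp_refined <- explain(
  model = model,
  x_explain = x_explain,
  x_train = x_train,
  approach = "gaussian",
  prediction_zero = p0,
  prev_shapr_object = exp_adaptive
)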
diff --git a/man/setup_approach.Rd b/man/setup_approach.Rd
index cf1ee8d0d..8ed371ba5 100644
--- a/man/setup_approach.Rd
+++ b/man/setup_approach.Rd
@@ -71,7 +71,8 @@ setup_approach(internal, ...)
regression.tune_values = NULL,
regression.vfold_cv_para = NULL,
regression.recipe_func = NULL,
- regression.surrogate_n_comb = internal$parameters$used_n_combinations - 2,
+ regression.surrogate_n_comb =
+ internal$iter_list[[length(internal$iter_list)]]$n_coalitions - 2,
...
)
@@ -96,7 +97,8 @@ setup_approach(internal, ...)
)
}
\arguments{
-\item{internal}{Not used.}
+\item{internal}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
\item{...}{\code{approach}-specific arguments. See below.}
@@ -123,13 +125,13 @@ Determines minimum value that the sum of the left and right daughter nodes requi
Determines the minimum sum of weights in a terminal node required for a split}
\item{ctree.sample}{Boolean. (default = TRUE)
-If TRUE, then the method always samples \code{n_samples} observations from the leaf nodes (with replacement).
-If FALSE and the number of observations in the leaf node is less than \code{n_samples},
+If TRUE, then the method always samples \code{n_MC_samples} observations from the leaf nodes (with replacement).
+If FALSE and the number of observations in the leaf node is less than \code{n_MC_samples},
the method will take all observations in the leaf.
-If FALSE and the number of observations in the leaf node is more than \code{n_samples},
-the method will sample \code{n_samples} observations (with replacement).
+If FALSE and the number of observations in the leaf node is more than \code{n_MC_samples},
+the method will sample \code{n_MC_samples} observations (with replacement).
This means that there will always be sampling in the leaf unless
-\code{sample} = FALSE AND the number of obs in the node is less than \code{n_samples}.}
+\code{sample} = FALSE AND the number of obs in the node is less than \code{n_MC_samples}.}
\item{empirical.type}{Character. (default = \code{"fixed_sigma"})
Should be equal to either \code{"independence"}, \code{"fixed_sigma"}, \code{"AICc_each_k"} or \code{"AICc_full"}.
@@ -143,7 +145,7 @@ accounts for 80\\% of the total weight.
\code{eta} is the \eqn{\eta} parameter in equation (15) of Aas et al (2021).}
\item{empirical.fixed_sigma}{Positive numeric scalar. (default = 0.1)
-Represents the kernel bandwidth in the distance computation used when conditioning on all different combinations.
+Represents the kernel bandwidth in the distance computation used when conditioning on all different coalitions.
Only used when \code{empirical.type = "fixed_sigma"}}
\item{empirical.n_samples_aicc}{Positive integer. (default = 1000)
@@ -189,8 +191,8 @@ is also a valid input. It is essential to include the package prefix if the pack
The data.frame must contain the possible hyperparameter value combinations to try.
The column names must match the names of the tuneable parameters specified in \code{regression.model}.
If \code{regression.tune_values} is a function, then it should take one argument \code{x} which is the training data
-for the current combination/coalition and returns a data.frame/data.table/tibble with the properties described above.
-Using a function allows the hyperparameter values to change based on the size of the combination. See the regression
+for the current coalition and returns a data.frame/data.table/tibble with the properties described above.
+Using a function allows the hyperparameter values to change based on the size of the coalition. See the regression
vignette for several examples.
Note, to make it easier to call \code{explain()} from Python, the \code{regression.tune_values} can also be a string
containing an R function. For example,
@@ -209,13 +211,17 @@ containing an R function. For example,
\code{"function(recipe) return(recipes::step_ns(recipe, recipes::all_numeric_predictors(), deg_free = 2))"} is also
a valid input. It is essential to include the package prefix if the package is not loaded.}
-\item{regression.surrogate_n_comb}{Integer (default is \code{internal$parameters$used_n_combinations}) specifying the
-number of unique combinations/coalitions to apply to each training observation. Maximum allowed value is
-"\code{internal$parameters$used_n_combinations} - 2". By default, we use all coalitions, but this can take a lot of memory
-in larger dimensions. Note that by "all", we mean all coalitions chosen by \code{shapr} to be used. This will be all
-\eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if \code{shapr} is in the exact mode. If the
-user sets a lower value than \code{internal$parameters$used_n_combinations}, then we sample this amount of unique
-coalitions separately for each training observations. That is, on average, all coalitions should be equally trained.}
+\item{regression.surrogate_n_comb}{Integer.
+(default is \code{internal$iter_list[[length(internal$iter_list)]]$n_coalitions}) specifying the
+number of unique coalitions to apply to each training observation. Maximum allowed value is
+"\code{internal$iter_list[[length(internal$iter_list)]]$n_coalitions} - 2".
+By default, we use all coalitions, but this can take a lot of memory in larger dimensions.
+Note that by "all", we mean all coalitions chosen by \code{shapr} to be used.
+This will be all \eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if \code{shapr} is in
+the exact mode.
+If the user sets a lower value than \code{internal$iter_list[[length(internal$iter_list)]]$n_coalitions},
+then we sample this number of unique coalitions separately for each training observation.
+That is, on average, all coalitions should be equally trained.}
\item{timeseries.fixed_sigma_vec}{Numeric. (Default = 2)
Represents the kernel bandwidth in the distance computation. TODO: What length should it have? 1?}
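As a complement to the regression.tune_values entry above, a minimal sketch of a function-valued hyperparameter grid whose size depends on the coalition's dimension; `tree_depth` is a hypothetical tuneable parameter name and must match one declared in `regression.model`:

# `x` is the training data for the current coalition, as described above.
my_tune_values <- function(x) {
  if (ncol(x) <= 2) {
    data.frame(tree_depth = c(1, 2))
  } else {
    data.frame(tree_depth = c(1, 2, 4))
  }
}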
diff --git a/man/setup_computation.Rd b/man/setup_computation.Rd
index f731787e5..ca128d70b 100644
--- a/man/setup_computation.Rd
+++ b/man/setup_computation.Rd
@@ -1,5 +1,5 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/setup_computation.R
+% Please edit documentation in R/shapley_setup.R
\name{setup_computation}
\alias{setup_computation}
\title{Sets up everything for the Shapley values computation in \code{\link[=explain]{explain()}}}
@@ -9,7 +9,8 @@ setup_computation(internal, model, predict_model)
\arguments{
\item{internal}{List.
Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
-The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
+The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{iter_list}, \code{timing_list},
+\code{main_timing_list}, \code{output}, \code{iter_timing_list} and \code{iter_results}.}
\item{model}{Objects.
The model object that ought to be explained.
diff --git a/man/shapley_setup.Rd b/man/shapley_setup.Rd
new file mode 100644
index 000000000..0b96d7871
--- /dev/null
+++ b/man/shapley_setup.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/shapley_setup.R
+\name{shapley_setup}
+\alias{shapley_setup}
+\title{Set up the kernelSHAP framework}
+\usage{
+shapley_setup(internal)
+}
+\arguments{
+\item{internal}{List.
+Not used directly, but passed through from \code{\link[=explain]{explain()}}.}
+}
+\description{
+Set up the kernelSHAP framework
+}
+\keyword{internal}
diff --git a/man/shapley_weights.Rd b/man/shapley_weights.Rd
index 109e68de3..572955a88 100644
--- a/man/shapley_weights.Rd
+++ b/man/shapley_weights.Rd
@@ -1,5 +1,5 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/setup_computation.R
+% Please edit documentation in R/shapley_setup.R
\name{shapley_weights}
\alias{shapley_weights}
\title{Calculate Shapley weight}
@@ -9,7 +9,7 @@ shapley_weights(m, N, n_components, weight_zero_m = 10^6)
\arguments{
\item{m}{Positive integer. Total number of features/feature groups.}
-\item{N}{Positive integer. The number of unique combinations when sampling \code{n_components} features/feature
+\item{N}{Positive integer. The number of unique coalitions when sampling \code{n_components} features/feature
groups, without replacement, from a sample space consisting of \code{m} different features/feature groups.}
\item{n_components}{Positive integer. Represents the number of features/feature groups you want to sample from
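For reference, a minimal R sketch of what the shapley_weights() signature above suggests, assuming the standard kernelSHAP weight w = (m - 1) / (N * s * (m - s)) for a coalition of size s = n_components:

shapley_weights_sketch <- function(m, N, n_components, weight_zero_m = 10^6) {
  w <- (m - 1) / (N * n_components * (m - n_components))
  w[!is.finite(w)] <- weight_zero_m  # empty/grand coalition: replace Inf by a large finite weight
  w
}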
diff --git a/man/test_predict_model.Rd b/man/test_predict_model.Rd
index f428150e0..40adb5b01 100644
--- a/man/test_predict_model.Rd
+++ b/man/test_predict_model.Rd
@@ -17,7 +17,8 @@ See the documentation of \code{\link[=explain]{explain()}} for details.}
\item{internal}{List.
Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
-The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
+The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{iter_list}, \code{timing_list},
+\code{main_timing_list}, \code{output}, \code{iter_timing_list} and \code{iter_results}.}
}
\description{
Model testing function
diff --git a/man/testing_cleanup.Rd b/man/testing_cleanup.Rd
new file mode 100644
index 000000000..3c590807f
--- /dev/null
+++ b/man/testing_cleanup.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/explain.R
+\name{testing_cleanup}
+\alias{testing_cleanup}
+\title{Cleans out certain output arguments to allow perfect reproducibility of the output}
+\usage{
+testing_cleanup(output)
+}
+\description{
+Cleans out certain output arguments to allow perfect reproducibility of the output
+}
+\author{
+Lars Henry Berge Olsen, Martin Jullum
+}
+\keyword{internal}
diff --git a/man/vaeac_check_mask_gen.Rd b/man/vaeac_check_mask_gen.Rd
index 89b9af1db..92cfa921f 100644
--- a/man/vaeac_check_mask_gen.Rd
+++ b/man/vaeac_check_mask_gen.Rd
@@ -9,8 +9,8 @@ vaeac_check_mask_gen(mask_gen_coalitions, mask_gen_coalitions_prob, x_train)
\arguments{
\item{mask_gen_coalitions}{Matrix (default is \code{NULL}). Matrix containing the coalitions that the
\code{vaeac} model will be trained on, see \code{\link[=specified_masks_mask_generator]{specified_masks_mask_generator()}}. This parameter is used internally
-in \code{shapr} when we only consider a subset of coalitions/combinations, i.e., when
-\code{n_combinations} \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
+in \code{shapr} when we only consider a subset of coalitions, i.e., when
+\code{n_coalitions} \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
when \code{group} is specified in \code{\link[=explain]{explain()}}.}
\item{mask_gen_coalitions_prob}{Numeric array (default is \code{NULL}). Array of length equal to the height
diff --git a/man/vaeac_check_parameters.Rd b/man/vaeac_check_parameters.Rd
index faeb6b8c8..d49550d7c 100644
--- a/man/vaeac_check_parameters.Rd
+++ b/man/vaeac_check_parameters.Rd
@@ -130,8 +130,8 @@ model can do arbitrary conditioning as all coalitions will be trained. \code{mas
\item{mask_gen_coalitions}{Matrix (default is \code{NULL}). Matrix containing the coalitions that the
\code{vaeac} model will be trained on, see \code{\link[=specified_masks_mask_generator]{specified_masks_mask_generator()}}. This parameter is used internally
-in \code{shapr} when we only consider a subset of coalitions/combinations, i.e., when
-\code{n_combinations} \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
+in \code{shapr} when we only consider a subset of coalitions, i.e., when
+\code{n_coalitions} \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
when \code{group} is specified in \code{\link[=explain]{explain()}}.}
\item{mask_gen_coalitions_prob}{Numeric array (default is \code{NULL}). Array of length equal to the height
@@ -163,8 +163,22 @@ Note that additional choices are available if \code{vaeac.save_every_nth_epoch}
\code{vaeac.save_every_nth_epoch = 5}, then \code{vaeac.which_vaeac_model} can also take the values \code{"epoch_5"}, \code{"epoch_10"},
\code{"epoch_15"}, and so on.}
-\item{verbose}{Boolean. An integer specifying the level of verbosity. Use \code{0} (default) for no verbosity,
-\code{1} for low verbose, and \code{2} for high verbose.}
+\item{verbose}{String vector or NULL.
+Specifies the verbosity (printout detail level) through one or more of the strings \code{"basic"}, \code{"progress"},
+\code{"convergence"}, \code{"shapley"} and \code{"vS_details"}.
+\code{"basic"} (default) displays basic information about the computation which is being performed.
+\code{"progress"} displays information about where in the calculation process the function currently is.
+\code{"convergence"} displays information on how close to convergence the Shapley value estimates are
+(only when \code{adaptive = TRUE}).
+\code{"shapley"} displays intermediate Shapley value estimates and standard deviations (only when
+\code{adaptive = TRUE}), as well as the final estimates.
+\code{"vS_details"} displays information about the v_S estimates.
+This is most relevant for \verb{approach \%in\% c("regression_separate", "regression_surrogate", "vaeac")}.
+\code{NULL} means no printout.
+Note that any combination of these strings can be used.
+E.g. \code{verbose = c("basic", "vS_details")} will display basic information as well as details about the
+vS estimation process.}
\item{seed}{Positive integer (default is \code{1}). Seed for reproducibility. Specifies the seed before any randomness
based code is being run.}
diff --git a/man/vaeac_check_verbose.Rd b/man/vaeac_check_verbose.Rd
deleted file mode 100644
index 73ab85049..000000000
--- a/man/vaeac_check_verbose.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/approach_vaeac.R
-\name{vaeac_check_verbose}
-\alias{vaeac_check_verbose}
-\title{Function that checks the verbose parameter}
-\usage{
-vaeac_check_verbose(verbose)
-}
-\arguments{
-\item{verbose}{Boolean. An integer specifying the level of verbosity. Use \code{0} (default) for no verbosity,
-\code{1} for low verbose, and \code{2} for high verbose.}
-}
-\value{
-The function does not return anything.
-}
-\description{
-Function that checks the verbose parameter
-}
-\author{
-Lars Henry Berge Olsen
-}
-\keyword{internal}
diff --git a/man/vaeac_get_extra_para_default.Rd b/man/vaeac_get_extra_para_default.Rd
index f2229c3b3..27968c11d 100644
--- a/man/vaeac_get_extra_para_default.Rd
+++ b/man/vaeac_get_extra_para_default.Rd
@@ -78,9 +78,10 @@ during the training of the vaeac model. Used in \code{\link[torch:dataloader]{to
\item{vaeac.batch_size_sampling}{Positive integer (default is \code{NULL}) The number of samples to include in
each batch when generating the Monte Carlo samples. If \code{NULL}, then the function generates the Monte Carlo samples
-for the provided coalitions/combinations and all explicands sent to \code{\link[=explain]{explain()}} at the time.
-The number of coalitions are determined by \code{n_batches} in \code{\link[=explain]{explain()}}. We recommend to tweak \code{n_batches}
-rather than \code{vaeac.batch_size_sampling}. Larger batch sizes are often much faster provided sufficient memory.}
+for the provided coalitions and all explicands sent to \code{\link[=explain]{explain()}} at the time.
+The number of coalitions is determined by the \code{n_batches} used internally by \code{\link[=explain]{explain()}}. We recommend tweaking
+\code{adaptive_arguments$max_batch_size} and \code{adaptive_arguments$min_n_batches}
+rather than \code{vaeac.batch_size_sampling}. Larger batch sizes are often much faster provided sufficient memory.}
\item{vaeac.running_avg_n_values}{Positive integer (default is \code{5}). The number of previous IWAE values to include
when we compute the running means of the IWAE criterion.}
@@ -112,8 +113,8 @@ model can do arbitrary conditioning as all coalitions will be trained. \code{vae
\item{vaeac.mask_gen_coalitions}{Matrix (default is \code{NULL}). Matrix containing the coalitions that the
\code{vaeac} model will be trained on, see \code{\link[=specified_masks_mask_generator]{specified_masks_mask_generator()}}. This parameter is used internally
-in \code{shapr} when we only consider a subset of coalitions/combinations, i.e., when
-\code{n_combinations} \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
+in \code{shapr} when we only consider a subset of coalitions, i.e., when
+\code{n_coalitions} \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
when \code{group} is specified in \code{\link[=explain]{explain()}}.}
\item{vaeac.mask_gen_coalitions_prob}{Numeric array (default is \code{NULL}). Array of length equal to the height
diff --git a/man/vaeac_get_mask_generator_name.Rd b/man/vaeac_get_mask_generator_name.Rd
index 8ea86c356..4fdfeae2b 100644
--- a/man/vaeac_get_mask_generator_name.Rd
+++ b/man/vaeac_get_mask_generator_name.Rd
@@ -14,8 +14,8 @@ vaeac_get_mask_generator_name(
\arguments{
\item{mask_gen_coalitions}{Matrix (default is \code{NULL}). Matrix containing the coalitions that the
\code{vaeac} model will be trained on, see \code{\link[=specified_masks_mask_generator]{specified_masks_mask_generator()}}. This parameter is used internally
-in \code{shapr} when we only consider a subset of coalitions/combinations, i.e., when
-\code{n_combinations} \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
+in \code{shapr} when we only consider a subset of coalitions, i.e., when
+\code{n_coalitions} \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
when \code{group} is specified in \code{\link[=explain]{explain()}}.}
\item{mask_gen_coalitions_prob}{Numeric array (default is \code{NULL}). Array of length equal to the height
@@ -27,8 +27,22 @@ of \code{mask_gen_coalitions} containing the probabilities of sampling the corre
model can do arbitrary conditioning as all coalitions will be trained. \code{masking_ratio} will be overruled if
\code{mask_gen_coalitions} is specified.}
-\item{verbose}{Boolean. An integer specifying the level of verbosity. Use \code{0} (default) for no verbosity,
-\code{1} for low verbose, and \code{2} for high verbose.}
+\item{verbose}{String vector or NULL.
+Specifies the verbosity (printout detail level) through one or more of the strings \code{"basic"}, \code{"progress"},
+\code{"convergence"}, \code{"shapley"} and \code{"vS_details"}.
+\code{"basic"} (default) displays basic information about the computation which is being performed.
+\code{"progress"} displays information about where in the calculation process the function currently is.
+\code{"convergence"} displays information on how close to convergence the Shapley value estimates are
+(only when \code{adaptive = TRUE}).
+\code{"shapley"} displays intermediate Shapley value estimates and standard deviations (only when
+\code{adaptive = TRUE}), as well as the final estimates.
+\code{"vS_details"} displays information about the v_S estimates.
+This is most relevant for \verb{approach \%in\% c("regression_separate", "regression_surrogate", "vaeac")}.
+\code{NULL} means no printout.
+Note that any combination of these strings can be used.
+E.g. \code{verbose = c("basic", "vS_details")} will display basic information as well as details about the
+vS estimation process.}
}
\value{
The function does not return anything.
diff --git a/man/vaeac_get_x_explain_extended.Rd b/man/vaeac_get_x_explain_extended.Rd
index 91b76a56b..7f9bb1a10 100644
--- a/man/vaeac_get_x_explain_extended.Rd
+++ b/man/vaeac_get_x_explain_extended.Rd
@@ -12,8 +12,8 @@ Contains the the features, whose predictions ought to be explained.}
\item{S}{The \code{internal$objects$S} matrix containing the possible coalitions.}
-\item{index_features}{Positive integer vector. Specifies the indices of combinations to
-apply to the present method. \code{NULL} means all combinations. Only used internally.}
+\item{index_features}{Positive integer vector. Specifies the coalition indices (\code{id_coalition}) to
+apply to the present method. \code{NULL} means all coalitions. Only used internally.}
}
\value{
The extended version of \code{x_explain} where the masks from \code{S} with indices \code{index_features} have been applied.
diff --git a/man/vaeac_impute_missing_entries.Rd b/man/vaeac_impute_missing_entries.Rd
index a3dda74f4..2cb567060 100644
--- a/man/vaeac_impute_missing_entries.Rd
+++ b/man/vaeac_impute_missing_entries.Rd
@@ -6,12 +6,12 @@
\usage{
vaeac_impute_missing_entries(
x_explain_with_NaNs,
- n_samples,
+ n_MC_samples,
vaeac_model,
checkpoint,
sampler,
batch_size,
- verbose = 0,
+ verbose = NULL,
seed = NULL,
n_explain = NULL,
index_features = NULL
@@ -20,7 +20,7 @@ vaeac_impute_missing_entries(
\arguments{
\item{x_explain_with_NaNs}{A 2D matrix, where the missing entries to impute are represented by \code{NaN}.}
-\item{n_samples}{Integer. The number of imputed versions we create for each row in \code{x_explain_with_NaNs}.}
+\item{n_MC_samples}{Integer. The number of imputed versions we create for each row in \code{x_explain_with_NaNs}.}
\item{vaeac_model}{An initialized \code{vaeac} model that we are going to use to generate the MC samples.}
@@ -31,8 +31,22 @@ vaeac_impute_missing_entries(
\item{batch_size}{Positive integer (default is \code{64}). The number of samples to include in each batch
during the training of the vaeac model. Used in \code{\link[torch:dataloader]{torch::dataloader()}}.}
-\item{verbose}{Boolean. An integer specifying the level of verbosity. Use \code{0} (default) for no verbosity,
-\code{1} for low verbose, and \code{2} for high verbose.}
+\item{verbose}{String vector or NULL.
+Specifies the verbosity (printout detail level) through one or more of the strings \code{"basic"}, \code{"progress"},
+\code{"convergence"}, \code{"shapley"} and \code{"vS_details"}.
+\code{"basic"} (default) displays basic information about the computation which is being performed.
+\code{"progress"} displays information about where in the calculation process the function currently is.
+\code{"convergence"} displays information on how close to convergence the Shapley value estimates are
+(only when \code{adaptive = TRUE}).
+\code{"shapley"} displays intermediate Shapley value estimates and standard deviations (only when
+\code{adaptive = TRUE}), as well as the final estimates.
+\code{"vS_details"} displays information about the v_S estimates.
+This is most relevant for \verb{approach \%in\% c("regression_separate", "regression_surrogate", "vaeac")}.
+\code{NULL} means no printout.
+Note that any combination of these strings can be used.
+E.g. \code{verbose = c("basic", "vS_details")} will display basic information as well as details about the
+vS estimation process.}
\item{seed}{Positive integer (default is \code{1}). Seed for reproducibility. Specifies the seed before any randomness
based code is being run.}
@@ -42,7 +56,8 @@ based code is being run.}
\item{index_features}{Optional integer vector. Used internally in shapr package to index the coalitions.}
}
\value{
-A data.table where the missing values (\code{NaN}) in \code{x_explain_with_NaNs} have been imputed \code{n_samples} times.
+A data.table where the missing values (\code{NaN}) in \code{x_explain_with_NaNs} have been imputed \code{n_MC_samples}
+times.
The data table will contain extra id columns if \code{index_features} and \code{n_explain} are provided.
}
\description{
diff --git a/man/vaeac_plot_eval_crit.Rd b/man/vaeac_plot_eval_crit.Rd
index c94895d1b..c7086b853 100644
--- a/man/vaeac_plot_eval_crit.Rd
+++ b/man/vaeac_plot_eval_crit.Rd
@@ -80,7 +80,7 @@ explanation_paired <- explain(
x_train = x_train,
approach = approach,
prediction_zero = p0,
- n_samples = 1, # As we are only interested in the training of the vaeac
+ n_MC_samples = 1, # As we are only interested in the training of the vaeac
vaeac.epochs = 10, # Should be higher in applications.
vaeac.n_vaeacs_initialize = 1,
vaeac.width = 16,
@@ -94,7 +94,7 @@ explanation_regular <- explain(
x_train = x_train,
approach = approach,
prediction_zero = p0,
- n_samples = 1, # As we are only interested in the training of the vaeac
+ n_MC_samples = 1, # As we are only interested in the training of the vaeac
vaeac.epochs = 10, # Should be higher in applications.
vaeac.width = 16,
vaeac.depth = 2,
diff --git a/man/vaeac_plot_imputed_ggpairs.Rd b/man/vaeac_plot_imputed_ggpairs.Rd
index b667281f6..f95abe5b9 100644
--- a/man/vaeac_plot_imputed_ggpairs.Rd
+++ b/man/vaeac_plot_imputed_ggpairs.Rd
@@ -109,7 +109,7 @@ explanation <- explain(
x_train = x_train,
approach = "vaeac",
prediction_zero = mean(y_train),
- n_samples = 1,
+ n_MC_samples = 1,
vaeac.epochs = 10,
vaeac.n_vaeacs_initialize = 1
)
diff --git a/man/vaeac_prep_message_batch.Rd b/man/vaeac_prep_message_batch.Rd
deleted file mode 100644
index 7dd4d773a..000000000
--- a/man/vaeac_prep_message_batch.Rd
+++ /dev/null
@@ -1,23 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/approach_vaeac.R
-\name{vaeac_prep_message_batch}
-\alias{vaeac_prep_message_batch}
-\title{Produce message about which batch prepare_data is working on}
-\usage{
-vaeac_prep_message_batch(internal, index_features)
-}
-\arguments{
-\item{internal}{List.
-Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
-The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
-
-\item{index_features}{Positive integer vector. Specifies the indices of combinations to
-apply to the present method. \code{NULL} means all combinations. Only used internally.}
-}
-\description{
-Produce message about which batch prepare_data is working on
-}
-\author{
-Lars Henry Berge Olsen
-}
-\keyword{internal}
diff --git a/man/vaeac_train_model.Rd b/man/vaeac_train_model.Rd
index f21fbb6f8..cfa8e88b6 100644
--- a/man/vaeac_train_model.Rd
+++ b/man/vaeac_train_model.Rd
@@ -130,8 +130,8 @@ model can do arbitrary conditioning as all coalitions will be trained. \code{mas
\item{mask_gen_coalitions}{Matrix (default is \code{NULL}). Matrix containing the coalitions that the
\code{vaeac} model will be trained on, see \code{\link[=specified_masks_mask_generator]{specified_masks_mask_generator()}}. This parameter is used internally
-in \code{shapr} when we only consider a subset of coalitions/combinations, i.e., when
-\code{n_combinations} \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
+in \code{shapr} when we only consider a subset of coalitions, i.e., when
+\code{n_coalitions} \eqn{< 2^{n_{\text{features}}}}, and for group Shapley, i.e.,
when \code{group} is specified in \code{\link[=explain]{explain()}}.}
\item{mask_gen_coalitions_prob}{Numeric array (default is \code{NULL}). Array of length equal to the height
@@ -163,8 +163,22 @@ Note that additional choices are available if \code{vaeac.save_every_nth_epoch}
\code{vaeac.save_every_nth_epoch = 5}, then \code{vaeac.which_vaeac_model} can also take the values \code{"epoch_5"}, \code{"epoch_10"},
\code{"epoch_15"}, and so on.}
-\item{verbose}{Boolean. An integer specifying the level of verbosity. Use \code{0} (default) for no verbosity,
-\code{1} for low verbose, and \code{2} for high verbose.}
+\item{verbose}{String vector or NULL.
+Specifies the verbosity (printout detail level) through one or more of the strings \code{"basic"}, \code{"progress"},
+\code{"convergence"}, \code{"shapley"} and \code{"vS_details"}.
+\code{"basic"} (default) displays basic information about the computation which is being performed.
+\code{"progress"} displays information about where in the calculation process the function currently is.
+\code{"convergence"} displays information on how close to convergence the Shapley value estimates are
+(only when \code{adaptive = TRUE}).
+\code{"shapley"} displays intermediate Shapley value estimates and standard deviations (only when
+\code{adaptive = TRUE}), as well as the final estimates.
+\code{"vS_details"} displays information about the v_S estimates.
+This is most relevant for \verb{approach \%in\% c("regression_separate", "regression_surrogate", "vaeac")}.
+\code{NULL} means no printout.
+Note that any combination of these strings can be used.
+E.g. \code{verbose = c("basic", "vS_details")} will display basic information as well as details about the
+vS estimation process.}
\item{seed}{Positive integer (default is \code{1}). Seed for reproducibility. Specifies the seed before any randomness
based code is being run.}
diff --git a/man/vaeac_train_model_auxiliary.Rd b/man/vaeac_train_model_auxiliary.Rd
index 8aec55154..889b5d5d5 100644
--- a/man/vaeac_train_model_auxiliary.Rd
+++ b/man/vaeac_train_model_auxiliary.Rd
@@ -43,8 +43,22 @@ to compute the IWAE criterion when validating the vaeac model on the validation
The number of previous IWAE values to include
when we compute the running means of the IWAE criterion.}
-\item{verbose}{Boolean. An integer specifying the level of verbosity. Use \code{0} (default) for no verbosity,
-\code{1} for low verbose, and \code{2} for high verbose.}
+\item{verbose}{String vector or NULL.
+Specifies the verbosity (printout detail level) through one or more of the strings \code{"basic"}, \code{"progress"},
+\code{"convergence"}, \code{"shapley"} and \code{"vS_details"}.
+\code{"basic"} (default) displays basic information about the computation which is being performed.
+\code{"progress"} displays information about where in the calculation process the function currently is.
+\code{"convergence"} displays information on how close to convergence the Shapley value estimates are
+(only when \code{adaptive = TRUE}).
+\code{"shapley"} displays intermediate Shapley value estimates and standard deviations (only when
+\code{adaptive = TRUE}), as well as the final estimates.
+\code{"vS_details"} displays information about the v_S estimates.
+This is most relevant for \verb{approach \%in\% c("regression_separate", "regression_surrogate", "vaeac")}.
+\code{NULL} means no printout.
+Note that any combination of these strings can be used.
+E.g. \code{verbose = c("basic", "vS_details")} will display basic information as well as details about the
+vS estimation process.}
\item{cuda}{Logical (default is \code{FALSE}). If \code{TRUE}, then the \code{vaeac} model will be trained using cuda/GPU.
If \code{\link[torch:cuda_is_available]{torch::cuda_is_available()}} is \code{FALSE}, then we fall back to using the CPU. If \code{FALSE}, we use the CPU. Using a GPU
diff --git a/man/vaeac_train_model_continue.Rd b/man/vaeac_train_model_continue.Rd
index 36b946e73..645025ea0 100644
--- a/man/vaeac_train_model_continue.Rd
+++ b/man/vaeac_train_model_continue.Rd
@@ -10,7 +10,7 @@ vaeac_train_model_continue(
lr_new = NULL,
x_train = NULL,
save_data = FALSE,
- verbose = 0,
+ verbose = NULL,
seed = 1
)
}
@@ -26,8 +26,22 @@ vaeac_train_model_continue(
\item{save_data}{Logical (default is \code{FALSE}). If \code{TRUE}, then the data is stored together with
the model. Useful if one is to continue to train the model later using \code{\link[=vaeac_train_model_continue]{vaeac_train_model_continue()}}.}
-\item{verbose}{Boolean. An integer specifying the level of verbosity. Use \code{0} (default) for no verbosity,
-\code{1} for low verbose, and \code{2} for high verbose.}
+\item{verbose}{String vector or NULL.
+Specifies the verbosity (printout detail level) through one or more of the strings \code{"basic"}, \code{"progress"},
+\code{"convergence"}, \code{"shapley"} and \code{"vS_details"}.
+\code{"basic"} (default) displays basic information about the computation which is being performed.
+\code{"progress"} displays information about where in the calculation process the function currently is.
+\code{"convergence"} displays information on how close to convergence the Shapley value estimates are
+(only when \code{adaptive = TRUE}).
+\code{"shapley"} displays intermediate Shapley value estimates and standard deviations (only when
+\code{adaptive = TRUE}), as well as the final estimates.
+\code{"vS_details"} displays information about the v_S estimates.
+This is most relevant for \verb{approach \%in\% c("regression_separate", "regression_surrogate", "vaeac")}.
+\code{NULL} means no printout.
+Note that any combination of these strings can be used.
+E.g. \code{verbose = c("basic", "vS_details")} will display basic information as well as details about the
+vS estimation process.}
\item{seed}{Positive integer (default is \code{1}). Seed for reproducibility. Specifies the seed before any randomness
based code is being run.}
diff --git a/man/weight_matrix.Rd b/man/weight_matrix.Rd
index 734160661..043a46c4a 100644
--- a/man/weight_matrix.Rd
+++ b/man/weight_matrix.Rd
@@ -1,19 +1,17 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/setup_computation.R
+% Please edit documentation in R/shapley_setup.R
\name{weight_matrix}
\alias{weight_matrix}
\title{Calculate weighted matrix}
\usage{
-weight_matrix(X, normalize_W_weights = TRUE, is_groupwise = FALSE)
+weight_matrix(X, normalize_W_weights = TRUE)
}
\arguments{
\item{X}{data.table}
-\item{normalize_W_weights}{Logical. Whether to normalize the weights for the combinations to sum to 1 for
-increased numerical stability before solving the WLS (weighted least squares). Applies to all combinations
-except combination \code{1} and \code{2^m}.}
-
-\item{is_groupwise}{Logical. Indicating whether group wise Shapley values are to be computed.}
+\item{normalize_W_weights}{Logical. Whether to normalize the weights for the coalitions to sum to 1 for
+increased numerical stability before solving the WLS (weighted least squares). Applies to all coalitions
+except coalition \code{1} and \code{2^m}.}
}
\value{
Numeric matrix. See \code{\link[=weight_matrix_cpp]{weight_matrix_cpp()}} for more information.
diff --git a/man/weight_matrix_cpp.Rd b/man/weight_matrix_cpp.Rd
index 054764afe..0a6505b9f 100644
--- a/man/weight_matrix_cpp.Rd
+++ b/man/weight_matrix_cpp.Rd
@@ -4,10 +4,10 @@
\alias{weight_matrix_cpp}
\title{Calculate weight matrix}
\usage{
-weight_matrix_cpp(subsets, m, n, w)
+weight_matrix_cpp(coalitions, m, n, w)
}
\arguments{
-\item{subsets}{List. Each of the elements equals an integer
+\item{coalitions}{List. Each of the elements equals an integer
vector representing a valid combination of features/feature groups.}
\item{m}{Integer. Number of features/feature groups}
@@ -16,7 +16,7 @@ vector representing a valid combination of features/feature groups.}
\item{w}{Numeric vector of length \code{n}, i.e. \code{w[i]} equals
the Shapley weight of feature/feature group combination \code{i}, represented by
-\code{subsets[[i]]}.}
+\code{coalitions[[i]]}.}
}
\value{
Matrix of dimension n x m + 1
@@ -25,6 +25,6 @@ Matrix of dimension n x m + 1
Calculate weight matrix
}
\author{
-Nikolai Sellereite
+Nikolai Sellereite, Martin Jullum
}
\keyword{internal}
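A minimal R sketch of the computation weight_matrix_cpp() is documented to perform, assuming the standard kernelSHAP weighted least squares solution R = (Z'WZ)^{-1} Z'W with W = diag(w), so that phi = R %*% v(S); dimension conventions may differ from the C++ return:

# `coalitions` is a list of integer vectors (1-based feature indices),
# `m` the number of features, `w` the Shapley weights of the coalitions.
weight_matrix_sketch <- function(coalitions, m, w) {
  n <- length(coalitions)
  Z <- matrix(0, nrow = n, ncol = m + 1)
  Z[, 1] <- 1  # intercept column; Z[i, j + 1] = 1 iff feature j is in coalition i
  for (i in seq_len(n)) Z[i, coalitions[[i]] + 1] <- 1
  solve(t(Z) %*% (w * Z), t(Z) %*% diag(w))  # (Z' W Z)^{-1} Z' W
}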
diff --git a/src/Copula.cpp b/src/Copula.cpp
index 732ed3a4f..3b9d92248 100644
--- a/src/Copula.cpp
+++ b/src/Copula.cpp
@@ -54,8 +54,8 @@ arma::vec quantile_type7_cpp(const arma::vec& x, const arma::vec& probs) {
// [[Rcpp::export]]
arma::mat inv_gaussian_transform_cpp(const arma::mat& z, const arma::mat& x) {
int n_features = z.n_cols;
- int n_samples = z.n_rows;
- arma::mat z_new(n_samples, n_features);
+ int n_MC_samples = z.n_rows;
+ arma::mat z_new(n_MC_samples, n_features);
arma::mat u = arma::normcdf(z);
for (int feature_idx = 0; feature_idx < n_features; feature_idx++) {
z_new.col(feature_idx) = quantile_type7_cpp(x.col(feature_idx), u.col(feature_idx));
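An R analogue of inv_gaussian_transform_cpp() above, included as a hedged sketch: standard-normal samples are pushed through the Gaussian CDF and mapped back to the data scale with type-7 empirical quantiles, matching quantile_type7_cpp():

inv_gaussian_transform_sketch <- function(z, x) {
  u <- pnorm(z)  # map N(0,1) samples to (0,1)
  sapply(seq_len(ncol(z)), function(j) {
    quantile(x[, j], probs = u[, j], type = 7, names = FALSE)
  })
}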
@@ -65,7 +65,7 @@ arma::mat inv_gaussian_transform_cpp(const arma::mat& z, const arma::mat& x) {
//' Generate (Gaussian) Copula MC samples
//'
-//' @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the
+//' @param MC_samples_mat arma::mat. Matrix of dimension (`n_MC_samples`, `n_features`) containing samples from the
//' univariate standard normal.
//' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations
//' to explain on the original scale.
@@ -73,7 +73,7 @@ arma::mat inv_gaussian_transform_cpp(const arma::mat& z, const arma::mat& x) {
//' observations to explain after being transformed using the Gaussian transform, i.e., the samples have been
//' transformed to a standardized normal distribution.
//' @param x_train_mat arma::mat. Matrix of dimension (`n_train`, `n_features`) containing the training observations.
-//' @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of
+//' @param S arma::mat. Matrix of dimension (`n_coalitions`, `n_features`) containing binary representations of
//' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
//' This is not a problem internally in shapr as the empty and grand coalitions are treated differently.
//' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature after being transformed
@@ -82,8 +82,8 @@ arma::mat inv_gaussian_transform_cpp(const arma::mat& z, const arma::mat& x) {
//' between all pairs of features after being transformed using the Gaussian transform, i.e., the samples have been
//' transformed to a standardized normal distribution.
//'
-//' @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where
-//' the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian
+//' @return An arma::cube/3D array of dimension (`n_MC_samples`, `n_explain` * `n_coalitions`, `n_features`), where
+//' the columns (_,j,_) are matrices of dimension (`n_MC_samples`, `n_features`) containing the conditional Gaussian
//' copula MC samples for each explicand and coalition on the original scale.
//'
//' @export
@@ -99,13 +99,13 @@ arma::cube prepare_data_copula_cpp(const arma::mat& MC_samples_mat,
const arma::mat& cov_mat) {
int n_explain = x_explain_mat.n_rows;
- int n_samples = MC_samples_mat.n_rows;
+ int n_MC_samples = MC_samples_mat.n_rows;
int n_features = MC_samples_mat.n_cols;
int n_coalitions = S.n_rows;
// Initialize auxiliary matrix and result cube
- arma::mat aux_mat(n_samples, n_features);
- arma::cube result_cube(n_samples, n_explain*n_coalitions, n_features);
+ arma::mat aux_mat(n_MC_samples, n_features);
+ arma::cube result_cube(n_MC_samples, n_explain*n_coalitions, n_features);
// Iterate over the coalitions
for (int S_ind = 0; S_ind < n_coalitions; S_ind++) {
@@ -150,7 +150,7 @@ arma::cube prepare_data_copula_cpp(const arma::mat& MC_samples_mat,
// Transform the MC samples to be from N(mu_{Sbar|S}, Sigma_{Sbar|S}) for one coalition and one explicand
arma::mat MC_samples_mat_now_now =
- MC_samples_mat_now + repmat(trans(x_Sbar_gaussian_mean.col(idx_now)), n_samples, 1);
+ MC_samples_mat_now + repmat(trans(x_Sbar_gaussian_mean.col(idx_now)), n_MC_samples, 1);
// Transform the MC to the original scale using the inverse Gaussian transform
arma::mat MC_samples_mat_now_now_trans =
@@ -158,7 +158,7 @@ arma::cube prepare_data_copula_cpp(const arma::mat& MC_samples_mat,
// Insert the generate Gaussian copula MC samples and the feature values we condition on into an auxiliary matrix
aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now_now_trans;
- aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1);
+ aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_MC_samples, 1);
// Insert the auxiliary matrix into the result cube
result_cube.col(S_ind*n_explain + idx_now) = aux_mat;
diff --git a/src/Gaussian.cpp b/src/Gaussian.cpp
index c375ed510..91a875317 100644
--- a/src/Gaussian.cpp
+++ b/src/Gaussian.cpp
@@ -5,19 +5,19 @@ using namespace Rcpp;
//' Generate Gaussian MC samples
//'
-//' @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the
+//' @param MC_samples_mat arma::mat. Matrix of dimension (`n_MC_samples`, `n_features`) containing samples from the
//' univariate standard normal.
//' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations
//' to explain.
-//' @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of
+//' @param S arma::mat. Matrix of dimension (`n_coalitions`, `n_features`) containing binary representations of
//' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
//' This is not a problem internally in shapr as the empty and grand coalitions are treated differently.
//' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature.
//' @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance
//' between all pairs of features.
//'
-//' @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where
-//' the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian
+//' @return An arma::cube/3D array of dimension (`n_MC_samples`, `n_explain` * `n_coalitions`, `n_features`), where
+//' the columns (_,j,_) are matrices of dimension (`n_MC_samples`, `n_features`) containing the conditional Gaussian
//' MC samples for each explicand and coalition.
//'
//' @export
@@ -31,13 +31,13 @@ arma::cube prepare_data_gaussian_cpp(const arma::mat& MC_samples_mat,
const arma::mat& cov_mat) {
int n_explain = x_explain_mat.n_rows;
- int n_samples = MC_samples_mat.n_rows;
+ int n_MC_samples = MC_samples_mat.n_rows;
int n_features = MC_samples_mat.n_cols;
int n_coalitions = S.n_rows;
// Initialize auxiliary matrix and result cube
- arma::mat aux_mat(n_samples, n_features);
- arma::cube result_cube(n_samples, n_explain*n_coalitions, n_features);
+ arma::mat aux_mat(n_MC_samples, n_features);
+ arma::cube result_cube(n_MC_samples, n_explain*n_coalitions, n_features);
// Iterate over the coalitions
for (int S_ind = 0; S_ind < n_coalitions; S_ind++) {
@@ -78,8 +78,8 @@ arma::cube prepare_data_gaussian_cpp(const arma::mat& MC_samples_mat,
// Loop over the different explicands and combine the generated values with the values we conditioned on
for (int idx_now = 0; idx_now < n_explain; idx_now++) {
- aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1);
- aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now + repmat(trans(x_Sbar_mean.col(idx_now)), n_samples, 1);
+ aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_MC_samples, 1);
+ aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now + repmat(trans(x_Sbar_mean.col(idx_now)), n_MC_samples, 1);
result_cube.col(S_ind*n_explain + idx_now) = aux_mat;
}
}
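The loop above conditions a multivariate Gaussian on the features in S; a minimal R sketch of that single conditioning step, using the standard formulas mu_{Sbar|S} = mu_Sbar + Sigma_{Sbar,S} Sigma_{S,S}^{-1} (x_S - mu_S) and Sigma_{Sbar|S} = Sigma_{Sbar,Sbar} - Sigma_{Sbar,S} Sigma_{S,S}^{-1} Sigma_{S,Sbar} (the helper and its arguments are hypothetical):

cond_gaussian_samples <- function(x_star_S, mu, cov_mat, S_idx, Sbar_idx, n_MC_samples) {
  cov_SS       <- cov_mat[S_idx, S_idx, drop = FALSE]
  cov_SbarS    <- cov_mat[Sbar_idx, S_idx, drop = FALSE]
  cov_SbarSbar <- cov_mat[Sbar_idx, Sbar_idx, drop = FALSE]
  mu_cond    <- mu[Sbar_idx] + cov_SbarS %*% solve(cov_SS, x_star_S - mu[S_idx])
  sigma_cond <- cov_SbarSbar - cov_SbarS %*% solve(cov_SS, t(cov_SbarS))
  z <- matrix(rnorm(n_MC_samples * length(Sbar_idx)), nrow = n_MC_samples)
  sweep(z %*% chol(sigma_cond), 2, drop(mu_cond), `+`)  # one MC sample per row
}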
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index c95d55541..cde3e3a84 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -179,28 +179,28 @@ BEGIN_RCPP
END_RCPP
}
// weight_matrix_cpp
-arma::mat weight_matrix_cpp(List subsets, int m, int n, NumericVector w);
-RcppExport SEXP _shapr_weight_matrix_cpp(SEXP subsetsSEXP, SEXP mSEXP, SEXP nSEXP, SEXP wSEXP) {
+arma::mat weight_matrix_cpp(List coalitions, int m, int n, NumericVector w);
+RcppExport SEXP _shapr_weight_matrix_cpp(SEXP coalitionsSEXP, SEXP mSEXP, SEXP nSEXP, SEXP wSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
- Rcpp::traits::input_parameter< List >::type subsets(subsetsSEXP);
+ Rcpp::traits::input_parameter< List >::type coalitions(coalitionsSEXP);
Rcpp::traits::input_parameter< int >::type m(mSEXP);
Rcpp::traits::input_parameter< int >::type n(nSEXP);
Rcpp::traits::input_parameter< NumericVector >::type w(wSEXP);
- rcpp_result_gen = Rcpp::wrap(weight_matrix_cpp(subsets, m, n, w));
+ rcpp_result_gen = Rcpp::wrap(weight_matrix_cpp(coalitions, m, n, w));
return rcpp_result_gen;
END_RCPP
}
-// feature_matrix_cpp
-NumericMatrix feature_matrix_cpp(List features, int m);
-RcppExport SEXP _shapr_feature_matrix_cpp(SEXP featuresSEXP, SEXP mSEXP) {
+// coalition_matrix_cpp
+NumericMatrix coalition_matrix_cpp(List coalitions, int m);
+RcppExport SEXP _shapr_coalition_matrix_cpp(SEXP coalitionsSEXP, SEXP mSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
- Rcpp::traits::input_parameter< List >::type features(featuresSEXP);
+ Rcpp::traits::input_parameter< List >::type coalitions(coalitionsSEXP);
Rcpp::traits::input_parameter< int >::type m(mSEXP);
- rcpp_result_gen = Rcpp::wrap(feature_matrix_cpp(features, m));
+ rcpp_result_gen = Rcpp::wrap(coalition_matrix_cpp(coalitions, m));
return rcpp_result_gen;
END_RCPP
}
@@ -219,7 +219,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_shapr_sample_features_cpp", (DL_FUNC) &_shapr_sample_features_cpp, 2},
{"_shapr_observation_impute_cpp", (DL_FUNC) &_shapr_observation_impute_cpp, 5},
{"_shapr_weight_matrix_cpp", (DL_FUNC) &_shapr_weight_matrix_cpp, 4},
- {"_shapr_feature_matrix_cpp", (DL_FUNC) &_shapr_feature_matrix_cpp, 2},
+ {"_shapr_coalition_matrix_cpp", (DL_FUNC) &_shapr_coalition_matrix_cpp, 2},
{NULL, NULL, 0}
};
diff --git a/src/impute_data.cpp b/src/impute_data.cpp
index cced8fa51..2c6f4d4da 100644
--- a/src/impute_data.cpp
+++ b/src/impute_data.cpp
@@ -13,7 +13,7 @@ using namespace Rcpp;
//'
//' @param xtest Numeric matrix. Represents a single test observation.
//'
-//' @param S Integer matrix of dimension \code{n_combinations x m}, where \code{n_combinations} equals
+//' @param S Integer matrix of dimension \code{n_coalitions x m}, where \code{n_coalitions} equals
//' the total number of sampled/non-sampled feature combinations and \code{m} equals
//' the total number of unique features. Note that \code{m = ncol(xtrain)}. See details
//' for more information.
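For concreteness, a small sketch of the S matrix described above for m = 3 features in exact mode; row i has a 1 in column j if and only if feature j is conditioned on in coalition i:

S <- rbind(c(0, 0, 0),              # empty coalition
           c(1, 0, 0), c(0, 1, 0), c(0, 0, 1),
           c(1, 1, 0), c(1, 0, 1), c(0, 1, 1),
           c(1, 1, 1))              # grand coalition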
diff --git a/src/weighted_matrix.cpp b/src/weighted_matrix.cpp
index 8b71520ad..79eaa8762 100644
--- a/src/weighted_matrix.cpp
+++ b/src/weighted_matrix.cpp
@@ -1,29 +1,32 @@
+#define ARMA_WARN_LEVEL 1 // Disables the warning regarding approximate solution for small n_coalitions
#include <RcppArmadillo.h>
using namespace Rcpp;
+
+
//' Calculate weight matrix
//'
-//' @param subsets List. Each of the elements equals an integer
+//' @param coalitions List. Each of the elements equals an integer
//' vector representing a valid combination of features/feature groups.
//' @param m Integer. Number of features/feature groups
//' @param n Integer. Number of combinations
//' @param w Numeric vector of length \code{n}, i.e. \code{w[i]} equals
//' the Shapley weight of feature/feature group combination \code{i}, represented by
-//' \code{subsets[[i]]}.
+//' \code{coalitions[[i]]}.
//'
//' @export
//' @keywords internal
//'
//' @return Matrix of dimension (m + 1) x n
-//' @author Nikolai Sellereite
+//' @author Nikolai Sellereite, Martin Jullum
// [[Rcpp::export]]
-arma::mat weight_matrix_cpp(List subsets, int m, int n, NumericVector w){
+arma::mat weight_matrix_cpp(List coalitions, int m, int n, NumericVector w){
// Note that Z is a n x (m + 1) matrix, where m is the number
- // of unique subsets. All elements in the first column are equal to 1.
+ // of features/feature groups. All elements in the first column are equal to 1.
// For j > 0, Z(i, j) = 1 if and only if feature/feature group j is present in
- // the ith combination of subsets. In example, if Z(i, j) = 1 we know that
- // j is present in subsets[i].
+ // the ith coalition. For example, if Z(i, j) = 1 we know that
+ // j is present in coalitions[i].
// Note that w represents the diagonal in W, where W is a diagonal
// n x n matrix.
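+ // The matrix R returned below corresponds, in the standard KernelSHAP
+ // formulation, to the weighted least squares operator
+ // R = (Z^T W Z)^{-1} Z^T W: multiplying R by the n-vector of estimated
+ // contribution function values v(S) yields the m + 1 coefficients, i.e. the
+ // prediction for the empty coalition followed by the m Shapley values.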
@@ -51,8 +54,8 @@ arma::mat weight_matrix_cpp(List subsets, int m, int n, NumericVector w){
// Set all elements in the first column equal to 1
Z(i, 0) = 1;
- // Extract subsets
- subset_vec = subsets[i];
+ // Extract coalitions
+ subset_vec = coalitions[i];
n_elements = subset_vec.length();
if (n_elements > 0) {
for (int j = 0; j < n_elements; j++)
@@ -74,32 +77,32 @@ arma::mat weight_matrix_cpp(List subsets, int m, int n, NumericVector w){
return R;
}
-//' Get feature matrix
+//' Get coalition matrix
//'
-//' @param features List
-//' @param m Positive integer. Total number of features
+//' @param coalitions List. Each element is an integer vector giving the features in one coalition.
+//' @param m Positive integer. Total number of features/feature groups
//'
//' @export
//' @keywords internal
//'
//' @return Matrix
-//' @author Nikolai Sellereite
+//' @author Nikolai Sellereite, Martin Jullum
// [[Rcpp::export]]
-NumericMatrix feature_matrix_cpp(List features, int m) {
+NumericMatrix coalition_matrix_cpp(List coalitions, int m) {
// Define variables
- int n_combinations;
- n_combinations = features.length();
- NumericMatrix A(n_combinations, m);
+ int n_coalitions;
+ n_coalitions = coalitions.length();
+ NumericMatrix A(n_coalitions, m);
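+ // Row i of A is the binary membership vector of coalitions[i]:
+ // A(i, j - 1) equals 1 when feature j is part of the coalition and 0 otherwise.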
// Error-check
- IntegerVector features_zero = features[0];
+ IntegerVector features_zero = coalitions[0];
if (features_zero.length() > 0)
- Rcpp::stop("The first element of features should be an empty vector, i.e. integer(0)");
+ Rcpp::stop("Internal error: The first element of coalitions should be an empty vector, i.e. integer(0)");
- for (int i = 1; i < n_combinations; ++i) {
+ for (int i = 1; i < n_coalitions; ++i) {
- NumericVector feature_vec = features[i];
+ NumericVector feature_vec = coalitions[i];
for (int j = 0; j < feature_vec.length(); ++j) {
diff --git a/tests/testthat/test-forecast-output.R b/tests/testthat/0test-forecast-output.R
similarity index 91%
rename from tests/testthat/test-forecast-output.R
rename to tests/testthat/0test-forecast-output.R
index c2bcc000b..6468ee743 100644
--- a/tests/testthat/test-forecast-output.R
+++ b/tests/testthat/0test-forecast-output.R
@@ -1,6 +1,7 @@
test_that("forecast_output_ar_numeric", {
expect_snapshot_rds(
explain_forecast(
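+ # testing = TRUE takes over for the removed timing = FALSE argument: it strips
+ # run-specific components (such as timing) from the output so snapshots stay stable.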
+ testing = TRUE,
model = model_ar_temp,
y = data[, "Temp"],
train_idx = 2:151,
@@ -10,8 +11,7 @@ test_that("forecast_output_ar_numeric", {
approach = "empirical",
prediction_zero = p0_ar,
group_lags = FALSE,
- n_batches = 1,
- timing = FALSE
+ n_batches = 1
),
"forecast_output_ar_numeric"
)
@@ -20,6 +20,7 @@ test_that("forecast_output_ar_numeric", {
test_that("forecast_output_arima_numeric", {
expect_snapshot_rds(
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -31,8 +32,7 @@ test_that("forecast_output_arima_numeric", {
approach = "empirical",
prediction_zero = p0_ar,
group_lags = FALSE,
- n_batches = 1,
- timing = FALSE
+ n_batches = 1
),
"forecast_output_arima_numeric"
)
@@ -41,6 +41,7 @@ test_that("forecast_output_arima_numeric", {
test_that("forecast_output_arima_numeric_no_xreg", {
expect_snapshot_rds(
explain_forecast(
+ testing = TRUE,
model = model_arima_temp_noxreg,
y = data[1:150, "Temp"],
train_idx = 2:148,
@@ -50,8 +51,7 @@ test_that("forecast_output_arima_numeric_no_xreg", {
approach = "empirical",
prediction_zero = p0_ar,
group_lags = FALSE,
- n_batches = 1,
- timing = FALSE
+ n_batches = 1
),
"forecast_output_arima_numeric_no_xreg"
)
@@ -60,6 +60,7 @@ test_that("forecast_output_arima_numeric_no_xreg", {
test_that("forecast_output_forecast_ARIMA_group_numeric", {
expect_snapshot_rds(
explain_forecast(
+ testing = TRUE,
model = model_forecast_ARIMA_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -71,8 +72,7 @@ test_that("forecast_output_forecast_ARIMA_group_numeric", {
approach = "empirical",
prediction_zero = p0_ar,
group_lags = TRUE,
- n_batches = 1,
- timing = FALSE
+ n_batches = 1
),
"forecast_output_forecast_ARIMA_group_numeric"
)
@@ -81,6 +81,7 @@ test_that("forecast_output_forecast_ARIMA_group_numeric", {
test_that("ARIMA gives the same output with different horizons", {
h3 <- explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -93,11 +94,12 @@ test_that("ARIMA gives the same output with different horizons", {
prediction_zero = p0_ar[1:3],
group_lags = FALSE,
n_batches = 1,
- timing = FALSE, n_combinations = 50
+ max_n_coalitions = 50
)
h2 <- explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -110,10 +112,11 @@ test_that("ARIMA gives the same output with different horizons", {
prediction_zero = p0_ar[1:2],
group_lags = FALSE,
n_batches = 1,
- timing = FALSE, n_combinations = 50
+ max_n_coalitions = 50
)
h1 <- explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -126,7 +129,7 @@ test_that("ARIMA gives the same output with different horizons", {
prediction_zero = p0_ar[1],
group_lags = FALSE,
n_batches = 1,
- timing = FALSE, n_combinations = 50
+ max_n_coalitions = 50
)
cols_horizon1 <- h2$internal$objects$cols_per_horizon[[1]]
@@ -149,6 +152,7 @@ test_that("ARIMA gives the same output with different horizons", {
test_that("ARIMA gives the same output with different horizons with grouping", {
h3 <- explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -161,11 +165,12 @@ test_that("ARIMA gives the same output with different horizons with grouping", {
prediction_zero = p0_ar[1:3],
group_lags = TRUE,
n_batches = 1,
- timing = FALSE, n_combinations = 50
+ max_n_coalitions = 50
)
h2 <- explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -178,10 +183,11 @@ test_that("ARIMA gives the same output with different horizons with grouping", {
prediction_zero = p0_ar[1:2],
group_lags = TRUE,
n_batches = 1,
- timing = FALSE, n_combinations = 50
+ max_n_coalitions = 50
)
h1 <- explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -194,7 +200,7 @@ test_that("ARIMA gives the same output with different horizons with grouping", {
prediction_zero = p0_ar[1],
group_lags = TRUE,
n_batches = 1,
- timing = FALSE, n_combinations = 50
+ max_n_coalitions = 50
)
expect_equal(
@@ -217,6 +223,7 @@ test_that("forecast_output_arima_numeric_no_lags", {
# TODO: Need to check out this output. It gives lots of warnings, which indicates something might be wrong.
expect_snapshot_rds(
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -228,8 +235,7 @@ test_that("forecast_output_arima_numeric_no_lags", {
approach = "independence",
prediction_zero = p0_ar,
group_lags = FALSE,
- n_batches = 1,
- timing = FALSE
+ n_batches = 1
),
"forecast_output_arima_numeric_no_lags"
)
diff --git a/tests/testthat/test-forecast-setup.R b/tests/testthat/0test-forecast-setup.R
similarity index 86%
rename from tests/testthat/test-forecast-setup.R
rename to tests/testthat/0test-forecast-setup.R
index 70a49eafb..552c614a2 100644
--- a/tests/testthat/test-forecast-setup.R
+++ b/tests/testthat/0test-forecast-setup.R
@@ -9,6 +9,7 @@ test_that("error with custom model without providing predict_model", {
class(model_custom_arima_temp) <- "whatever"
explain_forecast(
+ testing = TRUE,
model = model_custom_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -36,6 +37,7 @@ test_that("erroneous input: `x_train/x_explain`", {
y_wrong_format <- data[, c("Temp", "Wind")]
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = y_wrong_format,
xreg = data[, "Wind"],
@@ -58,6 +60,7 @@ test_that("erroneous input: `x_train/x_explain`", {
xreg_wrong_format <- data[, c("Temp", "Wind")]
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = xreg_wrong_format,
@@ -81,6 +84,7 @@ test_that("erroneous input: `x_train/x_explain`", {
names(xreg_no_column_names) <- NULL
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = xreg_no_column_names,
@@ -105,6 +109,7 @@ test_that("erroneous input: `model`", {
{
# no model passed
explain_forecast(
+ testing = TRUE,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
train_idx = 2:148,
@@ -131,6 +136,7 @@ test_that("erroneous input: `prediction_zero`", {
p0_wrong_length <- p0_ar[1:2]
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -148,65 +154,61 @@ test_that("erroneous input: `prediction_zero`", {
)
})
-test_that("erroneous input: `n_combinations`", {
+test_that("erroneous input: `max_n_coalitions`", {
set.seed(123)
- expect_snapshot(
- {
- # Too low n_combinations (smaller than # features)
- horizon <- 3
- explain_y_lags <- 2
- explain_xreg_lags <- 2
-
- n_combinations <- horizon + explain_y_lags + explain_xreg_lags - 1
-
- explain_forecast(
- model = model_arima_temp,
- y = data[1:150, "Temp"],
- xreg = data[, "Wind"],
- train_idx = 2:148,
- explain_idx = 149:150,
- explain_y_lags = explain_y_lags,
- explain_xreg_lags = explain_xreg_lags,
- horizon = horizon,
- approach = "independence",
- prediction_zero = p0_ar,
- n_batches = 1,
- n_combinations = n_combinations,
- group_lags = FALSE
- )
- },
- error = TRUE
- )
-
-
- expect_snapshot(
- {
- # Too low n_combinations (smaller than # groups)
- horizon <- 3
- explain_y_lags <- 2
- explain_xreg_lags <- 2
-
- n_combinations <- 1 + 1
-
- explain_forecast(
- model = model_arima_temp,
- y = data[1:150, "Temp"],
- xreg = data[, "Wind"],
- train_idx = 2:148,
- explain_idx = 149:150,
- explain_y_lags = explain_y_lags,
- explain_xreg_lags = explain_xreg_lags,
- horizon = horizon,
- approach = "independence",
- prediction_zero = p0_ar,
- n_batches = 1,
- n_combinations = n_combinations,
- group_lags = TRUE
- )
- },
- error = TRUE
- )
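+ # Note: error = TRUE is no longer passed to expect_snapshot(); a too-low
+ # max_n_coalitions now appears to be adjusted internally with a message
+ # rather than raising an error.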
+ expect_snapshot({
+ # Too low max_n_coalitions (smaller than # features)
+ horizon <- 3
+ explain_y_lags <- 2
+ explain_xreg_lags <- 2
+
+ n_coalitions <- horizon + explain_y_lags + explain_xreg_lags - 1
+
+ explain_forecast(
+ testing = TRUE,
+ model = model_arima_temp,
+ y = data[1:150, "Temp"],
+ xreg = data[, "Wind"],
+ train_idx = 2:148,
+ explain_idx = 149:150,
+ explain_y_lags = explain_y_lags,
+ explain_xreg_lags = explain_xreg_lags,
+ horizon = horizon,
+ approach = "independence",
+ prediction_zero = p0_ar,
+ n_batches = 1,
+ max_n_coalitions = n_coalitions,
+ group_lags = FALSE
+ )
+ })
+
+
+ expect_snapshot({
+ # Too low max_n_coalitions (smaller than # groups)
+ horizon <- 3
+ explain_y_lags <- 2
+ explain_xreg_lags <- 2
+
+ n_coalitions <- 1 + 1
+
+ explain_forecast(
+ testing = TRUE,
+ model = model_arima_temp,
+ y = data[1:150, "Temp"],
+ xreg = data[, "Wind"],
+ train_idx = 2:148,
+ explain_idx = 149:150,
+ explain_y_lags = explain_y_lags,
+ explain_xreg_lags = explain_xreg_lags,
+ horizon = horizon,
+ approach = "independence",
+ prediction_zero = p0_ar,
+ n_batches = 1,
+ max_n_coalitions = n_coalitions,
+ group_lags = TRUE
+ )
+ })
})
@@ -219,6 +221,7 @@ test_that("erroneous input: `train_idx`", {
train_idx_too_short <- 2
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -242,6 +245,7 @@ test_that("erroneous input: `train_idx`", {
train_idx_not_integer <- c(3:5) + 0.1
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -264,6 +268,7 @@ test_that("erroneous input: `train_idx`", {
train_idx_out_of_range <- 1:5
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -290,6 +295,7 @@ test_that("erroneous input: `explain_idx`", {
explain_idx_not_integer <- c(3:5) + 0.1
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -312,6 +318,7 @@ test_that("erroneous input: `explain_idx`", {
explain_idx_out_of_range <- 1:5
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -338,6 +345,7 @@ test_that("erroneous input: `explain_y_lags`", {
explain_y_lags_negative <- -1
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -360,6 +368,7 @@ test_that("erroneous input: `explain_y_lags`", {
explain_y_lags_not_integer <- 2.1
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -382,6 +391,7 @@ test_that("erroneous input: `explain_y_lags`", {
explain_y_lags_more_than_one <- c(1, 2)
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -405,6 +415,7 @@ test_that("erroneous input: `explain_y_lags`", {
explain_y_lags_zero <- 0
explain_forecast(
+ testing = TRUE,
model = model_arima_temp_noxreg,
y = data[1:150, "Temp"],
train_idx = 2:148,
@@ -430,6 +441,7 @@ test_that("erroneous input: `explain_x_lags`", {
explain_xreg_lags_negative <- -2
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -452,6 +464,7 @@ test_that("erroneous input: `explain_x_lags`", {
explain_xreg_lags_not_integer <- 2.1
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -474,6 +487,7 @@ test_that("erroneous input: `explain_x_lags`", {
explain_x_lags_wrong_length <- c(1, 2) # only 1 xreg variable defined
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -500,6 +514,7 @@ test_that("erroneous input: `horizon`", {
horizon_negative <- -2
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
@@ -522,6 +537,7 @@ test_that("erroneous input: `horizon`", {
horizon_not_integer <- 2.1
explain_forecast(
+ testing = TRUE,
model = model_arima_temp,
y = data[1:150, "Temp"],
xreg = data[, "Wind"],
diff --git a/tests/testthat/_snaps/adaptive-output.md b/tests/testthat/_snaps/adaptive-output.md
new file mode 100644
index 000000000..7e483da59
--- /dev/null
+++ b/tests/testthat/_snaps/adaptive-output.md
@@ -0,0 +1,984 @@
+# output_lm_numeric_independence_reach_exact
+
+ Code
+ (out <- code)
+ Message
+ Success with message:
+ max_n_coalitions is NULL or larger than or equal to 2^n_features = 32,
+ and is therefore set to 2^n_features = 32.
+
+ * Model class: <lm>
+ * Approach: independence
+ * Adaptive estimation: TRUE
+ * Number of feature-wise Shapley values: 5
+ * Number of observations to explain: 3
+
+ -- Adaptive computation started --
+
+ -- Iteration 1 -----------------------------------------------------------------
+ i Using 5 of 32 coalitions, 5 new.
+
+ -- Convergence info
+ i Not converged after 6 coalitions:
+ Current convergence measure: 0.31 [needs 0.02]
+ Estimated remaining coalitions: 24
+ (Conservatively) adding 10% of that (4 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) 0.258 (2.14) 0.258 (2.14) 17.463 (5.62) -5.635 (1.84)
+ 2: 42.444 (0.00) -0.986 (0.56) -0.986 (0.56) -5.286 (1.40) -5.635 (1.45)
+ 3: 42.444 (0.00) -4.493 (0.33) -4.493 (0.33) -1.495 (0.98) -2.595 (0.59)
+ Day
+
+ 1: 0.258 (2.14)
+ 2: -0.986 (0.56)
+ 3: -4.493 (0.33)
+ Message
+
+ -- Iteration 2 -----------------------------------------------------------------
+ i Using 10 of 32 coalitions, 4 new.
+
+ -- Convergence info
+ i Not converged after 10 coalitions:
+ Current convergence measure: 0.18 [needs 0.02]
+ Estimated remaining coalitions: 20
+ (Conservatively) adding 10% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.411 (3.37) 8.305 (3.82) 17.463 (3.50) -5.635 (0.19)
+ 2: 42.444 (0.00) 2.376 (1.47) -3.309 (1.07) -5.286 (1.24) -5.635 (1.02)
+ 3: 42.444 (0.00) 3.834 (3.22) -18.574 (5.10) -1.495 (2.37) -2.595 (0.83)
+ Day
+
+ 1: -3.121 (3.24)
+ 2: -2.025 (1.13)
+ 3: 1.261 (4.44)
+ Message
+
+ -- Iteration 3 -----------------------------------------------------------------
+ i Using 12 of 32 coalitions, 2 new.
+
+ -- Convergence info
+ i Not converged after 12 coalitions:
+ Current convergence measure: 0.079 [needs 0.02]
+ Estimated remaining coalitions: 18
+ (Conservatively) adding 20% of that (4 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.467 (0.21) 8.284 (0.98) 17.485 (0.01) -5.635 (0.12)
+ 2: 42.444 (0.00) 2.320 (0.75) -3.331 (0.11) -5.264 (0.01) -5.635 (0.39)
+ 3: 42.444 (0.00) 3.778 (0.47) -18.596 (1.70) -1.473 (0.01) -2.595 (0.34)
+ Day
+
+ 1: -3.065 (1.02)
+ 2: -1.969 (0.67)
+ 3: 1.317 (1.77)
+ Message
+
+ -- Iteration 4 -----------------------------------------------------------------
+ i Using 16 of 32 coalitions, 4 new.
+
+ -- Convergence info
+ v Converged after 16 coalitions:
+ Convergence tolerance reached!
+
+ -- Final estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.541 (0.05) 8.330 (0.80) 17.491 (0.02) -5.585 (0.02)
+ 2: 42.444 (0.00) 2.246 (0.05) -3.285 (0.10) -5.258 (0.02) -5.585 (0.02)
+ 3: 42.444 (0.00) 3.704 (0.05) -18.549 (1.40) -1.467 (0.02) -2.545 (0.02)
+ Day
+
+ 1: -3.093 (0.80)
+ 2: -1.997 (0.10)
+ 3: 1.289 (1.40)
+ explain_id none Solar.R Wind Temp Month Day
+
+ 1: 1 42.44 -4.541 8.330 17.491 -5.585 -3.093
+ 2: 2 42.44 2.246 -3.285 -5.258 -5.585 -1.997
+ 3: 3 42.44 3.704 -18.549 -1.467 -2.545 1.289
+
+# output_lm_numeric_independence_converges_tol
+
+ Code
+ (out <- code)
+ Message
+ Success with message:
+ max_n_coalitions is NULL or larger than or equal to 2^n_features = 32,
+ and is therefore set to 2^n_features = 32.
+
+
+ -- Iteration 1 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 10 coalitions:
+ Current convergence measure: 0.19 [needs 0.1]
+ Estimated remaining coalitions: 20
+ (Conservatively) adding 10% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.591 (2.23) 8.215 (3.14) 17.463 (5.65) -5.545 (3.30)
+ 2: 42.444 (0.00) 2.196 (1.45) -3.399 (0.45) -5.286 (1.14) -5.545 (1.04)
+ 3: 42.444 (0.00) 3.654 (0.94) -18.664 (4.32) -1.495 (1.14) -2.505 (3.75)
+ Day
+
+ 1: -2.940 (4.17)
+ 2: -1.845 (1.51)
+ 3: 1.442 (2.14)
+ Message
+
+ -- Iteration 2 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 12 coalitions:
+ Current convergence measure: 0.14 [needs 0.1]
+ Estimated remaining coalitions: 8
+ (Conservatively) adding 10% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.591 (0.76) 8.215 (2.20) 17.463 (4.64) -5.545 (2.14)
+ 2: 42.444 (0.00) 2.196 (0.98) -3.399 (0.47) -5.286 (0.76) -5.545 (0.98)
+ 3: 42.444 (0.00) 3.654 (1.12) -18.664 (3.06) -1.495 (0.82) -2.505 (2.55)
+ Day
+
+ 1: -2.940 (4.54)
+ 2: -1.845 (1.11)
+ 3: 1.442 (1.96)
+ Message
+
+ -- Iteration 3 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 14 coalitions:
+ Current convergence measure: 0.14 [needs 0.1]
+ Estimated remaining coalitions: 10
+ (Conservatively) adding 20% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.570 (0.87) 8.236 (1.92) 17.463 (4.97) -5.593 (1.32)
+ 2: 42.444 (0.00) 2.217 (0.66) -3.378 (0.33) -5.286 (0.86) -5.593 (0.26)
+ 3: 42.444 (0.00) 3.675 (0.52) -18.643 (3.19) -1.495 (0.72) -2.553 (1.19)
+ Day
+
+ 1: -2.934 (4.68)
+ 2: -1.839 (1.06)
+ 3: 1.448 (3.00)
+ Message
+
+ -- Iteration 4 -----------------------------------------------------------------
+
+ -- Convergence info
+ v Converged after 16 coalitions:
+ Convergence tolerance reached!
+
+ -- Final estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.538 (1.90) 8.268 (0.56) 17.523 (3.29) -5.589 (0.04)
+ 2: 42.444 (0.00) 2.249 (0.66) -3.347 (0.09) -5.227 (0.77) -5.589 (0.04)
+ 3: 42.444 (0.00) 3.707 (0.45) -18.611 (1.01) -1.435 (0.58) -2.549 (0.04)
+ Day
+
+ 1: -3.061 (2.86)
+ 2: -1.966 (0.50)
+ 3: 1.321 (1.06)
+ explain_id none Solar.R Wind Temp Month Day
+
+ 1: 1 42.44 -4.538 8.268 17.523 -5.589 -3.061
+ 2: 2 42.44 2.249 -3.347 -5.227 -5.589 -1.966
+ 3: 3 42.44 3.707 -18.611 -1.435 -2.549 1.321
+
+# output_lm_numeric_independence_converges_maxit
+
+ Code
+ (out <- code)
+ Message
+ Success with message:
+ max_n_coalitions is NULL or larger than or equal to 2^n_features = 32,
+ and is therefore set to 2^n_features = 32.
+
+
+ -- Iteration 1 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 10 coalitions:
+ Current convergence measure: 0.19 [needs 0.001]
+ Estimated remaining coalitions: 20
+ (Conservatively) adding 0.001% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.591 (2.23) 8.215 (3.14) 17.463 (5.65) -5.545 (3.30)
+ 2: 42.444 (0.00) 2.196 (1.45) -3.399 (0.45) -5.286 (1.14) -5.545 (1.04)
+ 3: 42.444 (0.00) 3.654 (0.94) -18.664 (4.32) -1.495 (1.14) -2.505 (3.75)
+ Day
+
+ 1: -2.940 (4.17)
+ 2: -1.845 (1.51)
+ 3: 1.442 (2.14)
+ Message
+
+ -- Iteration 2 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 12 coalitions:
+ Current convergence measure: 0.14 [needs 0.001]
+ Estimated remaining coalitions: 18
+ (Conservatively) adding 0.001% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.591 (0.76) 8.215 (2.20) 17.463 (4.64) -5.545 (2.14)
+ 2: 42.444 (0.00) 2.196 (0.98) -3.399 (0.47) -5.286 (0.76) -5.545 (0.98)
+ 3: 42.444 (0.00) 3.654 (1.12) -18.664 (3.06) -1.495 (0.82) -2.505 (2.55)
+ Day
+
+ 1: -2.940 (4.54)
+ 2: -1.845 (1.11)
+ 3: 1.442 (1.96)
+ Message
+
+ -- Iteration 3 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 14 coalitions:
+ Current convergence measure: 0.14 [needs 0.001]
+ Estimated remaining coalitions: 16
+ (Conservatively) adding 0.001% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.570 (0.87) 8.236 (1.92) 17.463 (4.97) -5.593 (1.32)
+ 2: 42.444 (0.00) 2.217 (0.66) -3.378 (0.33) -5.286 (0.86) -5.593 (0.26)
+ 3: 42.444 (0.00) 3.675 (0.52) -18.643 (3.19) -1.495 (0.72) -2.553 (1.19)
+ Day
+
+ 1: -2.934 (4.68)
+ 2: -1.839 (1.06)
+ 3: 1.448 (3.00)
+ Message
+
+ -- Iteration 4 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 16 coalitions:
+ Current convergence measure: 0.099 [needs 0.001]
+ Estimated remaining coalitions: 14
+ (Conservatively) adding 0.001% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.538 (1.90) 8.268 (0.56) 17.523 (3.29) -5.589 (0.04)
+ 2: 42.444 (0.00) 2.249 (0.66) -3.347 (0.09) -5.227 (0.77) -5.589 (0.04)
+ 3: 42.444 (0.00) 3.707 (0.45) -18.611 (1.01) -1.435 (0.58) -2.549 (0.04)
+ Day
+
+ 1: -3.061 (2.86)
+ 2: -1.966 (0.50)
+ 3: 1.321 (1.06)
+ Message
+
+ -- Iteration 5 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 18 coalitions:
+ Current convergence measure: 0.06 [needs 0.001]
+ Estimated remaining coalitions: 12
+ (Conservatively) adding 0.001% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.536 (1.11) 8.270 (0.03) 17.519 (2.34) -5.592 (1.16)
+ 2: 42.444 (0.00) 2.251 (0.47) -3.344 (0.03) -5.231 (0.47) -5.592 (0.03)
+ 3: 42.444 (0.00) 3.709 (0.30) -18.609 (0.03) -1.439 (0.36) -2.552 (0.06)
+ Day
+
+ 1: -3.059 (1.77)
+ 2: -1.964 (0.42)
+ 3: 1.323 (0.30)
+ Message
+
+ -- Iteration 6 -----------------------------------------------------------------
+
+ -- Convergence info
+ v Converged after 20 coalitions:
+ Convergence tolerance reached!
+
+ -- Final estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.534 (0.01) 8.272 (0.01) 17.520 (0.01) -5.592 (0.01)
+ 2: 42.444 (0.00) 2.253 (0.01) -3.342 (0.01) -5.229 (0.01) -5.592 (0.01)
+ 3: 42.444 (0.00) 3.711 (0.01) -18.607 (0.01) -1.438 (0.01) -2.553 (0.01)
+ Day
+
+ 1: -3.064 (0.01)
+ 2: -1.968 (0.01)
+ 3: 1.318 (0.01)
+ explain_id none Solar.R Wind Temp Month Day
+
+ 1: 1 42.44 -4.534 8.272 17.520 -5.592 -3.064
+ 2: 2 42.44 2.253 -3.342 -5.229 -5.592 -1.968
+ 3: 3 42.44 3.711 -18.607 -1.438 -2.553 1.318
+
+# output_lm_numeric_indep_conv_max_n_coalitions
+
+ Code
+ (out <- code)
+ Message
+
+ -- Iteration 1 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 6 coalitions:
+ Current convergence measure: 0.31 [needs 0.02]
+ Estimated remaining coalitions: 24
+ (Conservatively) adding 10% of that (4 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) 0.258 (2.14) 0.258 (2.14) 17.463 (5.62) -5.635 (1.84)
+ 2: 42.444 (0.00) -0.986 (0.56) -0.986 (0.56) -5.286 (1.40) -5.635 (1.45)
+ 3: 42.444 (0.00) -4.493 (0.33) -4.493 (0.33) -1.495 (0.98) -2.595 (0.59)
+ Day
+
+ 1: 0.258 (2.14)
+ 2: -0.986 (0.56)
+ 3: -4.493 (0.33)
+ Message
+
+ -- Iteration 2 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 10 coalitions:
+ Current convergence measure: 0.18 [needs 0.02]
+ Estimated remaining coalitions: 20
+ (Conservatively) adding 10% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.411 (3.37) 8.305 (3.82) 17.463 (3.50) -5.635 (0.19)
+ 2: 42.444 (0.00) 2.376 (1.47) -3.309 (1.07) -5.286 (1.24) -5.635 (1.02)
+ 3: 42.444 (0.00) 3.834 (3.22) -18.574 (5.10) -1.495 (2.37) -2.595 (0.83)
+ Day
+
+ 1: -3.121 (3.24)
+ 2: -2.025 (1.13)
+ 3: 1.261 (4.44)
+ Message
+
+ -- Iteration 3 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 12 coalitions:
+ Current convergence measure: 0.079 [needs 0.02]
+ Estimated remaining coalitions: 18
+ (Conservatively) adding 20% of that (4 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.467 (0.21) 8.284 (0.98) 17.485 (0.01) -5.635 (0.12)
+ 2: 42.444 (0.00) 2.320 (0.75) -3.331 (0.11) -5.264 (0.01) -5.635 (0.39)
+ 3: 42.444 (0.00) 3.778 (0.47) -18.596 (1.70) -1.473 (0.01) -2.595 (0.34)
+ Day
+
+ 1: -3.065 (1.02)
+ 2: -1.969 (0.67)
+ 3: 1.317 (1.77)
+ Message
+
+ -- Iteration 4 -----------------------------------------------------------------
+
+ -- Convergence info
+ v Converged after 16 coalitions:
+ Convergence tolerance reached!
+
+ -- Final estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.541 (0.05) 8.330 (0.80) 17.491 (0.02) -5.585 (0.02)
+ 2: 42.444 (0.00) 2.246 (0.05) -3.285 (0.10) -5.258 (0.02) -5.585 (0.02)
+ 3: 42.444 (0.00) 3.704 (0.05) -18.549 (1.40) -1.467 (0.02) -2.545 (0.02)
+ Day
+
+ 1: -3.093 (0.80)
+ 2: -1.997 (0.10)
+ 3: 1.289 (1.40)
+ explain_id none Solar.R Wind Temp Month Day
+
+ 1: 1 42.44 -4.541 8.330 17.491 -5.585 -3.093
+ 2: 2 42.44 2.246 -3.285 -5.258 -5.585 -1.997
+ 3: 3 42.44 3.704 -18.549 -1.467 -2.545 1.289
+
+# output_lm_numeric_gaussian_group_converges_tol
+
+ Code
+ (out <- code)
+ Message
+ Success with message:
+ max_n_coalitions is NULL or larger than or equal to 2^n_groups = 8,
+ and is therefore set to 2^n_groups = 8.
+
+
+ -- Iteration 1 -----------------------------------------------------------------
+
+ -- Convergence info
+ v Converged after 6 coalitions:
+ Convergence tolerance reached!
+
+ -- Final estimated Shapley values (sd)
+ Output
+ none A B C
+
+ 1: 42.444 (0.00) 0.772 (2.66) 13.337 (3.49) -1.507 (3.31)
+ 2: 42.444 (0.00) 0.601 (2.97) -13.440 (3.32) -1.040 (2.77)
+ 3: 42.444 (0.00) -18.368 (3.91) 0.127 (3.95) 0.673 (0.12)
+ explain_id none A B C
+
+ 1: 1 42.44 0.7716 13.3373 -1.5069
+ 2: 2 42.44 0.6006 -13.4404 -1.0396
+ 3: 3 42.44 -18.3678 0.1268 0.6728
+
+# output_lm_numeric_independence_converges_tol_paired
+
+ Code
+ (out <- code)
+ Message
+ Success with message:
+ max_n_coalitions is NULL or larger than or equal to 2^n_features = 32,
+ and is therefore set to 2^n_features = 32.
+
+
+ -- Iteration 1 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 10 coalitions:
+ Current convergence measure: 0.19 [needs 0.1]
+ Estimated remaining coalitions: 20
+ (Conservatively) adding 10% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.591 (2.23) 8.215 (3.14) 17.463 (5.65) -5.545 (3.30)
+ 2: 42.444 (0.00) 2.196 (1.45) -3.399 (0.45) -5.286 (1.14) -5.545 (1.04)
+ 3: 42.444 (0.00) 3.654 (0.94) -18.664 (4.32) -1.495 (1.14) -2.505 (3.75)
+ Day
+
+ 1: -2.940 (4.17)
+ 2: -1.845 (1.51)
+ 3: 1.442 (2.14)
+ Message
+
+ -- Iteration 2 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 12 coalitions:
+ Current convergence measure: 0.14 [needs 0.1]
+ Estimated remaining coalitions: 8
+ (Conservatively) adding 10% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.591 (0.76) 8.215 (2.20) 17.463 (4.64) -5.545 (2.14)
+ 2: 42.444 (0.00) 2.196 (0.98) -3.399 (0.47) -5.286 (0.76) -5.545 (0.98)
+ 3: 42.444 (0.00) 3.654 (1.12) -18.664 (3.06) -1.495 (0.82) -2.505 (2.55)
+ Day
+
+ 1: -2.940 (4.54)
+ 2: -1.845 (1.11)
+ 3: 1.442 (1.96)
+ Message
+
+ -- Iteration 3 -----------------------------------------------------------------
+
+ -- Convergence info
+ i Not converged after 14 coalitions:
+ Current convergence measure: 0.14 [needs 0.1]
+ Estimated remaining coalitions: 10
+ (Conservatively) adding 20% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.570 (0.87) 8.236 (1.92) 17.463 (4.97) -5.593 (1.32)
+ 2: 42.444 (0.00) 2.217 (0.66) -3.378 (0.33) -5.286 (0.86) -5.593 (0.26)
+ 3: 42.444 (0.00) 3.675 (0.52) -18.643 (3.19) -1.495 (0.72) -2.553 (1.19)
+ Day
+
+ 1: -2.934 (4.68)
+ 2: -1.839 (1.06)
+ 3: 1.448 (3.00)
+ Message
+
+ -- Iteration 4 -----------------------------------------------------------------
+
+ -- Convergence info
+ v Converged after 16 coalitions:
+ Convergence tolerance reached!
+
+ -- Final estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -4.538 (1.90) 8.268 (0.56) 17.523 (3.29) -5.589 (0.04)
+ 2: 42.444 (0.00) 2.249 (0.66) -3.347 (0.09) -5.227 (0.77) -5.589 (0.04)
+ 3: 42.444 (0.00) 3.707 (0.45) -18.611 (1.01) -1.435 (0.58) -2.549 (0.04)
+ Day
+
+ 1: -3.061 (2.86)
+ 2: -1.966 (0.50)
+ 3: 1.321 (1.06)
+ explain_id none Solar.R Wind Temp Month Day
+
+ 1: 1 42.44 -4.538 8.268 17.523 -5.589 -3.061
+ 2: 2 42.44 2.249 -3.347 -5.227 -5.589 -1.966
+ 3: 3 42.44 3.707 -18.611 -1.435 -2.549 1.321
+
+# output_lm_numeric_independence_saving_and_cont_est
+
+ Code
+ (out <- code)
+ Message
+ Success with message:
+ max_n_coalitions is NULL or larger than or equal to 2^n_features = 32,
+ and is therefore set to 2^n_features = 32.
+
+ Output
+ explain_id none Solar.R Wind Temp Month Day
+
+ 1: 1 42.44 -4.531 8.202 17.504 -5.549 -3.024
+ 2: 2 42.44 2.256 -3.412 -5.246 -5.549 -1.928
+ 3: 3 42.44 3.714 -18.677 -1.454 -2.509 1.358
+
+---
+
+ Code
+ (out <- code)
+ Message
+ Success with message:
+ max_n_coalitions is NULL or larger than or equal to 2^n_features = 32,
+ and is therefore set to 2^n_features = 32.
+
+ Output
+ explain_id none Solar.R Wind Temp Month Day
+
+ 1: 1 42.44 -4.531 8.202 17.504 -5.549 -3.024
+ 2: 2 42.44 2.256 -3.412 -5.246 -5.549 -1.928
+ 3: 3 42.44 3.714 -18.677 -1.454 -2.509 1.358
+
+# output_verbose_1
+
+ Code
+ (out <- code)
+ Message
+ Success with message:
+ max_n_coalitions is NULL or larger than or equal to 2^n_features = 32,
+ and is therefore set to 2^n_features = 32.
+
+ * Model class: <lm>
+ * Approach: gaussian
+ * Adaptive estimation: TRUE
+ * Number of feature-wise Shapley values: 5
+ * Number of observations to explain: 3
+
+ -- Adaptive computation started --
+
+ -- Iteration 1 -----------------------------------------------------------------
+ i Using 5 of 32 coalitions, 5 new.
+
+ -- Iteration 2 -----------------------------------------------------------------
+ i Using 10 of 32 coalitions, 4 new.
+
+ -- Iteration 3 -----------------------------------------------------------------
+ i Using 12 of 32 coalitions, 2 new.
+
+ -- Iteration 4 -----------------------------------------------------------------
+ i Using 16 of 32 coalitions, 4 new.
+
+ -- Iteration 5 -----------------------------------------------------------------
+ i Using 22 of 32 coalitions, 6 new.
+ Output
+ explain_id none Solar.R Wind Temp Month Day
+
+ 1: 1 42.44 -8.534 7.868 14.3146 0.8504 -1.8969
+ 2: 2 42.44 4.919 -4.878 -11.9086 -0.8405 -1.1714
+ 3: 3 42.44 7.447 -25.748 0.0324 -0.1976 0.8978
+
+# output_verbose_1_3
+
+ Code
+ (out <- code)
+ Message
+ Success with message:
+ max_n_coalitions is NULL or larger than or equal to 2^n_features = 32,
+ and is therefore set to 2^n_features = 32.
+
+ * Model class: <lm>
+ * Approach: gaussian
+ * Adaptive estimation: TRUE
+ * Number of feature-wise Shapley values: 5
+ * Number of observations to explain: 3
+
+ -- Adaptive computation started --
+
+ -- Iteration 1 -----------------------------------------------------------------
+ i Using 5 of 32 coalitions, 5 new.
+
+ -- Convergence info
+ i Not converged after 6 coalitions:
+ Current convergence measure: 0.33 [needs 0.02]
+ Estimated remaining coalitions: 24
+ (Conservatively) adding 10% of that (4 coalitions) in the next iteration.
+
+ -- Iteration 2 -----------------------------------------------------------------
+ i Using 10 of 32 coalitions, 4 new.
+
+ -- Convergence info
+ i Not converged after 10 coalitions:
+ Current convergence measure: 0.2 [needs 0.02]
+ Estimated remaining coalitions: 20
+ (Conservatively) adding 10% of that (2 coalitions) in the next iteration.
+
+ -- Iteration 3 -----------------------------------------------------------------
+ i Using 12 of 32 coalitions, 2 new.
+
+ -- Convergence info
+ i Not converged after 12 coalitions:
+ Current convergence measure: 0.077 [needs 0.02]
+ Estimated remaining coalitions: 18
+ (Conservatively) adding 20% of that (4 coalitions) in the next iteration.
+
+ -- Iteration 4 -----------------------------------------------------------------
+ i Using 16 of 32 coalitions, 4 new.
+
+ -- Convergence info
+ i Not converged after 16 coalitions:
+ Current convergence measure: 0.046 [needs 0.02]
+ Estimated remaining coalitions: 14
+ (Conservatively) adding 30% of that (6 coalitions) in the next iteration.
+
+ -- Iteration 5 -----------------------------------------------------------------
+ i Using 22 of 32 coalitions, 6 new.
+
+ -- Convergence info
+ v Converged after 22 coalitions:
+ Convergence tolerance reached!
+ Output
+ explain_id none Solar.R Wind Temp Month Day
+
+ 1: 1 42.44 -8.534 7.868 14.3146 0.8504 -1.8969
+ 2: 2 42.44 4.919 -4.878 -11.9086 -0.8405 -1.1714
+ 3: 3 42.44 7.447 -25.748 0.0324 -0.1976 0.8978
+
+# output_verbose_1_3_4
+
+ Code
+ (out <- code)
+ Message
+ Success with message:
+ max_n_coalitions is NULL or larger than or equal to 2^n_features = 32,
+ and is therefore set to 2^n_features = 32.
+
+ * Model class: <lm>
+ * Approach: gaussian
+ * Adaptive estimation: TRUE
+ * Number of feature-wise Shapley values: 5
+ * Number of observations to explain: 3
+
+ -- Adaptive computation started --
+
+ -- Iteration 1 -----------------------------------------------------------------
+ i Using 5 of 32 coalitions, 5 new.
+
+ -- Convergence info
+ i Not converged after 6 coalitions:
+ Current convergence measure: 0.33 [needs 0.02]
+ Estimated remaining coalitions: 24
+ (Conservatively) adding 10% of that (4 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -1.428 (1.74) -1.428 (1.74) 15.197 (5.43) 1.688 (0.97)
+ 2: 42.444 (0.00) -0.914 (1.10) -0.914 (1.10) -10.815 (3.23) -0.321 (0.19)
+ 3: 42.444 (0.00) -5.807 (0.72) -5.807 (0.72) 0.168 (1.95) -0.316 (1.71)
+ Day
+
+ 1: -1.428 (1.74)
+ 2: -0.914 (1.10)
+ 3: -5.807 (0.72)
+ Message
+
+ -- Iteration 2 -----------------------------------------------------------------
+ i Using 10 of 32 coalitions, 4 new.
+
+ -- Convergence info
+ i Not converged after 10 coalitions:
+ Current convergence measure: 0.2 [needs 0.02]
+ Estimated remaining coalitions: 20
+ (Conservatively) adding 10% of that (2 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp
+
+ 1: 42.444 (0.00) -10.984 (4.19) 6.696 (3.77) 15.197 (4.21)
+ 2: 42.444 (0.00) 2.151 (2.02) -6.851 (2.61) -10.815 (2.04)
+ 3: 42.444 (0.00) 6.820 (4.76) -26.009 (7.25) 0.168 (3.47)
+ Month Day
+
+ 1: 1.688 (1.57) 0.006 (3.61)
+ 2: -0.321 (0.33) 1.957 (2.22)
+ 3: -0.316 (0.90) 1.769 (6.40)
+ Message
+
+ -- Iteration 3 -----------------------------------------------------------------
+ i Using 12 of 32 coalitions, 2 new.
+
+ -- Convergence info
+ i Not converged after 12 coalitions:
+ Current convergence measure: 0.077 [needs 0.02]
+ Estimated remaining coalitions: 18
+ (Conservatively) adding 20% of that (4 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -9.803 (1.62) 7.155 (0.72) 14.738 (0.31) 1.688 (0.48)
+ 2: 42.444 (0.00) 4.188 (1.34) -6.060 (0.82) -11.606 (0.54) -0.321 (0.16)
+ 3: 42.444 (0.00) 7.531 (1.13) -25.733 (2.34) -0.109 (0.19) -0.316 (0.31)
+ Day
+
+ 1: -1.175 (1.69)
+ 2: -0.080 (1.41)
+ 3: 1.057 (2.57)
+ Message
+
+ -- Iteration 4 -----------------------------------------------------------------
+ i Using 16 of 32 coalitions, 4 new.
+
+ -- Convergence info
+ i Not converged after 16 coalitions:
+ Current convergence measure: 0.046 [needs 0.02]
+ Estimated remaining coalitions: 14
+ (Conservatively) adding 30% of that (6 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -8.850 (0.50) 7.165 (0.77) 14.627 (0.34) 1.200 (0.24)
+ 2: 42.444 (0.00) 4.909 (0.49) -5.670 (0.76) -11.676 (0.54) -0.592 (0.19)
+ 3: 42.444 (0.00) 7.453 (0.17) -25.529 (1.87) -0.083 (0.18) -0.223 (0.09)
+ Day
+
+ 1: -1.541 (0.65)
+ 2: -0.851 (0.60)
+ 3: 0.814 (1.89)
+ Message
+
+ -- Iteration 5 -----------------------------------------------------------------
+ i Using 22 of 32 coalitions, 6 new.
+
+ -- Convergence info
+ v Converged after 22 coalitions:
+ Convergence tolerance reached!
+
+ -- Final estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+
+ 1: 42.444 (0.00) -8.534 (0.45) 7.868 (0.36) 14.315 (0.27) 0.850 (0.37)
+ 2: 42.444 (0.00) 4.919 (0.36) -4.878 (0.53) -11.909 (0.38) -0.841 (0.23)
+ 3: 42.444 (0.00) 7.447 (0.16) -25.748 (0.16) 0.032 (0.13) -0.198 (0.07)
+ Day
+
+ 1: -1.897 (0.19)
+ 2: -1.171 (0.25)
+ 3: 0.898 (0.12)
+ explain_id none Solar.R Wind Temp Month Day
+
+ 1: 1 42.44 -8.534 7.868 14.3146 0.8504 -1.8969
+ 2: 2 42.44 4.919 -4.878 -11.9086 -0.8405 -1.1714
+ 3: 3 42.44 7.447 -25.748 0.0324 -0.1976 0.8978
+
+# output_verbose_1_3_4_5
+
+ Code
+ (out <- code)
+ Message
+ Success with message:
+ max_n_coalitions is NULL or larger than or equal to 2^n_features = 32,
+ and is therefore set to 2^n_features = 32.
+
+ * Model class: <lm>
+ * Approach: gaussian
+ * Adaptive estimation: TRUE
+ * Number of feature-wise Shapley values: 5
+ * Number of observations to explain: 3
+
+ -- Adaptive computation started --
+
+ -- Iteration 1 -----------------------------------------------------------------
+ i Using 5 of 32 coalitions, 5 new.
+
+ -- Convergence info
+ i Not converged after 6 coalitions:
+ Current convergence measure: 0.33 [needs 0.02]
+ Estimated remaining coalitions: 24
+ (Concervatively) adding 10% of that (4 coalitions) in the next iteration.
+
+ -- Current estimated Shapley values (sd)
+ Output
+ none Solar.R Wind Temp Month
+