diff --git a/NEWS.md b/NEWS.md index 6ba93133a..375a4d6ac 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,11 @@ # mlr3proba 0.7.1 -* Removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) -* Bug fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) +* cleanup: removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) +* fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) +* fix: G(t) is no longer filtered when `t_max|p_max` is specified in scoring rules (this filtering had no influence on evaluation) +* docs: clarified the use and impact of `t_max` in scoring rules; added examples for scoring rules and AUC scores +* feat: added new argument `remove_obs` in scoring rules to remove observations with observed time `t > t_max` as a preprocessing step to alleviate IPCW issues. +Previously this removal was hard-coded, which made the Integrated Brier Score (`msr("surv.graf")`) differ slightly from other implementations and the original definition. # mlr3proba 0.7.0 diff --git a/R/MeasureSurvChamblessAUC.R b/R/MeasureSurvChamblessAUC.R index 15071a89d..3d469ac62 100644 --- a/R/MeasureSurvChamblessAUC.R +++ b/R/MeasureSurvChamblessAUC.R @@ -15,6 +15,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvChamblessAUC = R6Class("MeasureSurvChamblessAUC", inherit = MeasureSurvAUC, diff --git a/R/MeasureSurvCindex.R b/R/MeasureSurvCindex.R index 65cd8134c..2fbe08eb6 100644 --- a/R/MeasureSurvCindex.R +++ b/R/MeasureSurvCindex.R @@ -69,6 +69,7 @@ #' #' # Harrell's C-index evaluated up to a specific time horizon #' p$score(msr("surv.cindex", t_max = 97)) +#' #' # Harrell's C-index evaluated up to the time corresponding to 30% of censoring #' p$score(msr("surv.cindex", p_max = 0.3)) #' diff --git a/R/MeasureSurvDCalibration.R b/R/MeasureSurvDCalibration.R index cc0ad9ae7..053afde95 100644 --- a/R/MeasureSurvDCalibration.R +++ b/R/MeasureSurvDCalibration.R @@ -3,6 +3,8 @@ #' @templateVar fullname MeasureSurvDCalibration #' #' @description +#' `r lifecycle::badge("experimental")` +#' #' This calibration method is defined by calculating the following statistic: #' \deqn{s = B/n \sum_i (P_i - n/B)^2} #' where \eqn{B} is number of 'buckets' (that equally divide \eqn{[0,1]} into intervals), #' falls within the corresponding interval. #' This statistic assumes that censoring time is independent of death time. #' -#' A model is well-calibrated if \eqn{s \sim Unif(B)}, tested with `chisq.test` -#' (\eqn{p > 0.05} if well-calibrated). +#' A model is well D-calibrated if \eqn{s \sim Unif(B)}, tested with `chisq.test` +#' (\eqn{p > 0.05} if well-calibrated, i.e. higher p-values are preferred). #' Model \eqn{i} is better calibrated than model \eqn{j} if \eqn{s(i) < s(j)}, #' meaning that *lower values* of this measure are preferred. #' @@ -23,7 +25,7 @@ #' is well-calibrated. If `chisq = FALSE` and `s` is the predicted value then you can manually #' compute the p.value with `pchisq(s, B - 1, lower.tail = FALSE)`. #' -#' NOTE: This measure is still experimental both theoretically and in implementation. Results +#' **NOTE**: This measure is still experimental both theoretically and in implementation.
Results #' should therefore only be taken as an indicator of performance and not for #' conclusive judgements about model calibration. #' @@ -38,11 +40,12 @@ #' You can manually get the p-value by executing `pchisq(s, B - 1, lower.tail = FALSE)`. #' The null hypothesis is that the model is D-calibrated. #' - `truncate` (`double(1)`) \cr -#' This parameter controls the upper bound of the output statistic, -#' when `chisq` is `FALSE`. We use `truncate = Inf` by default but \eqn{10} may be sufficient -#' for most purposes, which corresponds to a p-value of 0.35 for the chisq.test using -#' \eqn{B = 10} buckets. Values \eqn{>10} translate to even lower p-values and thus -#' less calibrated models. If the number of buckets \eqn{B} changes, you probably will want to +#' This parameter controls the upper bound of the output statistic, when `chisq` is `FALSE`. +#' We use `truncate = Inf` by default but values between \eqn{10-16} are sufficient +#' for most purposes, which correspond to p-values of \eqn{0.35-0.06} for the `chisq.test` using +#' the default \eqn{B = 10} buckets. +#' Values \eqn{s > 10} translate to even lower p-values and thus less D-calibrated models. +#' If the number of buckets \eqn{B} changes, you probably will want to #' change the `truncate` value as well to correspond to the same p-value significance. #' Note that truncation may severely limit automated tuning with this measure. #' diff --git a/R/MeasureSurvGraf.R b/R/MeasureSurvGraf.R index c062b495e..da5b577f7 100644 --- a/R/MeasureSurvGraf.R +++ b/R/MeasureSurvGraf.R @@ -11,6 +11,7 @@ #' @templateVar eps 1e-3 #' @template param_eps #' @template param_erv +#' @template param_remove_obs #' #' @aliases MeasureSurvBrier mlr_measures_surv.brier #' @@ -25,13 +26,13 @@ #' survival function \eqn{S_i(t)}, the *observation-wise* loss integrated across #' the time dimension up to the time cutoff \eqn{\tau^*}, is: #' -#' \deqn{L_{ISBS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +#' \deqn{L_{ISBS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} #' #' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. #' #' The **re-weighted ISBS** (RISBS) is: #' -#' \deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +#' \deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject.
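To make the ISBS definition above concrete, here is a minimal R sketch of the observation-wise integrand on a discrete time grid; `isbs_obs` and the exponential `S_i`/`G` below are invented for illustration and are not the package's implementation (the package computes this in compiled code via `weighted_survival_score()`, see `R/integrated_scores.R` further down):

```r
# Sketch of the observation-wise ISBS: approximate the integral on a time grid.
# S_i: predicted survival function of observation i; G: censoring distribution.
isbs_obs = function(tau_grid, t_i, delta_i, S_i, G) {
  S = S_i(tau_grid)
  after_event = as.integer(tau_grid >= t_i & delta_i == 1) / G(t_i)  # I(t_i <= tau, delta_i = 1) / G(t_i)
  still_alive = as.integer(tau_grid < t_i) / G(tau_grid)             # I(t_i > tau) / G(tau)
  integrand = S^2 * after_event + (1 - S)^2 * still_alive
  mean(integrand) * diff(range(tau_grid))  # crude numerical integration
}

# toy usage with exponential survival and censoring distributions
isbs_obs(
  tau_grid = seq(0, 10, by = 0.1), t_i = 5, delta_i = 1,
  S_i = function(t) exp(-0.1 * t), G = function(t) exp(-0.05 * t)
)
```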
#' @@ -48,10 +49,11 @@ #' @template details_tmax #' #' @references -#' `r format_bib("graf_1999")` +#' `r format_bib("graf_1999", "sonabend2024", "kvamme2023")` #' #' @family Probabilistic survival measures #' @family distr survival measures +#' @template example_scoring_rules #' @export MeasureSurvGraf = R6Class("MeasureSurvGraf", inherit = MeasureSurv, @@ -73,11 +75,12 @@ MeasureSurvGraf = R6Class("MeasureSurvGraf", se = p_lgl(default = FALSE), proper = p_lgl(default = FALSE), eps = p_dbl(0, 1, default = 1e-3), - ERV = p_lgl(default = FALSE) + ERV = p_lgl(default = FALSE), + remove_obs = p_lgl(default = FALSE) ) ps$set_values( integrated = TRUE, method = 2L, se = FALSE, - proper = FALSE, eps = 1e-3, ERV = ERV + proper = FALSE, eps = 1e-3, ERV = ERV, remove_obs = FALSE ) range = if (ERV) c(-Inf, 1) else c(0, Inf) @@ -132,7 +135,7 @@ MeasureSurvGraf = R6Class("MeasureSurvGraf", truth = prediction$truth, distribution = prediction$data$distr, times = times, t_max = ps$t_max, p_max = ps$p_max, proper = ps$proper, train = train, - eps = ps$eps + eps = ps$eps, remove_obs = ps$remove_obs ) if (ps$se) { diff --git a/R/MeasureSurvHungAUC.R b/R/MeasureSurvHungAUC.R index 91141cd9b..c8cf41ce1 100644 --- a/R/MeasureSurvHungAUC.R +++ b/R/MeasureSurvHungAUC.R @@ -15,6 +15,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvHungAUC = R6Class("MeasureSurvHungAUC", inherit = MeasureSurvAUC, diff --git a/R/MeasureSurvIntLogloss.R b/R/MeasureSurvIntLogloss.R index 3768f2a9c..dc5e9492b 100644 --- a/R/MeasureSurvIntLogloss.R +++ b/R/MeasureSurvIntLogloss.R @@ -11,6 +11,7 @@ #' @templateVar eps 1e-3 #' @template param_eps #' @template param_erv +#' @template param_remove_obs #' #' @description #' Calculates the **Integrated Survival Log-Likelihood** (ISLL) or Integrated #' @@ -23,13 +24,13 @@ #' survival function \eqn{S_i(t)}, the *observation-wise* loss integrated across #' the time dimension up to the time cutoff \eqn{\tau^*}, is: #' -#' \deqn{L_{ISLL}(S_i, t_i, \delta_i) = -\text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +#' \deqn{L_{ISLL}(S_i, t_i, \delta_i) = - \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} #' #' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. #' #' The **re-weighted ISLL** (RISLL) is: #' -#' \deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +#' \deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)] \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject.
#' @@ -46,10 +47,11 @@ #' @template details_tmax #' #' @references -#' `r format_bib("graf_1999")` +#' `r format_bib("graf_1999", "sonabend2024", "kvamme2023")` #' #' @family Probabilistic survival measures #' @family distr survival measures +#' @template example_scoring_rules #' @export MeasureSurvIntLogloss = R6Class("MeasureSurvIntLogloss", inherit = MeasureSurv, @@ -71,11 +73,12 @@ MeasureSurvIntLogloss = R6Class("MeasureSurvIntLogloss", se = p_lgl(default = FALSE), proper = p_lgl(default = FALSE), eps = p_dbl(0, 1, default = 1e-3), - ERV = p_lgl(default = FALSE) + ERV = p_lgl(default = FALSE), + remove_obs = p_lgl(default = FALSE) ) ps$set_values( integrated = TRUE, method = 2L, se = FALSE, - proper = FALSE, eps = 1e-3, ERV = ERV + proper = FALSE, eps = 1e-3, ERV = ERV, remove_obs = FALSE ) range = if (ERV) c(-Inf, 1) else c(0, Inf) @@ -130,7 +133,7 @@ MeasureSurvIntLogloss = R6Class("MeasureSurvIntLogloss", truth = prediction$truth, distribution = prediction$data$distr, times = times, t_max = ps$t_max, p_max = ps$p_max, proper = ps$proper, train = train, - eps = ps$eps + eps = ps$eps, remove_obs = ps$remove_obs ) if (ps$se) { diff --git a/R/MeasureSurvLogloss.R b/R/MeasureSurvLogloss.R index ceb57dc68..a510c649e 100644 --- a/R/MeasureSurvLogloss.R +++ b/R/MeasureSurvLogloss.R @@ -10,7 +10,7 @@ #' Calculates the cross-entropy, or negative log-likelihood (NLL) or logarithmic (log), loss. #' @section Parameter details: #' - `IPCW` (`logical(1)`)\cr -#' If `TRUE` (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). +#' If `TRUE` (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). See Sonabend et al. (2024) for more details. #' #' @details #' The Log Loss, in the context of probabilistic predictions, is defined as the #' @@ -33,6 +33,9 @@ #' #' @template details_trainG #' +#' @references +#' `r format_bib("sonabend2024")` +#' #' @family Probabilistic survival measures #' @family distr survival measures #' @export diff --git a/R/MeasureSurvSchmid.R b/R/MeasureSurvSchmid.R index 22c574a91..428fae41e 100644 --- a/R/MeasureSurvSchmid.R +++ b/R/MeasureSurvSchmid.R @@ -11,6 +11,7 @@ #' @templateVar eps 1e-3 #' @template param_eps #' @template param_erv +#' @template param_remove_obs #' #' @description #' Calculates the **Integrated Schmid Score** (ISS), aka integrated absolute loss. @@ -22,13 +23,13 @@ #' survival function \eqn{S_i(t)}, the *observation-wise* loss integrated across #' the time dimension up to the time cutoff \eqn{\tau^*}, is: #' -#' \deqn{L_{ISS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +#' \deqn{L_{ISS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} #' #' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. #' #' The **re-weighted ISS** (RISS) is: #' -#' \deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +#' \deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject.
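The ISS above is the absolute-loss analogue of the squared-loss ISBS; in the package source the two share one implementation that differs only in the `power` argument of `score_graf_schmid()` (visible in `R/integrated_scores.R` below). A toy sketch of that shared residual, with invented inputs:

```r
# Shared residual of the Graf (power = 2) and Schmid (power = 1) scores at one
# time point, assuming `surv` is the predicted survival probability S_i(tau).
toy_residual = function(surv, died, power = 2) {
  if (died) surv^power else (1 - surv)^power
}
toy_residual(0.7, died = TRUE, power = 2)  # Graf-style squared residual: 0.49
toy_residual(0.7, died = TRUE, power = 1)  # Schmid-style absolute residual: 0.7
```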
#' @@ -36,13 +37,6 @@ #' return the average of the time-integrated observation-wise scores: #' \deqn{\sum_{i=1}^N L(S_i, t_i, \delta_i) / N} #' -#' -#' \deqn{L_{ISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t^*))]} -#' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. -#' -#' The re-weighted ISS, RISS is given by -#' \deqn{L_{RISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t))]} -#' #' @template properness #' @templateVar improper_id ISS #' @templateVar proper_id RISS @@ -52,10 +46,11 @@ #' @template details_tmax #' #' @references -#' `r format_bib("schemper_2000", "schmid_2011")` +#' `r format_bib("schemper_2000", "schmid_2011", "sonabend2024", "kvamme2023")` #' #' @family Probabilistic survival measures #' @family distr survival measures +#' @template example_scoring_rules #' @export MeasureSurvSchmid = R6Class("MeasureSurvSchmid", inherit = MeasureSurv, @@ -77,11 +72,12 @@ MeasureSurvSchmid = R6Class("MeasureSurvSchmid", se = p_lgl(default = FALSE), proper = p_lgl(default = FALSE), eps = p_dbl(0, 1, default = 1e-3), - ERV = p_lgl(default = FALSE) + ERV = p_lgl(default = FALSE), + remove_obs = p_lgl(default = FALSE) ) ps$set_values( integrated = TRUE, method = 2L, se = FALSE, - proper = FALSE, eps = 1e-3, ERV = ERV + proper = FALSE, eps = 1e-3, ERV = ERV, remove_obs = FALSE ) range = if (ERV) c(-Inf, 1) else c(0, Inf) @@ -135,7 +131,7 @@ MeasureSurvSchmid = R6Class("MeasureSurvSchmid", truth = prediction$truth, distribution = prediction$data$distr, times = times, t_max = ps$t_max, p_max = ps$p_max, proper = ps$proper, train = train, - eps = ps$eps + eps = ps$eps, remove_obs = ps$remove_obs ) if (ps$se) { diff --git a/R/MeasureSurvSongAUC.R b/R/MeasureSurvSongAUC.R index 401a18883..ee79ec939 100644 --- a/R/MeasureSurvSongAUC.R +++ b/R/MeasureSurvSongAUC.R @@ -16,6 +16,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvSongAUC = R6Class("MeasureSurvSongAUC", inherit = MeasureSurvAUC, diff --git a/R/MeasureSurvUnoAUC.R b/R/MeasureSurvUnoAUC.R index b1565944f..9c5c3ffe4 100644 --- a/R/MeasureSurvUnoAUC.R +++ b/R/MeasureSurvUnoAUC.R @@ -16,6 +16,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvUnoAUC = R6Class("MeasureSurvUnoAUC", inherit = MeasureSurvAUC, diff --git a/R/bibentries.R b/R/bibentries.R index 7b4dccd19..f747e6ce1 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -741,5 +741,25 @@ bibentries = c( # nolint start title = "Simulating Survival Data Using the simsurv R Package", volume = "97", year = "2021" + ), + sonabend2024 = bibentry("misc", + archivePrefix = "arXiv", + arxivId = "2212.05260", + author = "Sonabend, Raphael and Zobolas, John and Kopper, Philipp and Burk, Lukas and Bender, Andreas", + month = "dec", + title = "Examining properness in the external validation of survival models with squared and logarithmic losses", + url = "https://arxiv.org/abs/2212.05260v2", + year = "2024" + ), + kvamme2023 = bibentry("article", + author = "Kvamme, Havard and Borgan, Ornulf", + issn = "1533-7928", + journal = "Journal of Machine Learning Research", + number = "2", + pages = "1--26", + title = "The Brier Score under Administrative Censoring: Problems and a Solution", + url = "http://jmlr.org/papers/v24/19-1030.html", + volume = "24", + year = "2023" ) ) diff --git a/R/integrated_scores.R b/R/integrated_scores.R 
index 2e6b1ba2d..2d7768e32 100644 --- a/R/integrated_scores.R +++ b/R/integrated_scores.R @@ -14,7 +14,7 @@ score_graf_schmid = function(true_times, unique_times, cdf, power = 2) { # - `t_max` > 0 # - `p_max` in [0,1] weighted_survival_score = function(loss, truth, distribution, times = NULL, - t_max = NULL, p_max = NULL, proper, train = NULL, eps, ...) { + t_max = NULL, p_max = NULL, proper, train = NULL, eps, remove_obs = FALSE) { assert_surv(truth) # test set's (times, status) test_times = truth[, "time"] @@ -90,8 +90,8 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, rownames(cdf) = unique_times # times x obs } - # apply `t_max` cutoff to remove observations - if (tmax_apply) { + # apply `t_max` cutoff to remove observations as a preprocessing step to alleviate inflation + if (tmax_apply && remove_obs) { true_times = test_times[test_times <= t_max] true_status = test_status[test_times <= t_max] cdf = cdf[, test_times <= t_max, drop = FALSE] @@ -118,6 +118,7 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, # use the `truth` (time, status) information from the train or test set if (is.null(train)) { + # no filtering of observations from test data: use ALL cens = survival::survfit(Surv(test_times, 1 - test_status) ~ 1) } else { # no filtering of observations from train data: use ALL @@ -128,11 +129,6 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, # G(t): KM estimate of the censoring distribution cens = matrix(c(cens$time, cens$surv), ncol = 2L) - # filter G(t) time points based on `t_max` cutoff - if (tmax_apply) { - cens = cens[cens[, 1L] <= t_max, , drop = FALSE] - } - score = .c_weight_survival_score(score, true_truth, unique_times, cens, proper, eps) colnames(score) = unique_times diff --git a/man-roxygen/details_tmax.R b/man-roxygen/details_tmax.R index 0d8fc023f..20425fc23 100644 --- a/man-roxygen/details_tmax.R +++ b/man-roxygen/details_tmax.R @@ -1,9 +1,23 @@ #' @section Time Cutoff Details: #' -#' If `t_max` or `p_max` is given, then \eqn{G(t)} will be fitted using **all observations** from the -#' train set (or test set) and only then the cutoff time will be applied. -#' This is to ensure that more data is used for fitting the censoring distribution via the -#' Kaplan-Meier. -#' Setting the `t_max` can help alleviate inflation of the score when `proper` is `TRUE`, -#' in cases where an observation is censored at the last observed time point. -#' This results in \eqn{G(t_{max}) = 0} and the use of `eps` instead (when `t_max` is `NULL`). +#' If `t_max` or `p_max` is given, then the predicted survival function \eqn{S(t)} is +#' truncated at the time cutoff for all observations. +#' +#' `r lifecycle::badge("experimental")` +#' +#' Also, if `remove_obs = TRUE`, **observations with observed times** \eqn{t > t_{max}} **are removed**. +#' This data preprocessing step mitigates issues that arise when using IPCW +#' in cases of administrative censoring, see Kvamme et al. (2023). +#' Practically, this step, along with setting a time cutoff `t_max`, helps mitigate +#' the **inflation of the score** observed when an observation is censored at the +#' final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +#' small constant `eps` instead. +#' This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +#' for more details. +#' Note that the `t_max` and `remove_obs` parameters do not affect the estimation +#' of the censoring distribution, i.e. 
**all observations are always used for estimating** \eqn{G(t)}. +#' +#' If `remove_obs = FALSE`, inflated scores may occur. While this aligns more closely +#' with the definitions presented in the original papers, it can lead to misleading +#' evaluation and poor optimization outcomes when using this score for model tuning. +#' diff --git a/man-roxygen/details_trainG.R b/man-roxygen/details_trainG.R index 2a131c29c..8c19e2a6c 100644 --- a/man-roxygen/details_trainG.R +++ b/man-roxygen/details_trainG.R @@ -1,6 +1,10 @@ #' @section Data used for Estimating Censoring Distribution: #' -#' If `task` and `train_set` are passed to `$score` then \eqn{G(t)} is fit on training data, -#' otherwise testing data. The first is likely to reduce any bias caused by calculating -#' parts of the measure on the test data it is evaluating. The training data is automatically -#' used in scoring resamplings. +#' If `task` and `train_set` are passed to `$score` then \eqn{G(t)} is fit using +#' **all observations** from the train set, otherwise the test set is used. +#' Using the train set is likely to reduce any bias caused by calculating parts of the +#' measure on the test data it is evaluating. +#' Also usually it means that more data is used for fitting the censoring +#' distribution \eqn{G(t)} via the Kaplan-Meier. +#' The training data is automatically used in scoring resamplings. +#' diff --git a/man-roxygen/example_auc_measures.R b/man-roxygen/example_auc_measures.R new file mode 100644 index 000000000..c229fa965 --- /dev/null +++ b/man-roxygen/example_auc_measures.R @@ -0,0 +1,27 @@ +#' <% measure = suppressWarnings(get(fullname)$new()) %> +#' +#' @examples +#' library(mlr3) +#' +#' # Define a survival Task +#' task = tsk("lung") +#' +#' # Create train and test set +#' part = partition(task) +#' +#' # Train Cox learner on the train set +#' cox = lrn("surv.coxph") +#' cox$train(task, row_ids = part$train) +#' +#' # Make predictions for the test set +#' p = cox$predict(task, row_ids = part$test) +#' +#' # Integrated AUC score +#' p$score(msr("<%=measure$id%>"), task = task, train_set = part$train, learner = cox) +#' +#' # AUC at specific time point +#' p$score(msr("<%=measure$id%>", times = 600), task = task, train_set = part$train, learner = cox) +#' +#' # Integrated AUC at specific time points +#' p$score(msr("<%=measure$id%>", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) +#' diff --git a/man-roxygen/example_scoring_rules.R b/man-roxygen/example_scoring_rules.R new file mode 100644 index 000000000..4701385b4 --- /dev/null +++ b/man-roxygen/example_scoring_rules.R @@ -0,0 +1,45 @@ +#' <% measure = suppressWarnings(get(fullname)$new()) %> +#' +#' @examples +#' library(mlr3) +#' +#' # Define a survival Task +#' task = tsk("lung") +#' +#' # Create train and test set +#' part = partition(task) +#' +#' # Train Cox learner on the train set +#' cox = lrn("surv.coxph") +#' cox$train(task, row_ids = part$train) +#' +#' # Make predictions for the test set +#' p = cox$predict(task, row_ids = part$test) +#' +#' # <%=improper_id%>, G(t) calculated using the test set +#' p$score(msr("<%=measure$id%>")) +#' +#' # <%=improper_id%>, G(t) calculated using the train set (always recommended) +#' p$score(msr("<%=measure$id%>"), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, ERV score (comparing with KM baseline) +#' p$score(msr("<%=measure$id%>", ERV = TRUE), task = task, train_set = part$train) +#' +#' # <%=improper_id%> at specific time point +#'
p$score(msr("<%=measure$id%>", times = 365), task = task, train_set = part$train) +#' +#' # <%=improper_id%> at multiple time points (integrated) +#' p$score(msr("<%=measure$id%>", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, use time cutoff +#' p$score(msr("<%=measure$id%>", t_max = 700), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, use time cutoff and also remove observations +#' p$score(msr("<%=measure$id%>", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, use time cutoff corresponding to specific proportion of censoring on the test set +#' p$score(msr("<%=measure$id%>", p_max = 0.8), task = task, train_set = part$train) +#' +#' # <%=proper_id%>, G(t) calculated using the train set +#' p$score(msr("<%=measure$id%>", proper = TRUE), task = task, train_set = part$train) +#' diff --git a/man-roxygen/param_proper.R b/man-roxygen/param_proper.R index 1e130026b..84205f74e 100644 --- a/man-roxygen/param_proper.R +++ b/man-roxygen/param_proper.R @@ -3,6 +3,8 @@ #' If `TRUE` then weights scores by the censoring distribution at #' the observed event time, which results in a strictly proper scoring #' rule if censoring and survival time distributions are independent -#' and a sufficiently large dataset is used. +#' and a sufficiently large dataset is used, see Sonabend et al. (2024). #' If `FALSE` then weights scores by the Graf method which is the #' more common usage but the loss is not proper. +#' See "Properness" section for more details. +#' diff --git a/man-roxygen/param_remove_obs.R b/man-roxygen/param_remove_obs.R new file mode 100644 index 000000000..bd7c7d5ca --- /dev/null +++ b/man-roxygen/param_remove_obs.R @@ -0,0 +1,6 @@ +#' @section Parameter details: +#' - `remove_obs` (`logical(1)`)\cr +#' Only effective when `t_max` or `p_max` is provided. Default is `FALSE`. +#' If `TRUE`, then we **remove test observations** for which the observed time (event or censoring) is strictly larger than `t_max`. +#' See "Time Cutoff Details" section for more details. +#' diff --git a/man-roxygen/param_tmax.R b/man-roxygen/param_tmax.R index 268605756..a706e2442 100644 --- a/man-roxygen/param_tmax.R +++ b/man-roxygen/param_tmax.R @@ -1,8 +1,8 @@ #' @section Parameter details: #' - `t_max` (`numeric(1)`)\cr -#' Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. +#' Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +#' (truncate \eqn{S(t)}). #' Mutually exclusive with `p_max` or `times`. -#' This will effectively remove test observations for which the observed time -#' (event or censoring) is strictly more than `t_max`. -#' It's recommended to set `t_max` to avoid division by `eps`, see Details. +#' It's recommended to set `t_max` to avoid division by `eps`, see "Time Cutoff Details" section. #' If `t_max` is not specified, an `Inf` time horizon is assumed. +#' diff --git a/man-roxygen/properness.R b/man-roxygen/properness.R index cb1c6d8f0..b0c956818 100644 --- a/man-roxygen/properness.R +++ b/man-roxygen/properness.R @@ -1,4 +1,5 @@ #' @section Properness: +#' `r lifecycle::badge("experimental")` #' #' <%=proper_id%> is strictly proper when the censoring distribution is independent #' of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. 
@@ -7,3 +8,6 @@ #' Results may be very different if many observations are censored at the last #' observed time due to division by \eqn{1/eps} in `proper = TRUE`. #' +#' See Sonabend et al. (2024) for more details. +#' The use of `proper = TRUE` is considered experimental and should be used with caution. +#' diff --git a/man/mlr_measures_surv.chambless_auc.Rd b/man/mlr_measures_surv.chambless_auc.Rd index 6b8476ace..88e68bca9 100644 --- a/man/mlr_measures_surv.chambless_auc.Rd +++ b/man/mlr_measures_surv.chambless_auc.Rd @@ -61,6 +61,32 @@ If \code{integrated == FALSE} then a single time point at which to return the sc } } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# Integrated AUC score +p$score(msr("surv.chambless_auc"), task = task, train_set = part$train, learner = cox) + +# AUC at specific time point +p$score(msr("surv.chambless_auc", times = 600), task = task, train_set = part$train, learner = cox) + +# Integrated AUC at specific time points +p$score(msr("surv.chambless_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) + +} \references{ Chambless LE, Diao G (2006). \dQuote{Estimation of time-dependent area under the ROC curve for long-term risk prediction.} diff --git a/man/mlr_measures_surv.cindex.Rd b/man/mlr_measures_surv.cindex.Rd index 5697b9764..9fba1e787 100644 --- a/man/mlr_measures_surv.cindex.Rd +++ b/man/mlr_measures_surv.cindex.Rd @@ -105,6 +105,7 @@ p$score(msr("surv.cindex", weight_meth = "G2"), # Harrell's C-index evaluated up to a specific time horizon p$score(msr("surv.cindex", t_max = 97)) + # Harrell's C-index evaluated up to the time corresponding to 30\% of censoring p$score(msr("surv.cindex", p_max = 0.3)) diff --git a/man/mlr_measures_surv.dcalib.Rd b/man/mlr_measures_surv.dcalib.Rd index e0a7779e0..d096a8c35 100644 --- a/man/mlr_measures_surv.dcalib.Rd +++ b/man/mlr_measures_surv.dcalib.Rd @@ -5,6 +5,8 @@ \alias{MeasureSurvDCalibration} \title{D-Calibration Survival Measure} \description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + This calibration method is defined by calculating the following statistic: \deqn{s = B/n \sum_i (P_i - n/B)^2} where \eqn{B} is number of 'buckets' (that equally divide \eqn{[0,1]} into intervals), @@ -14,8 +16,8 @@ of observations in the \eqn{i}th interval. An observation is assigned to the falls within the corresponding interval. This statistic assumes that censoring time is independent of death time. -A model is well-calibrated if \eqn{s \sim Unif(B)}, tested with \code{chisq.test} -(\eqn{p > 0.05} if well-calibrated). +A model is well D-calibrated if \eqn{s \sim Unif(B)}, tested with \code{chisq.test} +(\eqn{p > 0.05} if well-calibrated, i.e. higher p-values are preferred). Model \eqn{i} is better calibrated than model \eqn{j} if \eqn{s(i) < s(j)}, meaning that \emph{lower values} of this measure are preferred. } @@ -25,7 +27,7 @@ The former is useful for model comparison whereas the latter is useful for deter is well-calibrated. If \code{chisq = FALSE} and \code{s} is the predicted value then you can manually compute the p.value with \code{pchisq(s, B - 1, lower.tail = FALSE)}. 
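The statistic-to-p-value correspondence used in the `truncate` docs below can be checked directly; with the default B = 10 buckets the test has 9 degrees of freedom:

```r
# p-values for statistic values s = 10 and s = 16 with B = 10 buckets (df = 9)
pchisq(c(10, 16), df = 9, lower.tail = FALSE)
# approximately 0.35 and 0.06, the values quoted for `truncate` below
```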
-NOTE: This measure is still experimental both theoretically and in implementation. Results +\strong{NOTE}: This measure is still experimental both theoretically and in implementation. Results should therefore only be taken as an indicator of performance and not for conclusive judgements about model calibration. } @@ -72,11 +74,12 @@ Default is \code{FALSE} and returns the statistic \code{s}. You can manually get the p-value by executing \code{pchisq(s, B - 1, lower.tail = FALSE)}. The null hypothesis is that the model is D-calibrated. \item \code{truncate} (\code{double(1)}) \cr -This parameter controls the upper bound of the output statistic, -when \code{chisq} is \code{FALSE}. We use \code{truncate = Inf} by default but \eqn{10} may be sufficient -for most purposes, which corresponds to a p-value of 0.35 for the chisq.test using -\eqn{B = 10} buckets. Values \eqn{>10} translate to even lower p-values and thus -less calibrated models. If the number of buckets \eqn{B} changes, you probably will want to +This parameter controls the upper bound of the output statistic, when \code{chisq} is \code{FALSE}. +We use \code{truncate = Inf} by default but values between \eqn{10-16} are sufficient +for most purposes, which correspond to p-values of \eqn{0.35-0.06} for the \code{chisq.test} using +the default \eqn{B = 10} buckets. +Values \eqn{s > 10} translate to even lower p-values and thus less D-calibrated models. +If the number of buckets \eqn{B} changes, you probably will want to change the \code{truncate} value as well to correspond to the same p-value significance. Note that truncation may severely limit automated tuning with this measure. } diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index 4d2924dfc..37b26f853 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -17,13 +17,13 @@ outcome \eqn{(t_i, \delta_i)} (time and censoring indicator) and predicted survival function \eqn{S_i(t)}, the \emph{observation-wise} loss integrated across the time dimension up to the time cutoff \eqn{\tau^*}, is: -\deqn{L_{ISBS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +\deqn{L_{ISBS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. The \strong{re-weighted ISBS} (RISBS) is: -\deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +\deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -54,6 +54,7 @@ msr("surv.graf") proper \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr eps \tab numeric \tab 0.001 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr ERV \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr + remove_obs \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr } } @@ -85,11 +86,10 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \itemize{ \item \code{t_max} (\code{numeric(1)})\cr -Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to.
+Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +(truncate \eqn{S(t)}). Mutually exclusive with \code{p_max} or \code{times}. -This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max}. -It's recommended to set \code{t_max} to avoid division by \code{eps}, see Details. +It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -126,9 +126,10 @@ Default is \code{FALSE} (returns the mean). If \code{TRUE} then weights scores by the censoring distribution at the observed event time, which results in a strictly proper scoring rule if censoring and survival time distributions are independent -and a sufficiently large dataset is used. +and a sufficiently large dataset is used, see Sonabend et al. (2024). If \code{FALSE} then weights scores by the Graf method which is the more common usage but the loss is not proper. +See "Properness" section for more details. } @@ -146,10 +147,19 @@ If \code{TRUE} then the Explained Residual Variation method is applied, which means the score is standardized against a Kaplan-Meier baseline. Default is \code{FALSE}. } + + +\itemize{ +\item \code{remove_obs} (\code{logical(1)})\cr +Only effective when \code{t_max} or \code{p_max} is provided. Default is \code{FALSE}. +If \code{TRUE}, then we \strong{remove test observations} for which the observed time (event or censoring) is strictly larger than \code{t_max}. +See "Time Cutoff Details" section for more details. +} } \section{Properness}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} RISBS is strictly proper when the censoring distribution is independent of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. @@ -157,6 +167,9 @@ ISBS is never proper. Use \code{proper = FALSE} for ISBS and \code{proper = TRUE} for RISBS. Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. + +See Sonabend et al. (2024) for more details. +The use of \code{proper = TRUE} is considered experimental and should be used with caution. } \section{Time points used for evaluation}{ @@ -196,29 +209,98 @@ not used). \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. 
} \section{Time Cutoff Details}{ -If \code{t_max} or \code{p_max} is given, then \eqn{G(t)} will be fitted using \strong{all observations} from the -train set (or test set) and only then the cutoff time will be applied. -This is to ensure that more data is used for fitting the censoring distribution via the -Kaplan-Meier. -Setting the \code{t_max} can help alleviate inflation of the score when \code{proper} is \code{TRUE}, -in cases where an observation is censored at the last observed time point. -This results in \eqn{G(t_{max}) = 0} and the use of \code{eps} instead (when \code{t_max} is \code{NULL}). +If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is +truncated at the time cutoff for all observations. + +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This data preprocessing step mitigates issues that arise when using IPCW +in cases of administrative censoring, see Kvamme et al. (2023). +Practically, this step, along with setting a time cutoff \code{t_max}, helps mitigate +the \strong{inflation of the score} observed when an observation is censored at the +final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +small constant \code{eps} instead. +This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +for more details. +Note that the \code{t_max} and \code{remove_obs} parameters do not affect the estimation +of the censoring distribution, i.e. \strong{all observations are always used for estimating} \eqn{G(t)}. + +If \code{remove_obs = FALSE}, inflated scores may occur. While this aligns more closely +with the definitions presented in the original papers, it can lead to misleading +evaluation and poor optimization outcomes when using this score for model tuning.
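Schematically, the preprocessing that `remove_obs = TRUE` enables is the filtering step added to `weighted_survival_score()` in `R/integrated_scores.R` above, paraphrased here with a helper `keep` index for readability:

```r
# paraphrase of the remove_obs step in weighted_survival_score()
if (tmax_apply && remove_obs) {
  keep = test_times <= t_max       # drop observations observed after t_max
  true_times = test_times[keep]
  true_status = test_status[keep]
  cdf = cdf[, keep, drop = FALSE]  # keep the matching prediction columns
}
```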
} +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# ISBS, G(t) calculated using the test set +p$score(msr("surv.graf")) + +# ISBS, G(t) calculated using the train set (always recommended) +p$score(msr("surv.graf"), task = task, train_set = part$train) + +# ISBS, ERV score (comparing with KM baseline) +p$score(msr("surv.graf", ERV = TRUE), task = task, train_set = part$train) + +# ISBS at specific time point +p$score(msr("surv.graf", times = 365), task = task, train_set = part$train) + +# ISBS at multiple time points (integrated) +p$score(msr("surv.graf", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) + +# ISBS, use time cutoff +p$score(msr("surv.graf", t_max = 700), task = task, train_set = part$train) + +# ISBS, use time cutoff and also remove observations +p$score(msr("surv.graf", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) + +# ISBS, use time cutoff corresponding to specific proportion of censoring on the test set +p$score(msr("surv.graf", p_max = 0.8), task = task, train_set = part$train) + +# RISBS, G(t) calculated using the train set +p$score(msr("surv.graf", proper = TRUE), task = task, train_set = part$train) + +} \references{ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). \dQuote{Assessment and comparison of prognostic classification schemes for survival data.} \emph{Statistics in Medicine}, \bold{18}(17-18), 2529--2545. \doi{10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5}. + +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. + +Kvamme, Havard, Borgan, Ornulf (2023). +\dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} +\emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. +ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. } \seealso{ Other survival measures: diff --git a/man/mlr_measures_surv.hung_auc.Rd b/man/mlr_measures_surv.hung_auc.Rd index e5207db4e..3f05b93ce 100644 --- a/man/mlr_measures_surv.hung_auc.Rd +++ b/man/mlr_measures_surv.hung_auc.Rd @@ -61,6 +61,32 @@ If \code{integrated == FALSE} then a single time point at which to return the sc } } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# Integrated AUC score +p$score(msr("surv.hung_auc"), task = task, train_set = part$train, learner = cox) + +# AUC at specific time point +p$score(msr("surv.hung_auc", times = 600), task = task, train_set = part$train, learner = cox) + +# Integrated AUC at specific time points +p$score(msr("surv.hung_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) + +} \references{ Hung H, Chiang C (2010). 
\dQuote{Estimation methods for time-dependent AUC models with survival data.} diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index 5a6d7af85..9498d433c 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -15,13 +15,13 @@ outcome \eqn{(t_i, \delta_i)} (time and censoring indicator) and predicted survival function \eqn{S_i(t)}, the \emph{observation-wise} loss integrated across the time dimension up to the time cutoff \eqn{\tau^*}, is: -\deqn{L_{ISLL}(S_i, t_i, \delta_i) = -\text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +\deqn{L_{ISLL}(S_i, t_i, \delta_i) = - \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. The \strong{re-weighted ISLL} (RISLL) is: -\deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +\deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)] \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -52,6 +52,7 @@ msr("surv.intlogloss") proper \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr eps \tab numeric \tab 0.001 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr ERV \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr + remove_obs \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr } } @@ -83,11 +84,10 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \itemize{ \item \code{t_max} (\code{numeric(1)})\cr -Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. +Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +(truncate \eqn{S(t)}). Mutually exclusive with \code{p_max} or \code{times}. -This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max}. -It's recommended to set \code{t_max} to avoid division by \code{eps}, see Details. +It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -124,9 +124,10 @@ Default is \code{FALSE} (returns the mean). If \code{TRUE} then weights scores by the censoring distribution at the observed event time, which results in a strictly proper scoring rule if censoring and survival time distributions are independent -and a sufficiently large dataset is used. +and a sufficiently large dataset is used, see Sonabend et al. (2024). If \code{FALSE} then weights scores by the Graf method which is the more common usage but the loss is not proper. +See "Properness" section for more details. } @@ -144,10 +145,19 @@ If \code{TRUE} then the Explained Residual Variation method is applied, which means the score is standardized against a Kaplan-Meier baseline. Default is \code{FALSE}. } + + +\itemize{ +\item \code{remove_obs} (\code{logical(1)})\cr +Only effective when \code{t_max} or \code{p_max} is provided. Default is \code{FALSE}.
+If \code{TRUE}, then we \strong{remove test observations} for which the observed time (event or censoring) is strictly larger than \code{t_max}. +See "Time Cutoff Details" section for more details. +} } \section{Properness}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} RISLL is strictly proper when the censoring distribution is independent of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. @@ -155,6 +165,9 @@ ISLL is never proper. Use \code{proper = FALSE} for ISLL and \code{proper = TRUE} for RISLL. Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. + +See Sonabend et al. (2024) for more details. +The use of \code{proper = TRUE} is considered experimental and should be used with caution. } \section{Time points used for evaluation}{ @@ -194,29 +207,98 @@ not used). \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. } \section{Time Cutoff Details}{ -If \code{t_max} or \code{p_max} is given, then \eqn{G(t)} will be fitted using \strong{all observations} from the -train set (or test set) and only then the cutoff time will be applied. -This is to ensure that more data is used for fitting the censoring distribution via the -Kaplan-Meier. -Setting the \code{t_max} can help alleviate inflation of the score when \code{proper} is \code{TRUE}, -in cases where an observation is censored at the last observed time point. -This results in \eqn{G(t_{max}) = 0} and the use of \code{eps} instead (when \code{t_max} is \code{NULL}). +If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is +truncated at the time cutoff for all observations. + +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This data preprocessing step mitigates issues that arise when using IPCW +in cases of administrative censoring, see Kvamme et al. (2023). +Practically, this step, along with setting a time cutoff \code{t_max}, helps mitigate +the \strong{inflation of the score} observed when an observation is censored at the +final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +small constant \code{eps} instead. +This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +for more details. 
+Note that the \code{t_max} and \code{remove_obs} parameters do not affect the estimation +of the censoring distribution, i.e. \strong{all observations are always used for estimating} \eqn{G(t)}. + +If \code{remove_obs = FALSE}, inflated scores may occur. While this aligns more closely +with the definitions presented in the original papers, it can lead to misleading +evaluation and poor optimization outcomes when using this score for model tuning. } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# ISLL, G(t) calculated using the test set +p$score(msr("surv.intlogloss")) + +# ISLL, G(t) calculated using the train set (always recommended) +p$score(msr("surv.intlogloss"), task = task, train_set = part$train) + +# ISLL, ERV score (comparing with KM baseline) +p$score(msr("surv.intlogloss", ERV = TRUE), task = task, train_set = part$train) + +# ISLL at specific time point +p$score(msr("surv.intlogloss", times = 365), task = task, train_set = part$train) + +# ISLL at multiple time points (integrated) +p$score(msr("surv.intlogloss", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) + +# ISLL, use time cutoff +p$score(msr("surv.intlogloss", t_max = 700), task = task, train_set = part$train) + +# ISLL, use time cutoff and also remove observations +p$score(msr("surv.intlogloss", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) + +# ISLL, use time cutoff corresponding to specific proportion of censoring on the test set +p$score(msr("surv.intlogloss", p_max = 0.8), task = task, train_set = part$train) + +# RISLL, G(t) calculated using the train set +p$score(msr("surv.intlogloss", proper = TRUE), task = task, train_set = part$train) + +} \references{ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). \dQuote{Assessment and comparison of prognostic classification schemes for survival data.} \emph{Statistics in Medicine}, \bold{18}(17-18), 2529--2545. \doi{10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5}. + +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. + +Kvamme, Havard, Borgan, Ornulf (2023). +\dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} +\emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. +ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. } \seealso{ Other survival measures: diff --git a/man/mlr_measures_surv.logloss.Rd b/man/mlr_measures_surv.logloss.Rd index 245364758..bc7241647 100644 --- a/man/mlr_measures_surv.logloss.Rd +++ b/man/mlr_measures_surv.logloss.Rd @@ -86,19 +86,27 @@ Default is \code{FALSE}. \itemize{ \item \code{IPCW} (\code{logical(1)})\cr -If \code{TRUE} (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). +If \code{TRUE} (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). See Sonabend et al. (2024) for more details.
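For the `IPCW` flag just documented, a short usage sketch may help; it follows the pattern of the examples added elsewhere in this PR (`lung` task, Cox learner) and is illustrative only:

```r
library(mlr3)

task = tsk("lung")
part = partition(task)

cox = lrn("surv.coxph")
cox$train(task, row_ids = part$train)
p = cox$predict(task, row_ids = part$test)

# RNLL (IPCW = TRUE, default, proper) with G(t) fit on the train set
p$score(msr("surv.logloss"), task = task, train_set = part$train)

# NLL (improper)
p$score(msr("surv.logloss", IPCW = FALSE))
```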
} } \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. } +\references{ +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. +} \seealso{ Other survival measures: \code{\link{mlr_measures_surv.calib_alpha}}, diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index d01cd26ea..03723f90a 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -14,25 +14,19 @@ outcome \eqn{(t_i, \delta_i)} (time and censoring indicator) and predicted survival function \eqn{S_i(t)}, the \emph{observation-wise} loss integrated across the time dimension up to the time cutoff \eqn{\tau^*}, is: -\deqn{L_{ISS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +\deqn{L_{ISS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. The \strong{re-weighted ISS} (RISS) is: -\deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +\deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. To get a single score across all \eqn{N} observations of the test set, we return the average of the time-integrated observation-wise scores: \deqn{\sum_{i=1}^N L(S_i, t_i, \delta_i) / N} - -\deqn{L_{ISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t^*))]} -where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution.
- -The re-weighted ISS, RISS is given by -\deqn{L_{RISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t))]} } \section{Dictionary}{ @@ -57,6 +51,7 @@ msr("surv.schmid") proper \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr eps \tab numeric \tab 0.001 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr ERV \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr + remove_obs \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr } } @@ -88,11 +83,10 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \itemize{ \item \code{t_max} (\code{numeric(1)})\cr -Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. +Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +(truncate \eqn{S(t)}). Mutually exclusive with \code{p_max} or \code{times}. -This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max}. -It's recommended to set \code{t_max} to avoid division by \code{eps}, see Details. +It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -129,9 +123,10 @@ Default is \code{FALSE} (returns the mean). If \code{TRUE} then weights scores by the censoring distribution at the observed event time, which results in a strictly proper scoring rule if censoring and survival time distributions are independent -and a sufficiently large dataset is used. +and a sufficiently large dataset is used, see Sonabend et al. (2024). If \code{FALSE} then weights scores by the Graf method which is the more common usage but the loss is not proper. +See "Properness" section for more details. } @@ -149,10 +144,19 @@ If \code{TRUE} then the Explained Residual Variation method is applied, which means the score is standardized against a Kaplan-Meier baseline. Default is \code{FALSE}. } + + +\itemize{ +\item \code{remove_obs} (\code{logical(1)})\cr +Only effective when \code{t_max} or \code{p_max} is provided. Default is \code{FALSE}. +If \code{TRUE}, then we \strong{remove test observations} for which the observed time (event or censoring) is strictly larger than \code{t_max}. +See "Time Cutoff Details" section for more details. +} } \section{Properness}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} RISS is strictly proper when the censoring distribution is independent of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. @@ -160,6 +164,9 @@ ISS is never proper. Use \code{proper = FALSE} for ISS and \code{proper = TRUE} for RISS. Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. + +See Sonabend et al. (2024) for more details. +The use of \code{proper = TRUE} is considered experimental and should be used with caution. } \section{Time points used for evaluation}{ @@ -199,24 +206,84 @@ not used). \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. 
+If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using
+\strong{all observations} from the train set, otherwise the test set is used.
+Using the train set is likely to reduce any bias caused by calculating parts of the
+measure on the test data it is evaluating.
+It also usually means that more data is used for fitting the censoring
+distribution \eqn{G(t)} via the Kaplan-Meier estimator.
+The training data is automatically used in scoring resamplings.
 }
 
 \section{Time Cutoff Details}{
-If \code{t_max} or \code{p_max} is given, then \eqn{G(t)} will be fitted using \strong{all observations} from the
-train set (or test set) and only then the cutoff time will be applied.
-This is to ensure that more data is used for fitting the censoring distribution via the
-Kaplan-Meier.
-Setting the \code{t_max} can help alleviate inflation of the score when \code{proper} is \code{TRUE},
-in cases where an observation is censored at the last observed time point.
-This results in \eqn{G(t_{max}) = 0} and the use of \code{eps} instead (when \code{t_max} is \code{NULL}).
+If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is
+truncated at the time cutoff for all observations.
+
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
+Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}.
+This data preprocessing step mitigates issues that arise when using IPCW
+in cases of administrative censoring, see Kvamme and Borgan (2023).
+Practically, this step, along with setting a time cutoff \code{t_max}, helps mitigate
+the \strong{inflation of the score} observed when an observation is censored at the
+final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a
+small constant \code{eps} instead.
+This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024)
+for more details.
+Note that the \code{t_max} and \code{remove_obs} parameters do not affect the estimation
+of the censoring distribution, i.e. \strong{all observations are always used for estimating} \eqn{G(t)}.
+
+If \code{remove_obs = FALSE}, inflated scores may occur. While this aligns more closely
+with the definitions presented in the original papers, it can lead to misleading
+evaluation and poor optimization outcomes when using this score for model tuning.
 }
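The inflation described above can be checked directly. What follows is a minimal sketch reusing the lung/Cox setup from the examples below; the cutoff value 500 is an arbitrary choice and the printed scores depend on the random train/test split:

library(mlr3)
library(mlr3proba)

task = tsk("lung")
part = partition(task)
cox = lrn("surv.coxph")
cox$train(task, row_ids = part$train)
p = cox$predict(task, row_ids = part$test)

# If the last test observation is censored, G(t) = 0 at that point and the
# small constant eps is used instead, which can inflate the proper score:
p$score(msr("surv.schmid", proper = TRUE), task = task, train_set = part$train)

# Truncating at t_max mitigates this; remove_obs = TRUE additionally drops
# test observations with observed time > t_max (G(t) is still fit on all of them):
p$score(msr("surv.schmid", proper = TRUE, t_max = 500), task = task, train_set = part$train)
p$score(msr("surv.schmid", proper = TRUE, t_max = 500, remove_obs = TRUE), task = task, train_set = part$train)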
+\examples{
+library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# ISS, G(t) calculated using the test set
+p$score(msr("surv.schmid"))
+
+# ISS, G(t) calculated using the train set (always recommended)
+p$score(msr("surv.schmid"), task = task, train_set = part$train)
+
+# ISS, ERV score (comparing with KM baseline)
+p$score(msr("surv.schmid", ERV = TRUE), task = task, train_set = part$train)
+
+# ISS at specific time point
+p$score(msr("surv.schmid", times = 365), task = task, train_set = part$train)
+
+# ISS at multiple time points (integrated)
+p$score(msr("surv.schmid", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train)
+
+# ISS, use time cutoff
+p$score(msr("surv.schmid", t_max = 700), task = task, train_set = part$train)
+
+# ISS, use time cutoff and also remove observations with observed time > t_max
+p$score(msr("surv.schmid", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train)
+
+# ISS, use time cutoff corresponding to a specific proportion of censoring on the test set
+p$score(msr("surv.schmid", p_max = 0.8), task = task, train_set = part$train)
+
+# RISS (proper = TRUE), G(t) calculated using the train set
+p$score(msr("surv.schmid", proper = TRUE), task = task, train_set = part$train)
+
+}
 \references{
 Schemper, Michael, Henderson, Robin (2000).
 \dQuote{Predictive Accuracy and Explained Variation in Cox Regression.}
@@ -227,6 +294,15 @@ Schmid, Matthias, Hielscher, Thomas, Augustin, Thomas, Gefeller, Olaf (2011).
 \dQuote{A Robust Alternative to the Schemper-Henderson Estimator of Prediction Error.}
 \emph{Biometrics}, \bold{67}(2), 524--535.
 \doi{10.1111/j.1541-0420.2010.01459.x}.
+
+Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024).
+\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.}
+\url{https://arxiv.org/abs/2212.05260v2}.
+
+Kvamme, Havard, Borgan, Ornulf (2023).
+\dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.}
+\emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26.
+ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}.
 }
 \seealso{
 Other survival measures:
diff --git a/man/mlr_measures_surv.song_auc.Rd b/man/mlr_measures_surv.song_auc.Rd
index f57df5545..deceab503 100644
--- a/man/mlr_measures_surv.song_auc.Rd
+++ b/man/mlr_measures_surv.song_auc.Rd
@@ -69,6 +69,32 @@ incident TPR, \code{cumulative} refers to cumulative TPR.
 }
 }
 
+\examples{
+library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# Integrated AUC score
+p$score(msr("surv.song_auc"), task = task, train_set = part$train, learner = cox)
+
+# AUC at specific time point
+p$score(msr("surv.song_auc", times = 600), task = task, train_set = part$train, learner = cox)
+
+# Integrated AUC at specific time points
+p$score(msr("surv.song_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox)
+
+}
 \references{
 Song, Xiao, Zhou, Xiao-Hua (2008).
 \dQuote{A semiparametric approach for the covariate specific ROC curve with survival outcome.}
diff --git a/man/mlr_measures_surv.uno_auc.Rd b/man/mlr_measures_surv.uno_auc.Rd
index 5536ad7ad..b4206ac29 100644
--- a/man/mlr_measures_surv.uno_auc.Rd
+++ b/man/mlr_measures_surv.uno_auc.Rd
@@ -61,6 +61,32 @@ If \code{integrated == FALSE} then a single time point at which to return the sc
 }
 }
 
+\examples{
+library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# Integrated AUC score
+p$score(msr("surv.uno_auc"), task = task, train_set = part$train, learner = cox)
+
+# AUC at specific time point
+p$score(msr("surv.uno_auc", times = 600), task = task, train_set = part$train, learner = cox)
+
+# Integrated AUC at specific time points
+p$score(msr("surv.uno_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox)
+
+}
 \references{
 Uno H, Cai T, Tian L, Wei LJ (2007).
 \dQuote{Evaluating Prediction Rules for t-Year Survivors With Censored Regression Models.}
diff --git a/src/survival_scores.cpp b/src/survival_scores.cpp
index 52c2f396b..6d569f9bb 100644
--- a/src/survival_scores.cpp
+++ b/src/survival_scores.cpp
@@ -91,13 +91,12 @@ NumericMatrix c_weight_survival_score(const NumericMatrix& score,
         k = 1;
         break;
       } else if (times[i] >= cens_times[l] &&
-                 (l == cens_times.length() - 1 ||
-                  times[i] < cens_times[l + 1])) {
+                 (l == cens_times.length() - 1 || times[i] < cens_times[l + 1])) {
         k = cens_surv[l];
-        // k == 0 only if last obsv censored, therefore mat is set to 0
-        // anyway This division by eps can cause inflation of the score,
-        // due to a very large value for a particular (i-obs, j-time) use
-        // 't_max' to filter 'cens' in that case
+        // k == 0 only if last obs censored, therefore mat is set to 0 anyway
+        // This division by eps can cause inflation of the score,
+        // due to a very large value for a particular (i-obs, j-time)
+        // Use 't_max' to filter 'cens' in that case
         if (k == 0) {
           k = eps;
         }
diff --git a/tests/testthat/test_mlr_measures.R b/tests/testthat/test_mlr_measures.R
index c40630bf5..486e3fd3a 100644
--- a/tests/testthat/test_mlr_measures.R
+++ b/tests/testthat/test_mlr_measures.R
@@ -163,15 +163,18 @@ test_that("graf: t_max, p_max, times", {
   t_max = 100
   times_flt = times[times <= t_max] # keep only times until the `t_max`
   m0 = p$score(msr("surv.graf")) # uses all test time points
-  m1 = p$score(msr("surv.graf", times = times_flt)) # uses times_flt
+  m1 = p$score(msr("surv.graf", times = times_flt)) # uses `times_flt`
   m2 = p$score(msr("surv.graf", t_max = t_max)) # 100
+  m22 = p$score(msr("surv.graf", t_max = t_max, remove_obs = TRUE)) # 100
   m3 = p$score(msr("surv.graf", t_max = max(times))) # 104
-  m4 = p$score(msr("surv.graf", t_max = max(times) + 1)) # 105
+  m4 = p$score(msr("surv.graf", t_max = max(times) + 10)) # 114
   # different time points considered
   expect_true(m0 != m1)
-  # same time points are used, but `t_max` also removes observations
-  expect_true(m1 != m2)
+  # same time points are used, and no removal of observations (original Graf score)
+  expect_equal(m1, m2)
+  # same time points are used, but observations with `t > t_max` are removed
+  expect_true(m2 != m22)
   # different `t_max` => different time points used
   expect_true(m2 != m3)
   # different `t_max` but after the max evaluation time point, so result stays the same
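As a standalone sketch of the mechanism the C++ comment and the test above refer to (toy data made up for illustration; only the survival package is assumed): the censoring distribution G(t) is a Kaplan-Meier fit with the status indicator flipped, and it drops to zero whenever the last observed time is censored.

library(survival)

time = c(2, 3, 5, 7, 9)
status = c(1, 1, 0, 1, 0) # last observation is censored

# Kaplan-Meier estimate of the censoring distribution G(t):
# censoring is treated as the event, so the status indicator is flipped
G = survfit(Surv(time, 1 - status) ~ 1)
summary(G)

# G(t) = 0 at t = 9 because the last observation is censored, so the IPCW
# weight 1/G(t) would divide by the small constant eps there and inflate
# the score; a cutoff t_max < 9 avoids evaluating in that region.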