From aea626f1e457accc9e7737cba4712245a98b0dff Mon Sep 17 00:00:00 2001 From: john Date: Wed, 11 Dec 2024 23:28:35 +0100 Subject: [PATCH 01/24] fix styling --- src/survival_scores.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/survival_scores.cpp b/src/survival_scores.cpp index 52c2f396b..6d569f9bb 100644 --- a/src/survival_scores.cpp +++ b/src/survival_scores.cpp @@ -91,13 +91,12 @@ NumericMatrix c_weight_survival_score(const NumericMatrix& score, k = 1; break; } else if (times[i] >= cens_times[l] && - (l == cens_times.length() - 1 || - times[i] < cens_times[l + 1])) { + (l == cens_times.length() - 1 || times[i] < cens_times[l + 1])) { k = cens_surv[l]; - // k == 0 only if last obsv censored, therefore mat is set to 0 - // anyway This division by eps can cause inflation of the score, - // due to a very large value for a particular (i-obs, j-time) use - // 't_max' to filter 'cens' in that case + // k == 0 only if last obs censored, therefore mat is set to 0 anyway + // This division by eps can cause inflation of the score, + // due to a very large value for a particular (i-obs, j-time) + // Use 't_max' to filter 'cens' in that case if (k == 0) { k = eps; } From 0cf1567e1cc8226f2705e084ebcdd806fb5224cb Mon Sep 17 00:00:00 2001 From: john Date: Wed, 11 Dec 2024 23:32:22 +0100 Subject: [PATCH 02/24] fix: t_max does not remove observations for ISBS --- R/integrated_scores.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/integrated_scores.R b/R/integrated_scores.R index 2e6b1ba2d..eeda03f6b 100644 --- a/R/integrated_scores.R +++ b/R/integrated_scores.R @@ -90,8 +90,8 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, rownames(cdf) = unique_times # times x obs } - # apply `t_max` cutoff to remove observations - if (tmax_apply) { + # apply `t_max` cutoff to remove observations for RISBS + if (tmax_apply && proper) { true_times = test_times[test_times <= t_max] true_status = 
test_status[test_times <= t_max] cdf = cdf[, test_times <= t_max, drop = FALSE] @@ -128,8 +128,8 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, # G(t): KM estimate of the censoring distribution cens = matrix(c(cens$time, cens$surv), ncol = 2L) - # filter G(t) time points based on `t_max` cutoff - if (tmax_apply) { + # filter G(t) time points based on `t_max` cutoff for ISBS + if (tmax_apply && proper) { cens = cens[cens[, 1L] <= t_max, , drop = FALSE] } From c4eb657ef00526c6da88bfb53c08d8c1bd4714d2 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 11 Dec 2024 23:32:56 +0100 Subject: [PATCH 03/24] fix test --- tests/testthat/test_mlr_measures.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test_mlr_measures.R b/tests/testthat/test_mlr_measures.R index c40630bf5..e04abf5eb 100644 --- a/tests/testthat/test_mlr_measures.R +++ b/tests/testthat/test_mlr_measures.R @@ -170,8 +170,8 @@ test_that("graf: t_max, p_max, times", { # different time points considered expect_true(m0 != m1) - # same time points are used, but `t_max` also removes observations - expect_true(m1 != m2) + # same time points are used (`t_max` does NOT remove observations) + expect_equal(m1, m2) # different `t_max` => different time points used expect_true(m2 != m3) # different `t_max` but after the max evaluation time point, so result stays the same From b297f7996beb4174c60f4aa3669108531c37603e Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 10:21:13 +0100 Subject: [PATCH 04/24] small comment fix --- R/integrated_scores.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/integrated_scores.R b/R/integrated_scores.R index eeda03f6b..018b372cd 100644 --- a/R/integrated_scores.R +++ b/R/integrated_scores.R @@ -128,7 +128,7 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, # G(t): KM estimate of the censoring distribution cens = matrix(c(cens$time, cens$surv), ncol = 2L) - # filter 
G(t) time points based on `t_max` cutoff for ISBS + # filter G(t) time points based on `t_max` cutoff for RISBS if (tmax_apply && proper) { cens = cens[cens[, 1L] <= t_max, , drop = FALSE] } From 565b31a4a2e28b362a6433d897b66690b7fd307c Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 11:15:11 +0100 Subject: [PATCH 05/24] add scoring rules paper --- R/bibentries.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/R/bibentries.R b/R/bibentries.R index 7b4dccd19..779e04438 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -741,5 +741,14 @@ bibentries = c( # nolint start title = "Simulating Survival Data Using the simsurv R Package", volume = "97", year = "2021" + ), + sonabend2024 = bibentry("misc", + archivePrefix = "arXiv", + arxivId = "2212.05260", + author = "Sonabend, Raphael and Zobolas, John and Kopper, Philipp and Burk, Lukas and Bender, Andreas", + month = "dec", + title = "Examining properness in the external validation of survival models with squared and logarithmic losses", + url = "https://arxiv.org/abs/2212.05260v2", + year = "2024" ) ) From 7dfcc610caa8fdcd7139b74de291a8685159c414 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 11:22:20 +0100 Subject: [PATCH 06/24] update measure docs --- R/MeasureSurvGraf.R | 6 ++-- R/MeasureSurvIntLogloss.R | 6 ++-- R/MeasureSurvLogloss.R | 5 ++- R/MeasureSurvSchmid.R | 13 ++------ man-roxygen/details_tmax.R | 14 ++++----- man-roxygen/details_trainG.R | 12 ++++--- man-roxygen/param_proper.R | 4 ++- man-roxygen/param_tmax.R | 4 +-- man-roxygen/properness.R | 4 +++ man/mlr_measures_surv.graf.Rd | 43 +++++++++++++++---------- man/mlr_measures_surv.intlogloss.Rd | 43 +++++++++++++++---------- man/mlr_measures_surv.logloss.Rd | 18 ++++++++--- man/mlr_measures_surv.schmid.Rd | 49 ++++++++++++++++------------- 13 files changed, 131 insertions(+), 90 deletions(-) diff --git a/R/MeasureSurvGraf.R b/R/MeasureSurvGraf.R index c062b495e..53d5b5c18 100644 --- a/R/MeasureSurvGraf.R +++ 
b/R/MeasureSurvGraf.R @@ -25,13 +25,13 @@ #' survival function \eqn{S_i(t)}, the *observation-wise* loss integrated across #' the time dimension up to the time cutoff \eqn{\tau^*}, is: #' -#' \deqn{L_{ISBS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +#' \deqn{L_{ISBS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} #' #' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. #' #' The **re-weighted ISBS** (RISBS) is: #' -#' \deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +#' \deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. 
#' @@ -48,7 +48,7 @@ #' @template details_tmax #' #' @references -#' `r format_bib("graf_1999")` +#' `r format_bib("graf_1999", "sonabend2024")` #' #' @family Probabilistic survival measures #' @family distr survival measures diff --git a/R/MeasureSurvIntLogloss.R b/R/MeasureSurvIntLogloss.R index 3768f2a9c..138626317 100644 --- a/R/MeasureSurvIntLogloss.R +++ b/R/MeasureSurvIntLogloss.R @@ -23,13 +23,13 @@ #' survival function \eqn{S_i(t)}, the *observation-wise* loss integrated across #' the time dimension up to the time cutoff \eqn{\tau^*}, is: #' -#' \deqn{L_{ISLL}(S_i, t_i, \delta_i) = -\text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +#' \deqn{L_{ISLL}(S_i, t_i, \delta_i) = - \int^{\tau^*}_0 \frac{log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} #' #' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. #' #' The **re-weighted ISLL** (RISLL) is: #' -#' \deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +#' \deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. 
#' @@ -46,7 +46,7 @@ #' @template details_tmax #' #' @references -#' `r format_bib("graf_1999")` +#' `r format_bib("graf_1999", "sonabend2024")` #' #' @family Probabilistic survival measures #' @family distr survival measures diff --git a/R/MeasureSurvLogloss.R b/R/MeasureSurvLogloss.R index ceb57dc68..a510c649e 100644 --- a/R/MeasureSurvLogloss.R +++ b/R/MeasureSurvLogloss.R @@ -10,7 +10,7 @@ #' Calculates the cross-entropy, or negative log-likelihood (NLL) or logarithmic (log), loss. #' @section Parameter details: #' - `IPCW` (`logical(1)`)\cr -#' If `TRUE` (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). +#' If `TRUE` (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). See Sonabend et al. (2024) for more details. #' #' @details #' The Log Loss, in the context of probabilistic predictions, is defined as the @@ -33,6 +33,9 @@ #' #' @template details_trainG #' +#' @references +#' `r format_bib("sonabend2024")` +#' #' @family Probabilistic survival measures #' @family distr survival measures #' @export diff --git a/R/MeasureSurvSchmid.R b/R/MeasureSurvSchmid.R index 22c574a91..2fd886def 100644 --- a/R/MeasureSurvSchmid.R +++ b/R/MeasureSurvSchmid.R @@ -22,13 +22,13 @@ #' survival function \eqn{S_i(t)}, the *observation-wise* loss integrated across #' the time dimension up to the time cutoff \eqn{\tau^*}, is: #' -#' \deqn{L_{ISS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +#' \deqn{L_{ISS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} #' #' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. 
#' #' The **re-weighted ISS** (RISS) is: #' -#' \deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +#' \deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. #' @@ -36,13 +36,6 @@ #' return the average of the time-integrated observation-wise scores: #' \deqn{\sum_{i=1}^N L(S_i, t_i, \delta_i) / N} #' -#' -#' \deqn{L_{ISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t^*))]} -#' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. -#' -#' The re-weighted ISS, RISS is given by -#' \deqn{L_{RISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t))]} -#' #' @template properness #' @templateVar improper_id ISS #' @templateVar proper_id RISS @@ -52,7 +45,7 @@ #' @template details_tmax #' #' @references -#' `r format_bib("schemper_2000", "schmid_2011")` +#' `r format_bib("schemper_2000", "schmid_2011", "sonabend2024")` #' #' @family Probabilistic survival measures #' @family distr survival measures diff --git a/man-roxygen/details_tmax.R b/man-roxygen/details_tmax.R index 0d8fc023f..5e9065e37 100644 --- a/man-roxygen/details_tmax.R +++ b/man-roxygen/details_tmax.R @@ -1,9 +1,9 @@ #' @section Time Cutoff Details: #' -#' If `t_max` or `p_max` is given, then \eqn{G(t)} will be fitted using **all observations** from the -#' train set (or test set) and only then the cutoff time will be applied. -#' This is to ensure that more data is used for fitting the censoring distribution via the -#' Kaplan-Meier. 
-#' Setting the `t_max` can help alleviate inflation of the score when `proper` is `TRUE`, -#' in cases where an observation is censored at the last observed time point. -#' This results in \eqn{G(t_{max}) = 0} and the use of `eps` instead (when `t_max` is `NULL`). +#' If `t_max` or `p_max` is given, then the predicted survival function \eqn{S(t)} is +#' filtered up to the time cutoff for all observations. +#' Also, when `proper = TRUE`, \eqn{G(t)} will be filtered up to the cutoff time as well. +#' This helps alleviate inflation of the score in cases where an observation is +#' censored at the last observed time point and no time cutoff is given, which results in +#' \eqn{G(t_{max}) = 0} and the use of `eps` instead. +#' diff --git a/man-roxygen/details_trainG.R b/man-roxygen/details_trainG.R index 2a131c29c..8c19e2a6c 100644 --- a/man-roxygen/details_trainG.R +++ b/man-roxygen/details_trainG.R @@ -1,6 +1,10 @@ #' @section Data used for Estimating Censoring Distribution: #' -#' If `task` and `train_set` are passed to `$score` then \eqn{G(t)} is fit on training data, -#' otherwise testing data. The first is likely to reduce any bias caused by calculating -#' parts of the measure on the test data it is evaluating. The training data is automatically -#' used in scoring resamplings. +#' If `task` and `train_set` are passed to `$score` then \eqn{G(t)} is fit using +#' **all observations** from the train set, otherwise the test set is used. +#' Using the train set is likely to reduce any bias caused by calculating parts of the +#' measure on the test data it is evaluating. +#' Also usually it means that more data is used for fitting the censoring +#' distribution \eqn{G(t)} via the Kaplan-Meier. +#' The training data is automatically used in scoring resamplings. 
+#' diff --git a/man-roxygen/param_proper.R b/man-roxygen/param_proper.R index 1e130026b..84205f74e 100644 --- a/man-roxygen/param_proper.R +++ b/man-roxygen/param_proper.R @@ -3,6 +3,8 @@ #' If `TRUE` then weights scores by the censoring distribution at #' the observed event time, which results in a strictly proper scoring #' rule if censoring and survival time distributions are independent -#' and a sufficiently large dataset is used. +#' and a sufficiently large dataset is used, see Sonabend et al. (2024). #' If `FALSE` then weights scores by the Graf method which is the #' more common usage but the loss is not proper. +#' See "Properness" section for more details. +#' diff --git a/man-roxygen/param_tmax.R b/man-roxygen/param_tmax.R index 268605756..7f8761de7 100644 --- a/man-roxygen/param_tmax.R +++ b/man-roxygen/param_tmax.R @@ -3,6 +3,6 @@ #' Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. #' Mutually exclusive with `p_max` or `times`. #' This will effectively remove test observations for which the observed time -#' (event or censoring) is strictly more than `t_max`. -#' It's recommended to set `t_max` to avoid division by `eps`, see Details. +#' (event or censoring) is strictly more than `t_max` in the case of `proper = TRUE`. +#' It's recommended to set `t_max` to avoid division by `eps`, see "Time Cutoff Details" section. #' If `t_max` is not specified, an `Inf` time horizon is assumed. diff --git a/man-roxygen/properness.R b/man-roxygen/properness.R index cb1c6d8f0..d4b149149 100644 --- a/man-roxygen/properness.R +++ b/man-roxygen/properness.R @@ -1,4 +1,5 @@ #' @section Properness: +#' `r lifecycle::badge("experimental")` #' #' <%=proper_id%> is strictly proper when the censoring distribution is independent #' of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. 
@@ -7,3 +8,6 @@ #' Results may be very different if many observations are censored at the last #' observed time due to division by \eqn{1/eps} in `proper = TRUE`. #' +#' See Sonabend et al. (2024) for more details. +#' `proper = TRUE` should be used with caution as it is still considered an experimental metric. +#' diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index 4d2924dfc..af2ba6f01 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -17,13 +17,13 @@ outcome \eqn{(t_i, \delta_i)} (time and censoring indicator) and predicted survival function \eqn{S_i(t)}, the \emph{observation-wise} loss integrated across the time dimension up to the time cutoff \eqn{\tau^*}, is: -\deqn{L_{ISBS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +\deqn{L_{ISBS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. The \strong{re-weighted ISBS} (RISBS) is: -\deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +\deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -88,8 +88,8 @@ If \code{integrated == FALSE} then a single time point at which to return the sc Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. Mutually exclusive with \code{p_max} or \code{times}.
This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max}. -It's recommended to set \code{t_max} to avoid division by \code{eps}, see Details. +(event or censoring) is strictly more than \code{t_max} in the case of \code{proper = TRUE}. +It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -126,9 +126,10 @@ Default is \code{FALSE} (returns the mean). If \code{TRUE} then weights scores by the censoring distribution at the observed event time, which results in a strictly proper scoring rule if censoring and survival time distributions are independent -and a sufficiently large dataset is used. +and a sufficiently large dataset is used, see Sonabend et al. (2024). If \code{FALSE} then weights scores by the Graf method which is the more common usage but the loss is not proper. +See "Properness" section for more details. } @@ -150,6 +151,7 @@ Default is \code{FALSE}. \section{Properness}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} RISBS is strictly proper when the censoring distribution is independent of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. @@ -157,6 +159,9 @@ ISBS is never proper. Use \code{proper = FALSE} for ISBS and \code{proper = TRUE} for RISBS. Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. + +See Sonabend et al. (2024) for more details. +The use of \code{proper = TRUE} should be used with caution and it still considered as an experimental metric. } \section{Time points used for evaluation}{ @@ -196,22 +201,24 @@ not used). 
\section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. } \section{Time Cutoff Details}{ -If \code{t_max} or \code{p_max} is given, then \eqn{G(t)} will be fitted using \strong{all observations} from the -train set (or test set) and only then the cutoff time will be applied. -This is to ensure that more data is used for fitting the censoring distribution via the -Kaplan-Meier. -Setting the \code{t_max} can help alleviate inflation of the score when \code{proper} is \code{TRUE}, -in cases where an observation is censored at the last observed time point. -This results in \eqn{G(t_{max}) = 0} and the use of \code{eps} instead (when \code{t_max} is \code{NULL}). +If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is +filtered up to the time cutoff for all observations. +Also, when \code{proper = TRUE}, \eqn{G(t)} will be filtered up to the cutoff time as well. +This helps alleviate inflation of the score in cases where an observation is +censored at the last observed time point and no time cutoff is given, which results in +\eqn{G(t_{max}) = 0} and the use of \code{eps} instead. 
} \references{ @@ -219,6 +226,10 @@ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). \dQuote{Assessment and comparison of prognostic classification schemes for survival data.} \emph{Statistics in Medicine}, \bold{18}(17-18), 2529--2545. \doi{10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5}. + +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. } \seealso{ Other survival measures: diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index 5a6d7af85..36fcb8b56 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -15,13 +15,13 @@ outcome \eqn{(t_i, \delta_i)} (time and censoring indicator) and predicted survival function \eqn{S_i(t)}, the \emph{observation-wise} loss integrated across the time dimension up to the time cutoff \eqn{\tau^*}, is: -\deqn{L_{ISLL}(S_i, t_i, \delta_i) = -\text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +\deqn{L_{ISLL}(S_i, t_i, \delta_i) = - \int^{\tau^*}_0 \frac{log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. 
The \strong{re-weighted ISLL} (RISLL) is: -\deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +\deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -86,8 +86,8 @@ If \code{integrated == FALSE} then a single time point at which to return the sc Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. Mutually exclusive with \code{p_max} or \code{times}. This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max}. -It's recommended to set \code{t_max} to avoid division by \code{eps}, see Details. +(event or censoring) is strictly more than \code{t_max} in the case of \code{proper = TRUE}. +It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -124,9 +124,10 @@ Default is \code{FALSE} (returns the mean). If \code{TRUE} then weights scores by the censoring distribution at the observed event time, which results in a strictly proper scoring rule if censoring and survival time distributions are independent -and a sufficiently large dataset is used. +and a sufficiently large dataset is used, see Sonabend et al. (2024). If \code{FALSE} then weights scores by the Graf method which is the more common usage but the loss is not proper. +See "Properness" section for more details. } @@ -148,6 +149,7 @@ Default is \code{FALSE}. 
\section{Properness}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} RISLL is strictly proper when the censoring distribution is independent of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. @@ -155,6 +157,9 @@ ISLL is never proper. Use \code{proper = FALSE} for ISLL and \code{proper = TRUE} for RISLL. Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. + +See Sonabend et al. (2024) for more details. +The use of \code{proper = TRUE} should be used with caution and it still considered as an experimental metric. } \section{Time points used for evaluation}{ @@ -194,22 +199,24 @@ not used). \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. } \section{Time Cutoff Details}{ -If \code{t_max} or \code{p_max} is given, then \eqn{G(t)} will be fitted using \strong{all observations} from the -train set (or test set) and only then the cutoff time will be applied. 
-This is to ensure that more data is used for fitting the censoring distribution via the -Kaplan-Meier. -Setting the \code{t_max} can help alleviate inflation of the score when \code{proper} is \code{TRUE}, -in cases where an observation is censored at the last observed time point. -This results in \eqn{G(t_{max}) = 0} and the use of \code{eps} instead (when \code{t_max} is \code{NULL}). +If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is +filtered up to the time cutoff for all observations. +Also, when \code{proper = TRUE}, \eqn{G(t)} will be filtered up to the cutoff time as well. +This helps alleviate inflation of the score in cases where an observation is +censored at the last observed time point and no time cutoff is given, which results in +\eqn{G(t_{max}) = 0} and the use of \code{eps} instead. } \references{ @@ -217,6 +224,10 @@ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). \dQuote{Assessment and comparison of prognostic classification schemes for survival data.} \emph{Statistics in Medicine}, \bold{18}(17-18), 2529--2545. \doi{10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5}. + +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. } \seealso{ Other survival measures: diff --git a/man/mlr_measures_surv.logloss.Rd b/man/mlr_measures_surv.logloss.Rd index 245364758..bc7241647 100644 --- a/man/mlr_measures_surv.logloss.Rd +++ b/man/mlr_measures_surv.logloss.Rd @@ -86,19 +86,27 @@ Default is \code{FALSE}. \itemize{ \item \code{IPCW} (\code{logical(1)})\cr -If \code{TRUE} (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). 
+If \code{TRUE} (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). See Sonabend et al. (2024) for more details. } } \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. } +\references{ +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. 
+} \seealso{ Other survival measures: \code{\link{mlr_measures_surv.calib_alpha}}, diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index d01cd26ea..ff64b953b 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -14,25 +14,19 @@ outcome \eqn{(t_i, \delta_i)} (time and censoring indicator) and predicted survival function \eqn{S_i(t)}, the \emph{observation-wise} loss integrated across the time dimension up to the time cutoff \eqn{\tau^*}, is: -\deqn{L_{ISS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +\deqn{L_{ISS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. The \strong{re-weighted ISS} (RISS) is: -\deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +\deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. To get a single score across all \eqn{N} observations of the test set, we return the average of the time-integrated observation-wise scores: \deqn{\sum_{i=1}^N L(S_i, t_i, \delta_i) / N} - -\deqn{L_{ISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t^*))]} -where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. 
- -The re-weighted ISS, RISS is given by -\deqn{L_{RISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t))]} } \section{Dictionary}{ @@ -91,8 +85,8 @@ If \code{integrated == FALSE} then a single time point at which to return the sc Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. Mutually exclusive with \code{p_max} or \code{times}. This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max}. -It's recommended to set \code{t_max} to avoid division by \code{eps}, see Details. +(event or censoring) is strictly more than \code{t_max} in the case of \code{proper = TRUE}. +It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -129,9 +123,10 @@ Default is \code{FALSE} (returns the mean). If \code{TRUE} then weights scores by the censoring distribution at the observed event time, which results in a strictly proper scoring rule if censoring and survival time distributions are independent -and a sufficiently large dataset is used. +and a sufficiently large dataset is used, see Sonabend et al. (2024). If \code{FALSE} then weights scores by the Graf method which is the more common usage but the loss is not proper. +See "Properness" section for more details. } @@ -153,6 +148,7 @@ Default is \code{FALSE}. \section{Properness}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} RISS is strictly proper when the censoring distribution is independent of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. @@ -160,6 +156,9 @@ ISS is never proper. Use \code{proper = FALSE} for ISS and \code{proper = TRUE} for RISS. 
Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. + +See Sonabend et al. (2024) for more details. +The use of \code{proper = TRUE} should be used with caution and it still considered as an experimental metric. } \section{Time points used for evaluation}{ @@ -199,22 +198,24 @@ not used). \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. } \section{Time Cutoff Details}{ -If \code{t_max} or \code{p_max} is given, then \eqn{G(t)} will be fitted using \strong{all observations} from the -train set (or test set) and only then the cutoff time will be applied. -This is to ensure that more data is used for fitting the censoring distribution via the -Kaplan-Meier. -Setting the \code{t_max} can help alleviate inflation of the score when \code{proper} is \code{TRUE}, -in cases where an observation is censored at the last observed time point. -This results in \eqn{G(t_{max}) = 0} and the use of \code{eps} instead (when \code{t_max} is \code{NULL}). 
+If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is +filtered up to the time cutoff for all observations. +Also, when \code{proper = TRUE}, \eqn{G(t)} will be filtered up to the cutoff time as well. +This helps alleviate inflation of the score in cases where an observation is +censored at the last observed time point and no time cutoff is given, which results in +\eqn{G(t_{max}) = 0} and the use of \code{eps} instead. } \references{ @@ -227,6 +228,10 @@ Schmid, Matthias, Hielscher, Thomas, Augustin, Thomas, Gefeller, Olaf (2011). \dQuote{A Robust Alternative to the Schemper-Henderson Estimator of Prediction Error.} \emph{Biometrics}, \bold{67}(2), 524--535. \doi{10.1111/j.1541-0420.2010.01459.x}. + +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. } \seealso{ Other survival measures: From e0bd34a1062914fce5a756603e2e256173113ed8 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 11:27:41 +0100 Subject: [PATCH 07/24] update news --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 6ba93133a..81ebafbb2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,7 @@ * Removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) * Bug fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) +* Bug fix: `msr("surv.ibrier", proper = FALSE)` was removing observations when `t_max` or `p_max` was used, which resulted in a bit lower scores # mlr3proba 0.7.0 From 30df58f94402869861d38a59e2685056c7997643 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 12:42:27 +0100 Subject: [PATCH 08/24] update news --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md 
b/NEWS.md index 81ebafbb2..8cb3f40e6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ * Removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) * Bug fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) -* Bug fix: `msr("surv.ibrier", proper = FALSE)` was removing observations when `t_max` or `p_max` was used, which resulted in a bit lower scores +* Bug fix: `msr("surv.ibrier", proper = FALSE)` was removing observations when `t_max` or `p_max` was used, which resulted in a bit lower scores. Updated scoring rules formulas and docs accordingly. # mlr3proba 0.7.0 From a2ed3caba98290a3095a3af2a446380719e5a700 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 13:08:06 +0100 Subject: [PATCH 09/24] better doc for d-calibration measure --- R/MeasureSurvDCalibration.R | 19 +++++++++++-------- man/mlr_measures_surv.dcalib.Rd | 19 +++++++++++-------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/R/MeasureSurvDCalibration.R b/R/MeasureSurvDCalibration.R index cc0ad9ae7..053afde95 100644 --- a/R/MeasureSurvDCalibration.R +++ b/R/MeasureSurvDCalibration.R @@ -3,6 +3,8 @@ #' @templateVar fullname MeasureSurvDCalibration #' #' @description +#' `r lifecycle::badge("experimental")` +#' #' This calibration method is defined by calculating the following statistic: #' \deqn{s = B/n \sum_i (P_i - n/B)^2} #' where \eqn{B} is number of 'buckets' (that equally divide \eqn{[0,1]} into intervals), @@ -12,8 +14,8 @@ #' falls within the corresponding interval. #' This statistic assumes that censoring time is independent of death time. #' -#' A model is well-calibrated if \eqn{s \sim Unif(B)}, tested with `chisq.test` -#' (\eqn{p > 0.05} if well-calibrated). +#' A model is well D-calibrated if \eqn{s \sim Unif(B)}, tested with `chisq.test` +#' (\eqn{p > 0.05} if well-calibrated, i.e. higher p-values are preferred). 
#' Model \eqn{i} is better calibrated than model \eqn{j} if \eqn{s(i) < s(j)}, #' meaning that *lower values* of this measure are preferred. #' @@ -23,7 +25,7 @@ #' is well-calibrated. If `chisq = FALSE` and `s` is the predicted value then you can manually #' compute the p.value with `pchisq(s, B - 1, lower.tail = FALSE)`. #' -#' NOTE: This measure is still experimental both theoretically and in implementation. Results +#' **NOTE**: This measure is still experimental both theoretically and in implementation. Results #' should therefore only be taken as an indicator of performance and not for #' conclusive judgements about model calibration. #' @@ -38,11 +40,12 @@ #' You can manually get the p-value by executing `pchisq(s, B - 1, lower.tail = FALSE)`. #' The null hypothesis is that the model is D-calibrated. #' - `truncate` (`double(1)`) \cr -#' This parameter controls the upper bound of the output statistic, -#' when `chisq` is `FALSE`. We use `truncate = Inf` by default but \eqn{10} may be sufficient -#' for most purposes, which corresponds to a p-value of 0.35 for the chisq.test using -#' \eqn{B = 10} buckets. Values \eqn{>10} translate to even lower p-values and thus -#' less calibrated models. If the number of buckets \eqn{B} changes, you probably will want to +#' This parameter controls the upper bound of the output statistic, when `chisq` is `FALSE`. +#' We use `truncate = Inf` by default but values between \eqn{10-16} are sufficient +#' for most purposes, which correspond to p-values of \eqn{0.35-0.06} for the `chisq.test` using +#' the default \eqn{B = 10} buckets. +#' Values \eqn{B > 10} translate to even lower p-values and thus less D-calibrated models. +#' If the number of buckets \eqn{B} changes, you probably will want to #' change the `truncate` value as well to correspond to the same p-value significance. #' Note that truncation may severely limit automated tuning with this measure. 
#' diff --git a/man/mlr_measures_surv.dcalib.Rd b/man/mlr_measures_surv.dcalib.Rd index e0a7779e0..d096a8c35 100644 --- a/man/mlr_measures_surv.dcalib.Rd +++ b/man/mlr_measures_surv.dcalib.Rd @@ -5,6 +5,8 @@ \alias{MeasureSurvDCalibration} \title{D-Calibration Survival Measure} \description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + This calibration method is defined by calculating the following statistic: \deqn{s = B/n \sum_i (P_i - n/B)^2} where \eqn{B} is number of 'buckets' (that equally divide \eqn{[0,1]} into intervals), @@ -14,8 +16,8 @@ of observations in the \eqn{i}th interval. An observation is assigned to the falls within the corresponding interval. This statistic assumes that censoring time is independent of death time. -A model is well-calibrated if \eqn{s \sim Unif(B)}, tested with \code{chisq.test} -(\eqn{p > 0.05} if well-calibrated). +A model is well D-calibrated if \eqn{s \sim Unif(B)}, tested with \code{chisq.test} +(\eqn{p > 0.05} if well-calibrated, i.e. higher p-values are preferred). Model \eqn{i} is better calibrated than model \eqn{j} if \eqn{s(i) < s(j)}, meaning that \emph{lower values} of this measure are preferred. } @@ -25,7 +27,7 @@ The former is useful for model comparison whereas the latter is useful for deter is well-calibrated. If \code{chisq = FALSE} and \code{s} is the predicted value then you can manually compute the p.value with \code{pchisq(s, B - 1, lower.tail = FALSE)}. -NOTE: This measure is still experimental both theoretically and in implementation. Results +\strong{NOTE}: This measure is still experimental both theoretically and in implementation. Results should therefore only be taken as an indicator of performance and not for conclusive judgements about model calibration. } @@ -72,11 +74,12 @@ Default is \code{FALSE} and returns the statistic \code{s}. 
You can manually get the p-value by executing \code{pchisq(s, B - 1, lower.tail = FALSE)}. The null hypothesis is that the model is D-calibrated. \item \code{truncate} (\code{double(1)}) \cr -This parameter controls the upper bound of the output statistic, -when \code{chisq} is \code{FALSE}. We use \code{truncate = Inf} by default but \eqn{10} may be sufficient -for most purposes, which corresponds to a p-value of 0.35 for the chisq.test using -\eqn{B = 10} buckets. Values \eqn{>10} translate to even lower p-values and thus -less calibrated models. If the number of buckets \eqn{B} changes, you probably will want to +This parameter controls the upper bound of the output statistic, when \code{chisq} is \code{FALSE}. +We use \code{truncate = Inf} by default but values between \eqn{10-16} are sufficient +for most purposes, which correspond to p-values of \eqn{0.35-0.06} for the \code{chisq.test} using +the default \eqn{B = 10} buckets. +Values \eqn{B > 10} translate to even lower p-values and thus less D-calibrated models. +If the number of buckets \eqn{B} changes, you probably will want to change the \code{truncate} value as well to correspond to the same p-value significance. Note that truncation may severely limit automated tuning with this measure. 
} From 06b1d891bf81f693313a8a25fac81f962bb27290 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 11:56:20 +0100 Subject: [PATCH 10/24] add Kvamme paper --- R/bibentries.R | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/R/bibentries.R b/R/bibentries.R index 779e04438..1d670cffa 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -750,5 +750,16 @@ bibentries = c( # nolint start title = "Examining properness in the external validation of survival models with squared and logarithmic losses", url = "https://arxiv.org/abs/2212.05260v2", year = "2024" + ), + kvamme2023 = bibentry("article", + author = "Kvamme, Håvard and Borgan, Ørnulf", + issn = "1533-7928", + journal = "Journal of Machine Learning Research", + number = "2", + pages = "1--26", + title = "The Brier Score under Administrative Censoring: Problems and a Solution", + url = "http://jmlr.org/papers/v24/19-1030.html", + volume = "24", + year = "2023" ) ) From bd196c3089510f6749bedadc292b59a8f9e6ea32 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 12:05:05 +0100 Subject: [PATCH 11/24] fix: always remove obs when t_max is applied + do not filter G(t) --- R/integrated_scores.R | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/R/integrated_scores.R b/R/integrated_scores.R index 018b372cd..92b861ab1 100644 --- a/R/integrated_scores.R +++ b/R/integrated_scores.R @@ -90,8 +90,8 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, rownames(cdf) = unique_times # times x obs } - # apply `t_max` cutoff to remove observations for RISBS - if (tmax_apply && proper) { + # apply `t_max` cutoff to remove observations + if (tmax_apply) { true_times = test_times[test_times <= t_max] true_status = test_status[test_times <= t_max] cdf = cdf[, test_times <= t_max, drop = FALSE] @@ -128,11 +128,6 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, # G(t): KM estimate of the censoring distribution cens = matrix(c(cens$time, 
cens$surv), ncol = 2L) - # filter G(t) time points based on `t_max` cutoff for RISBS - if (tmax_apply && proper) { - cens = cens[cens[, 1L] <= t_max, , drop = FALSE] - } - score = .c_weight_survival_score(score, true_truth, unique_times, cens, proper, eps) colnames(score) = unique_times From a868c54fb7d3be26a7e8ec9b3957aa4290d8ff27 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 12:05:47 +0100 Subject: [PATCH 12/24] remove indicator function from score formulas + add citation --- R/MeasureSurvGraf.R | 4 ++-- R/MeasureSurvIntLogloss.R | 4 ++-- R/MeasureSurvSchmid.R | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/MeasureSurvGraf.R b/R/MeasureSurvGraf.R index 53d5b5c18..d44d72747 100644 --- a/R/MeasureSurvGraf.R +++ b/R/MeasureSurvGraf.R @@ -31,7 +31,7 @@ #' #' The **re-weighted ISBS** (RISBS) is: #' -#' \deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} +#' \deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. 
#' @@ -48,7 +48,7 @@ #' @template details_tmax #' #' @references -#' `r format_bib("graf_1999", "sonabend2024")` +#' `r format_bib("graf_1999", "sonabend2024", "kvamme2023")` #' #' @family Probabilistic survival measures #' @family distr survival measures diff --git a/R/MeasureSurvIntLogloss.R b/R/MeasureSurvIntLogloss.R index 138626317..66d74afd5 100644 --- a/R/MeasureSurvIntLogloss.R +++ b/R/MeasureSurvIntLogloss.R @@ -29,7 +29,7 @@ #' #' The **re-weighted ISLL** (RISLL) is: #' -#' \deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} +#' \deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. #' @@ -46,7 +46,7 @@ #' @template details_tmax #' #' @references -#' `r format_bib("graf_1999", "sonabend2024")` +#' `r format_bib("graf_1999", "sonabend2024", "kvamme2023")` #' #' @family Probabilistic survival measures #' @family distr survival measures diff --git a/R/MeasureSurvSchmid.R b/R/MeasureSurvSchmid.R index 2fd886def..119e3fef9 100644 --- a/R/MeasureSurvSchmid.R +++ b/R/MeasureSurvSchmid.R @@ -28,7 +28,7 @@ #' #' The **re-weighted ISS** (RISS) is: #' -#' \deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} +#' \deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. 
#' @@ -45,7 +45,7 @@ #' @template details_tmax #' #' @references -#' `r format_bib("schemper_2000", "schmid_2011", "sonabend2024")` +#' `r format_bib("schemper_2000", "schmid_2011", "sonabend2024", "kvamme2023")` #' #' @family Probabilistic survival measures #' @family distr survival measures From 1bd54df31e3327ca9378222ecdaca2a8394f59b2 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 12:06:24 +0100 Subject: [PATCH 13/24] doc update --- man-roxygen/details_tmax.R | 10 +++++++--- man-roxygen/param_tmax.R | 4 ++-- man-roxygen/properness.R | 2 +- man/mlr_measures_surv.graf.Rd | 23 ++++++++++++++++------- man/mlr_measures_surv.intlogloss.Rd | 23 ++++++++++++++++------- man/mlr_measures_surv.schmid.Rd | 23 ++++++++++++++++------- 6 files changed, 58 insertions(+), 27 deletions(-) diff --git a/man-roxygen/details_tmax.R b/man-roxygen/details_tmax.R index 5e9065e37..12504bc80 100644 --- a/man-roxygen/details_tmax.R +++ b/man-roxygen/details_tmax.R @@ -2,8 +2,12 @@ #' #' If `t_max` or `p_max` is given, then the predicted survival function \eqn{S(t)} is #' filtered up to the time cutoff for all observations. -#' Also, when `proper = TRUE`, \eqn{G(t)} will be filtered up to the cutoff time as well. -#' This helps alleviate inflation of the score in cases where an observation is +#' Also, **observations with observed times** \eqn{t > t_{max}} **are removed**. +#' This is a data processing step to alleviate the problems that arise when using IPCW +#' in cases of administrative censoring, see Kvamme et al. (2023). +#' It also helps alleviate **inflation of the score** in cases where an observation is #' censored at the last observed time point and no time cutoff is given, which results in -#' \eqn{G(t_{max}) = 0} and the use of `eps` instead. +#' \eqn{G(t) = 0} and the use of `eps` instead. +#' The proper version of this score is more affected by this issue, see Sonabend +#' et al. (2024) for more details. 
#' diff --git a/man-roxygen/param_tmax.R b/man-roxygen/param_tmax.R index 7f8761de7..c4a1c091c 100644 --- a/man-roxygen/param_tmax.R +++ b/man-roxygen/param_tmax.R @@ -2,7 +2,7 @@ #' - `t_max` (`numeric(1)`)\cr #' Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. #' Mutually exclusive with `p_max` or `times`. -#' This will effectively remove test observations for which the observed time -#' (event or censoring) is strictly more than `t_max` in the case of `proper = TRUE`. +#' This will effectively **remove test observations** for which the observed time +#' (event or censoring) is strictly more than `t_max`. #' It's recommended to set `t_max` to avoid division by `eps`, see "Time Cutoff Details" section. #' If `t_max` is not specified, an `Inf` time horizon is assumed. diff --git a/man-roxygen/properness.R b/man-roxygen/properness.R index d4b149149..b0c956818 100644 --- a/man-roxygen/properness.R +++ b/man-roxygen/properness.R @@ -9,5 +9,5 @@ #' observed time due to division by \eqn{1/eps} in `proper = TRUE`. #' #' See Sonabend et al. (2024) for more details. -#' The use of `proper = TRUE` should be used with caution and it still considered as an experimental metric. +#' The use of `proper = TRUE` is considered experimental and should be used with caution. #' diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index af2ba6f01..658adda48 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -23,7 +23,7 @@ where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. 
The \strong{re-weighted ISBS} (RISBS) is: -\deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} +\deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -87,8 +87,8 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \item \code{t_max} (\code{numeric(1)})\cr Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. Mutually exclusive with \code{p_max} or \code{times}. -This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max} in the case of \code{proper = TRUE}. +This will effectively \strong{remove test observations} for which the observed time +(event or censoring) is strictly more than \code{t_max}. It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -161,7 +161,7 @@ Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. See Sonabend et al. (2024) for more details. -The use of \code{proper = TRUE} should be used with caution and it still considered as an experimental metric. +The use of \code{proper = TRUE} is considered experimental and should be used with caution. } \section{Time points used for evaluation}{ @@ -215,10 +215,14 @@ The training data is automatically used in scoring resamplings. If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is filtered up to the time cutoff for all observations. 
-Also, when \code{proper = TRUE}, \eqn{G(t)} will be filtered up to the cutoff time as well. -This helps alleviate inflation of the score in cases where an observation is +Also, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This is a data processing step to alleviate the problems that arise when using IPCW +in cases of administrative censoring, see Kvamme et al. (2023). +It also helps alleviate \strong{inflation of the score} in cases where an observation is censored at the last observed time point and no time cutoff is given, which results in -\eqn{G(t_{max}) = 0} and the use of \code{eps} instead. +\eqn{G(t) = 0} and the use of \code{eps} instead. +The proper version of this score is more affected by this issue, see Sonabend +et al. (2024) for more details. } \references{ @@ -230,6 +234,11 @@ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). \dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} \url{https://arxiv.org/abs/2212.05260v2}. + +Kvamme, Håvard, Borgan, Ørnulf (2023). +\dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} +\emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. +ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. } \seealso{ Other survival measures: diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index 36fcb8b56..d355521b0 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -21,7 +21,7 @@ where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. 
The \strong{re-weighted ISLL} (RISLL) is: -\deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} +\deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -85,8 +85,8 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \item \code{t_max} (\code{numeric(1)})\cr Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. Mutually exclusive with \code{p_max} or \code{times}. -This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max} in the case of \code{proper = TRUE}. +This will effectively \strong{remove test observations} for which the observed time +(event or censoring) is strictly more than \code{t_max}. It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -159,7 +159,7 @@ Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. See Sonabend et al. (2024) for more details. -The use of \code{proper = TRUE} should be used with caution and it still considered as an experimental metric. +The use of \code{proper = TRUE} is considered experimental and should be used with caution. } \section{Time points used for evaluation}{ @@ -213,10 +213,14 @@ The training data is automatically used in scoring resamplings. If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is filtered up to the time cutoff for all observations. 
-Also, when \code{proper = TRUE}, \eqn{G(t)} will be filtered up to the cutoff time as well. -This helps alleviate inflation of the score in cases where an observation is +Also, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This is a data processing step to alleviate the problems that arise when using IPCW +in cases of administrative censoring, see Kvamme et al. (2023). +It also helps alleviate \strong{inflation of the score} in cases where an observation is censored at the last observed time point and no time cutoff is given, which results in -\eqn{G(t_{max}) = 0} and the use of \code{eps} instead. +\eqn{G(t) = 0} and the use of \code{eps} instead. +The proper version of this score is more affected by this issue, see Sonabend +et al. (2024) for more details. } \references{ @@ -228,6 +232,11 @@ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). \dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} \url{https://arxiv.org/abs/2212.05260v2}. + +Kvamme, Håvard, Borgan, Ørnulf (2023). +\dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} +\emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. +ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. } \seealso{ Other survival measures: diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index ff64b953b..4785d5605 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -20,7 +20,7 @@ where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. 
The \strong{re-weighted ISS} (RISS) is: -\deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} +\deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -84,8 +84,8 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \item \code{t_max} (\code{numeric(1)})\cr Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. Mutually exclusive with \code{p_max} or \code{times}. -This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max} in the case of \code{proper = TRUE}. +This will effectively \strong{remove test observations} for which the observed time +(event or censoring) is strictly more than \code{t_max}. It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -158,7 +158,7 @@ Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. See Sonabend et al. (2024) for more details. -The use of \code{proper = TRUE} should be used with caution and it still considered as an experimental metric. +The use of \code{proper = TRUE} is considered experimental and should be used with caution. } \section{Time points used for evaluation}{ @@ -212,10 +212,14 @@ The training data is automatically used in scoring resamplings. If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is filtered up to the time cutoff for all observations. 
-Also, when \code{proper = TRUE}, \eqn{G(t)} will be filtered up to the cutoff time as well. -This helps alleviate inflation of the score in cases where an observation is +Also, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This is a data processing step to alleviate the problems that arise when using IPCW +in cases of administrative censoring, see Kvamme et al. (2023). +It also helps alleviate \strong{inflation of the score} in cases where an observation is censored at the last observed time point and no time cutoff is given, which results in -\eqn{G(t_{max}) = 0} and the use of \code{eps} instead. +\eqn{G(t) = 0} and the use of \code{eps} instead. +The proper version of this score is more affected by this issue, see Sonabend +et al. (2024) for more details. } \references{ @@ -232,6 +236,11 @@ Schmid, Matthias, Hielscher, Thomas, Augustin, Thomas, Gefeller, Olaf (2011). Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). \dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} \url{https://arxiv.org/abs/2212.05260v2}. + +Kvamme, Håvard, Borgan, Ørnulf (2023). +\dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} +\emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. +ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. 
} \seealso{ Other survival measures: From d76d0929e53263448b209bb1bc8e8e2a53d9a147 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 12:08:04 +0100 Subject: [PATCH 14/24] revert test --- tests/testthat/test_mlr_measures.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test_mlr_measures.R b/tests/testthat/test_mlr_measures.R index e04abf5eb..c40630bf5 100644 --- a/tests/testthat/test_mlr_measures.R +++ b/tests/testthat/test_mlr_measures.R @@ -170,8 +170,8 @@ test_that("graf: t_max, p_max, times", { # different time points considered expect_true(m0 != m1) - # same time points are used (`t_max` does NOT remove observations) - expect_equal(m1, m2) + # same time points are used, but `t_max` also removes observations + expect_true(m1 != m2) # different `t_max` => different time points used expect_true(m2 != m3) # different `t_max` but after the max evaluation time point, so result stays the same From 31928405c5d79256568b8aebe81b919e99a71425 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 12:13:51 +0100 Subject: [PATCH 15/24] update news --- NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 8cb3f40e6..5370a0a74 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,8 @@ * Removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) * Bug fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) -* Bug fix: `msr("surv.ibrier", proper = FALSE)` was removing observations when `t_max` or `p_max` was used, which resulted in a bit lower scores. Updated scoring rules formulas and docs accordingly. +* fix: G(t) is not filtered when `t_max|p_max` is specified in scoring rules (didn't influence evalution at all) +* docs: Clarified the use and impact of using `t_max` in scoring rules (removing observations as a processing step to alleviate IPCW issues). 
# mlr3proba 0.7.0 From c60f7c73f3199ca7375c4b80bf87fab2d2421dc0 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 12:26:08 +0100 Subject: [PATCH 16/24] update norsk names --- R/bibentries.R | 2 +- man/mlr_measures_surv.graf.Rd | 2 +- man/mlr_measures_surv.intlogloss.Rd | 2 +- man/mlr_measures_surv.schmid.Rd | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/bibentries.R b/R/bibentries.R index 1d670cffa..f747e6ce1 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -752,7 +752,7 @@ bibentries = c( # nolint start year = "2024" ), kvamme2023 = bibentry("article", - author = "Kvamme, Håvard and Borgan, Ørnulf", + author = "Kvamme, Havard and Borgan, Ornulf", issn = "1533-7928", journal = "Journal of Machine Learning Research", number = "2", diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index 658adda48..fc1dce752 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -235,7 +235,7 @@ Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas \dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} \url{https://arxiv.org/abs/2212.05260v2}. -Kvamme, Håvard, Borgan, Ørnulf (2023). +Kvamme, Havard, Borgan, Ornulf (2023). \dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} \emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index d355521b0..ca46a278f 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -233,7 +233,7 @@ Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas \dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} \url{https://arxiv.org/abs/2212.05260v2}. 
-Kvamme, Håvard, Borgan, Ørnulf (2023). +Kvamme, Havard, Borgan, Ornulf (2023). \dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} \emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index 4785d5605..92edb33ec 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -237,7 +237,7 @@ Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas \dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} \url{https://arxiv.org/abs/2212.05260v2}. -Kvamme, Håvard, Borgan, Ørnulf (2023). +Kvamme, Havard, Borgan, Ornulf (2023). \dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} \emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. 
From f1afdcb79fe46c44135bd7895e70368370e2041d Mon Sep 17 00:00:00 2001 From: john Date: Tue, 17 Dec 2024 20:21:15 +0100 Subject: [PATCH 17/24] add `remove_obs` parameter for scoring rules --- R/MeasureSurvGraf.R | 8 +++++--- R/MeasureSurvIntLogloss.R | 8 +++++--- R/MeasureSurvSchmid.R | 8 +++++--- R/integrated_scores.R | 7 ++++--- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/R/MeasureSurvGraf.R b/R/MeasureSurvGraf.R index d44d72747..3bf36a881 100644 --- a/R/MeasureSurvGraf.R +++ b/R/MeasureSurvGraf.R @@ -11,6 +11,7 @@ #' @templateVar eps 1e-3 #' @template param_eps #' @template param_erv +#' @template param_remove_obs #' #' @aliases MeasureSurvBrier mlr_measures_surv.brier #' @@ -73,11 +74,12 @@ MeasureSurvGraf = R6Class("MeasureSurvGraf", se = p_lgl(default = FALSE), proper = p_lgl(default = FALSE), eps = p_dbl(0, 1, default = 1e-3), - ERV = p_lgl(default = FALSE) + ERV = p_lgl(default = FALSE), + remove_obs = p_lgl(default = FALSE) ) ps$set_values( integrated = TRUE, method = 2L, se = FALSE, - proper = FALSE, eps = 1e-3, ERV = ERV + proper = FALSE, eps = 1e-3, ERV = ERV, remove_obs = FALSE ) range = if (ERV) c(-Inf, 1) else c(0, Inf) @@ -132,7 +134,7 @@ MeasureSurvGraf = R6Class("MeasureSurvGraf", truth = prediction$truth, distribution = prediction$data$distr, times = times, t_max = ps$t_max, p_max = ps$p_max, proper = ps$proper, train = train, - eps = ps$eps + eps = ps$eps, remove_obs = ps$remove_obs ) if (ps$se) { diff --git a/R/MeasureSurvIntLogloss.R b/R/MeasureSurvIntLogloss.R index 66d74afd5..7053e770b 100644 --- a/R/MeasureSurvIntLogloss.R +++ b/R/MeasureSurvIntLogloss.R @@ -11,6 +11,7 @@ #' @templateVar eps 1e-3 #' @template param_eps #' @template param_erv +#' @template param_remove_obs #' #' @description #' Calculates the **Integrated Survival Log-Likelihood** (ISLL) or Integrated @@ -71,11 +72,12 @@ MeasureSurvIntLogloss = R6Class("MeasureSurvIntLogloss", se = p_lgl(default = FALSE), proper = p_lgl(default = FALSE), eps = p_dbl(0, 1, 
default = 1e-3), - ERV = p_lgl(default = FALSE) + ERV = p_lgl(default = FALSE), + remove_obs = p_lgl(default = FALSE) ) ps$set_values( integrated = TRUE, method = 2L, se = FALSE, - proper = FALSE, eps = 1e-3, ERV = ERV + proper = FALSE, eps = 1e-3, ERV = ERV, remove_obs = FALSE ) range = if (ERV) c(-Inf, 1) else c(0, Inf) @@ -130,7 +132,7 @@ MeasureSurvIntLogloss = R6Class("MeasureSurvIntLogloss", truth = prediction$truth, distribution = prediction$data$distr, times = times, t_max = ps$t_max, p_max = ps$p_max, proper = ps$proper, train = train, - eps = ps$eps + eps = ps$eps, remove_obs = ps$remove_obs ) if (ps$se) { diff --git a/R/MeasureSurvSchmid.R b/R/MeasureSurvSchmid.R index 119e3fef9..0b75e37a4 100644 --- a/R/MeasureSurvSchmid.R +++ b/R/MeasureSurvSchmid.R @@ -11,6 +11,7 @@ #' @templateVar eps 1e-3 #' @template param_eps #' @template param_erv +#' @template param_remove_obs #' #' @description #' Calculates the **Integrated Schmid Score** (ISS), aka integrated absolute loss. @@ -70,11 +71,12 @@ MeasureSurvSchmid = R6Class("MeasureSurvSchmid", se = p_lgl(default = FALSE), proper = p_lgl(default = FALSE), eps = p_dbl(0, 1, default = 1e-3), - ERV = p_lgl(default = FALSE) + ERV = p_lgl(default = FALSE), + remove_obs = p_lgl(default = FALSE) ) ps$set_values( integrated = TRUE, method = 2L, se = FALSE, - proper = FALSE, eps = 1e-3, ERV = ERV + proper = FALSE, eps = 1e-3, ERV = ERV, remove_obs = FALSE ) range = if (ERV) c(-Inf, 1) else c(0, Inf) @@ -128,7 +130,7 @@ MeasureSurvSchmid = R6Class("MeasureSurvSchmid", truth = prediction$truth, distribution = prediction$data$distr, times = times, t_max = ps$t_max, p_max = ps$p_max, proper = ps$proper, train = train, - eps = ps$eps + eps = ps$eps, remove_obs = ps$remove_obs ) if (ps$se) { diff --git a/R/integrated_scores.R b/R/integrated_scores.R index 92b861ab1..2d7768e32 100644 --- a/R/integrated_scores.R +++ b/R/integrated_scores.R @@ -14,7 +14,7 @@ score_graf_schmid = function(true_times, unique_times, cdf, power = 2) { 
# - `t_max` > 0 # - `p_max` in [0,1] weighted_survival_score = function(loss, truth, distribution, times = NULL, - t_max = NULL, p_max = NULL, proper, train = NULL, eps, ...) { + t_max = NULL, p_max = NULL, proper, train = NULL, eps, remove_obs = FALSE) { assert_surv(truth) # test set's (times, status) test_times = truth[, "time"] @@ -90,8 +90,8 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, rownames(cdf) = unique_times # times x obs } - # apply `t_max` cutoff to remove observations - if (tmax_apply) { + # apply `t_max` cutoff to remove observations as a preprocessing step to alleviate inflation + if (tmax_apply && remove_obs) { true_times = test_times[test_times <= t_max] true_status = test_status[test_times <= t_max] cdf = cdf[, test_times <= t_max, drop = FALSE] @@ -118,6 +118,7 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, # use the `truth` (time, status) information from the train or test set if (is.null(train)) { + # no filtering of observations from test data: use ALL cens = survival::survfit(Surv(test_times, 1 - test_status) ~ 1) } else { # no filtering of observations from train data: use ALL From 1bfb836df9ae7fc456d343c0ccb3475ae6f7be11 Mon Sep 17 00:00:00 2001 From: john Date: Tue, 17 Dec 2024 20:50:02 +0100 Subject: [PATCH 18/24] update docs --- man-roxygen/details_tmax.R | 25 ++++++++++++------ man-roxygen/param_remove_obs.R | 6 +++++ man-roxygen/param_tmax.R | 6 ++--- man/mlr_measures_surv.graf.Rd | 39 +++++++++++++++++++++-------- man/mlr_measures_surv.intlogloss.Rd | 39 +++++++++++++++++++++-------- man/mlr_measures_surv.schmid.Rd | 39 +++++++++++++++++++++-------- 6 files changed, 110 insertions(+), 44 deletions(-) create mode 100644 man-roxygen/param_remove_obs.R diff --git a/man-roxygen/details_tmax.R b/man-roxygen/details_tmax.R index 12504bc80..0086500a8 100644 --- a/man-roxygen/details_tmax.R +++ b/man-roxygen/details_tmax.R @@ -1,13 +1,22 @@ #' @section Time Cutoff Details: +#' 
`r lifecycle::badge("experimental")` #' #' If `t_max` or `p_max` is given, then the predicted survival function \eqn{S(t)} is -#' filtered up to the time cutoff for all observations. -#' Also, **observations with observed times** \eqn{t > t_{max}} **are removed**. -#' This is a data processing step to alleviate the problems that arise when using IPCW +#' truncated at the time cutoff for all observations. +#' +#' Also, if `remove_obs = TRUE`, **observations with observed times** \eqn{t > t_{max}} **are removed**. +#' This data preprocessing step mitigates issues that arise when using IPCW #' in cases of administrative censoring, see Kvamme et al. (2023). -#' It also helps alleviate **inflation of the score** in cases where an observation is -#' censored at the last observed time point and no time cutoff is given, which results in -#' \eqn{G(t) = 0} and the use of `eps` instead. -#' The proper version of this score is more affected by this issue, see Sonabend -#' et al. (2024) for more details. +#' Practically, this step, along with setting a time cutoff `t_max`, helps mitigate +#' the **inflation of the score** observed when an observation is censored at the +#' final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +#' small constant `eps` instead. +#' This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +#' for more details. +#' Note that the `t_max` and `remove_obs` parameters do not affect the estimation +#' of the censoring distribution, i.e. **always all the observations are used for estimating** \eqn{G(t)}. +#' +#' If `remove_obs = FALSE`, inflated scores may occur. While this aligns more closely +#' with the definitions presented in the original papers, it can lead to misleading +#' evaluation and poor optimization outcomes when using this score for model tuning. 
#' diff --git a/man-roxygen/param_remove_obs.R b/man-roxygen/param_remove_obs.R new file mode 100644 index 000000000..bd7c7d5ca --- /dev/null +++ b/man-roxygen/param_remove_obs.R @@ -0,0 +1,6 @@ +#' @section Parameter details: +#' - `remove_obs` (`logical(1)`)\cr +#' Only effective when `t_max` or `p_max` is provided. Default is `FALSE`. +#' If `TRUE`, then we **remove test observations** for which the observed time (event or censoring) is strictly larger than `t_max`. +#' See "Time Cutoff Details" section for more details. +#' diff --git a/man-roxygen/param_tmax.R b/man-roxygen/param_tmax.R index c4a1c091c..a706e2442 100644 --- a/man-roxygen/param_tmax.R +++ b/man-roxygen/param_tmax.R @@ -1,8 +1,8 @@ #' @section Parameter details: #' - `t_max` (`numeric(1)`)\cr -#' Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. +#' Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +#' (truncate \eqn{S(t)}). #' Mutually exclusive with `p_max` or `times`. -#' This will effectively **remove test observations** for which the observed time -#' (event or censoring) is strictly more than `t_max`. #' It's recommended to set `t_max` to avoid division by `eps`, see "Time Cutoff Details" section. #' If `t_max` is not specified, an `Inf` time horizon is assumed. +#' diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index fc1dce752..7f070941f 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -54,6 +54,7 @@ msr("surv.graf") proper \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr eps \tab numeric \tab 0.001 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr ERV \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr + remove_obs \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr } } @@ -85,10 +86,9 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \itemize{ \item \code{t_max} (\code{numeric(1)})\cr -Cutoff time \eqn{\tau^*} (i.e. 
time horizon) to evaluate the measure up to. +Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +(truncate \eqn{S(t)}). Mutually exclusive with \code{p_max} or \code{times}. -This will effectively \strong{remove test observations} for which the observed time -(event or censoring) is strictly more than \code{t_max}. It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -147,6 +147,14 @@ If \code{TRUE} then the Explained Residual Variation method is applied, which means the score is standardized against a Kaplan-Meier baseline. Default is \code{FALSE}. } + + +\itemize{ +\item \code{remove_obs} (\code{logical(1)})\cr +Only effective when \code{t_max} or \code{p_max} is provided. Default is \code{FALSE}. +If \code{TRUE}, then we \strong{remove test observations} for which the observed time (event or censoring) is strictly larger than \code{t_max}. +See "Time Cutoff Details" section for more details. +} } \section{Properness}{ @@ -212,17 +220,26 @@ The training data is automatically used in scoring resamplings. \section{Time Cutoff Details}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is -filtered up to the time cutoff for all observations. -Also, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. -This is a data processing step to alleviate the problems that arise when using IPCW +truncated at the time cutoff for all observations. + +Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. 
+This data preprocessing step mitigates issues that arise when using IPCW in cases of administrative censoring, see Kvamme et al. (2023). -It also helps alleviate \strong{inflation of the score} in cases where an observation is -censored at the last observed time point and no time cutoff is given, which results in -\eqn{G(t) = 0} and the use of \code{eps} instead. -The proper version of this score is more affected by this issue, see Sonabend -et al. (2024) for more details. +Practically, this step, along with setting a time cutoff \code{t_max}, helps mitigate +the \strong{inflation of the score} observed when an observation is censored at the +final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +small constant \code{eps} instead. +This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +for more details. +Note that the \code{t_max} and \code{remove_obs} parameters do not affect the estimation +of the censoring distribution, i.e. \strong{always all the observations are used for estimating} \eqn{G(t)}. + +If \code{remove_obs = FALSE}, inflated scores may occur. While this aligns more closely +with the definitions presented in the original papers, it can lead to misleading +evaluation and poor optimization outcomes when using this score for model tuning. 
} \references{ diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index ca46a278f..425d32869 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -52,6 +52,7 @@ msr("surv.intlogloss") proper \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr eps \tab numeric \tab 0.001 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr ERV \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr + remove_obs \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr } } @@ -83,10 +84,9 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \itemize{ \item \code{t_max} (\code{numeric(1)})\cr -Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. +Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +(truncate \eqn{S(t)}). Mutually exclusive with \code{p_max} or \code{times}. -This will effectively \strong{remove test observations} for which the observed time -(event or censoring) is strictly more than \code{t_max}. It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -145,6 +145,14 @@ If \code{TRUE} then the Explained Residual Variation method is applied, which means the score is standardized against a Kaplan-Meier baseline. Default is \code{FALSE}. } + + +\itemize{ +\item \code{remove_obs} (\code{logical(1)})\cr +Only effective when \code{t_max} or \code{p_max} is provided. Default is \code{FALSE}. +If \code{TRUE}, then we \strong{remove test observations} for which the observed time (event or censoring) is strictly larger than \code{t_max}. +See "Time Cutoff Details" section for more details. +} } \section{Properness}{ @@ -210,17 +218,26 @@ The training data is automatically used in scoring resamplings. 
\section{Time Cutoff Details}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is -filtered up to the time cutoff for all observations. -Also, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. -This is a data processing step to alleviate the problems that arise when using IPCW +truncated at the time cutoff for all observations. + +Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This data preprocessing step mitigates issues that arise when using IPCW in cases of administrative censoring, see Kvamme et al. (2023). -It also helps alleviate \strong{inflation of the score} in cases where an observation is -censored at the last observed time point and no time cutoff is given, which results in -\eqn{G(t) = 0} and the use of \code{eps} instead. -The proper version of this score is more affected by this issue, see Sonabend -et al. (2024) for more details. +Practically, this step, along with setting a time cutoff \code{t_max}, helps mitigate +the \strong{inflation of the score} observed when an observation is censored at the +final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +small constant \code{eps} instead. +This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +for more details. +Note that the \code{t_max} and \code{remove_obs} parameters do not affect the estimation +of the censoring distribution, i.e. \strong{always all the observations are used for estimating} \eqn{G(t)}. + +If \code{remove_obs = FALSE}, inflated scores may occur. 
While this aligns more closely +with the definitions presented in the original papers, it can lead to misleading +evaluation and poor optimization outcomes when using this score for model tuning. } \references{ diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index 92edb33ec..ca25fe310 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -51,6 +51,7 @@ msr("surv.schmid") proper \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr eps \tab numeric \tab 0.001 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr ERV \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr + remove_obs \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr } } @@ -82,10 +83,9 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \itemize{ \item \code{t_max} (\code{numeric(1)})\cr -Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. +Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +(truncate \eqn{S(t)}). Mutually exclusive with \code{p_max} or \code{times}. -This will effectively \strong{remove test observations} for which the observed time -(event or censoring) is strictly more than \code{t_max}. It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -144,6 +144,14 @@ If \code{TRUE} then the Explained Residual Variation method is applied, which means the score is standardized against a Kaplan-Meier baseline. Default is \code{FALSE}. } + + +\itemize{ +\item \code{remove_obs} (\code{logical(1)})\cr +Only effective when \code{t_max} or \code{p_max} is provided. Default is \code{FALSE}. +If \code{TRUE}, then we \strong{remove test observations} for which the observed time (event or censoring) is strictly larger than \code{t_max}. +See "Time Cutoff Details" section for more details. 
+} } \section{Properness}{ @@ -209,17 +217,26 @@ The training data is automatically used in scoring resamplings. \section{Time Cutoff Details}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is -filtered up to the time cutoff for all observations. -Also, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. -This is a data processing step to alleviate the problems that arise when using IPCW +truncated at the time cutoff for all observations. + +Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This data preprocessing step mitigates issues that arise when using IPCW in cases of administrative censoring, see Kvamme et al. (2023). -It also helps alleviate \strong{inflation of the score} in cases where an observation is -censored at the last observed time point and no time cutoff is given, which results in -\eqn{G(t) = 0} and the use of \code{eps} instead. -The proper version of this score is more affected by this issue, see Sonabend -et al. (2024) for more details. +Practically, this step, along with setting a time cutoff \code{t_max}, helps mitigate +the \strong{inflation of the score} observed when an observation is censored at the +final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +small constant \code{eps} instead. +This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +for more details. +Note that the \code{t_max} and \code{remove_obs} parameters do not affect the estimation +of the censoring distribution, i.e. \strong{always all the observations are used for estimating} \eqn{G(t)}. + +If \code{remove_obs = FALSE}, inflated scores may occur. 
While this aligns more closely +with the definitions presented in the original papers, it can lead to misleading +evaluation and poor optimization outcomes when using this score for model tuning. } \references{ From f290c968d86d8a78f06e764d3ec53a9201e696ac Mon Sep 17 00:00:00 2001 From: john Date: Tue, 17 Dec 2024 20:50:25 +0100 Subject: [PATCH 19/24] update tests --- tests/testthat/test_mlr_measures.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test_mlr_measures.R b/tests/testthat/test_mlr_measures.R index c40630bf5..486e3fd3a 100644 --- a/tests/testthat/test_mlr_measures.R +++ b/tests/testthat/test_mlr_measures.R @@ -163,15 +163,18 @@ test_that("graf: t_max, p_max, times", { t_max = 100 times_flt = times[times <= t_max] # keep only times until the `t_max` m0 = p$score(msr("surv.graf")) # uses all test time points - m1 = p$score(msr("surv.graf", times = times_flt)) # uses times_flt + m1 = p$score(msr("surv.graf", times = times_flt)) # uses `times_flt` m2 = p$score(msr("surv.graf", t_max = t_max)) # 100 + m22 = p$score(msr("surv.graf", t_max = t_max, remove_obs = TRUE)) # 100 m3 = p$score(msr("surv.graf", t_max = max(times))) # 104 - m4 = p$score(msr("surv.graf", t_max = max(times) + 1)) # 105 + m4 = p$score(msr("surv.graf", t_max = max(times) + 10)) # 105 # different time points considered expect_true(m0 != m1) - # same time points are used, but `t_max` also removes observations - expect_true(m1 != m2) + # same time points are used, and no removal of observations (original Graf score) + expect_equal(m1, m2) + # same time points are used, but observations with `t > t_max` are removed + expect_true(m2 != m22) # different `t_max` => different time points used expect_true(m2 != m3) # different `t_max` but after the max evaluation time point, so result stays the same From 37d11a70fdce66c455a546df51db9e9cc5dfcdbe Mon Sep 17 00:00:00 2001 From: john Date: Tue, 17 Dec 2024 20:51:52 +0100 Subject: [PATCH 20/24] update news 
--- NEWS.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5370a0a74..cf916ff88 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,11 @@ # mlr3proba 0.7.1 -* Removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) -* Bug fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) -* fix: G(t) is not filtered when `t_max|p_max` is specified in scoring rules (didn't influence evalution at all) -* docs: Clarified the use and impact of using `t_max` in scoring rules (removing observations as a processing step to alleviate IPCW issues). +* cleanup: removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) +* fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) +* fix: G(t) is not filtered when `t_max|p_max` is specified in scoring rules (didn't influence evaluation at all) +* docs: Clarified the use and impact of using `t_max` in scoring rules +* feat: Added new argument `remove_obs` in scoring rules to remove observations with observed time `t > t_max` as a processing step to alleviate IPCW issues. +This was before 'hard-coded' which made the Integrated Brier Score (`msr("surv.graf")`) differ minimally from other implementations and the original definition. 
# mlr3proba 0.7.0 From 1ddba16ad14895d104fde7483f6d5e0f0fc0b113 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 18 Dec 2024 18:31:04 +0100 Subject: [PATCH 21/24] change experimental badge position --- man-roxygen/details_tmax.R | 3 ++- man/mlr_measures_surv.graf.Rd | 3 ++- man/mlr_measures_surv.intlogloss.Rd | 3 ++- man/mlr_measures_surv.schmid.Rd | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/man-roxygen/details_tmax.R b/man-roxygen/details_tmax.R index 0086500a8..20425fc23 100644 --- a/man-roxygen/details_tmax.R +++ b/man-roxygen/details_tmax.R @@ -1,9 +1,10 @@ #' @section Time Cutoff Details: -#' `r lifecycle::badge("experimental")` #' #' If `t_max` or `p_max` is given, then the predicted survival function \eqn{S(t)} is #' truncated at the time cutoff for all observations. #' +#' `r lifecycle::badge("experimental")` +#' #' Also, if `remove_obs = TRUE`, **observations with observed times** \eqn{t > t_{max}} **are removed**. #' This data preprocessing step mitigates issues that arise when using IPCW #' in cases of administrative censoring, see Kvamme et al. (2023). diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index 7f070941f..d2e35b91f 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -220,11 +220,12 @@ The training data is automatically used in scoring resamplings. \section{Time Cutoff Details}{ -\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is truncated at the time cutoff for all observations. 
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. This data preprocessing step mitigates issues that arise when using IPCW in cases of administrative censoring, see Kvamme et al. (2023). diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index 425d32869..45c3791b8 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -218,11 +218,12 @@ The training data is automatically used in scoring resamplings. \section{Time Cutoff Details}{ -\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is truncated at the time cutoff for all observations. +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. This data preprocessing step mitigates issues that arise when using IPCW in cases of administrative censoring, see Kvamme et al. (2023). diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index ca25fe310..6adceed86 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -217,11 +217,12 @@ The training data is automatically used in scoring resamplings. 
\section{Time Cutoff Details}{ -\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is truncated at the time cutoff for all observations. +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. This data preprocessing step mitigates issues that arise when using IPCW in cases of administrative censoring, see Kvamme et al. (2023). From 20aa3601b3426d06e5f1bcd89c99a4ac9a142891 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 18 Dec 2024 19:26:21 +0100 Subject: [PATCH 22/24] add examples for scoring rules --- R/MeasureSurvCindex.R | 1 + R/MeasureSurvGraf.R | 1 + R/MeasureSurvIntLogloss.R | 1 + R/MeasureSurvSchmid.R | 1 + man-roxygen/example_scoring_rules.R | 45 +++++++++++++++++++++++++++++ man/mlr_measures_surv.cindex.Rd | 1 + man/mlr_measures_surv.graf.Rd | 44 ++++++++++++++++++++++++++++ man/mlr_measures_surv.intlogloss.Rd | 44 ++++++++++++++++++++++++++++ man/mlr_measures_surv.schmid.Rd | 44 ++++++++++++++++++++++++++++ 9 files changed, 182 insertions(+) create mode 100644 man-roxygen/example_scoring_rules.R diff --git a/R/MeasureSurvCindex.R b/R/MeasureSurvCindex.R index 65cd8134c..2fbe08eb6 100644 --- a/R/MeasureSurvCindex.R +++ b/R/MeasureSurvCindex.R @@ -69,6 +69,7 @@ #' #' # Harrell's C-index evaluated up to a specific time horizon #' p$score(msr("surv.cindex", t_max = 97)) +#' #' # Harrell's C-index evaluated up to the time corresponding to 30% of censoring #' p$score(msr("surv.cindex", p_max = 0.3)) #' diff --git a/R/MeasureSurvGraf.R b/R/MeasureSurvGraf.R index 3bf36a881..da5b577f7 100644 --- 
a/R/MeasureSurvGraf.R +++ b/R/MeasureSurvGraf.R @@ -53,6 +53,7 @@ #' #' @family Probabilistic survival measures #' @family distr survival measures +#' @template example_scoring_rules #' @export MeasureSurvGraf = R6Class("MeasureSurvGraf", inherit = MeasureSurv, diff --git a/R/MeasureSurvIntLogloss.R b/R/MeasureSurvIntLogloss.R index 7053e770b..dc5e9492b 100644 --- a/R/MeasureSurvIntLogloss.R +++ b/R/MeasureSurvIntLogloss.R @@ -51,6 +51,7 @@ #' #' @family Probabilistic survival measures #' @family distr survival measures +#' @template example_scoring_rules #' @export MeasureSurvIntLogloss = R6Class("MeasureSurvIntLogloss", inherit = MeasureSurv, diff --git a/R/MeasureSurvSchmid.R b/R/MeasureSurvSchmid.R index 0b75e37a4..428fae41e 100644 --- a/R/MeasureSurvSchmid.R +++ b/R/MeasureSurvSchmid.R @@ -50,6 +50,7 @@ #' #' @family Probabilistic survival measures #' @family distr survival measures +#' @template example_scoring_rules #' @export MeasureSurvSchmid = R6Class("MeasureSurvSchmid", inherit = MeasureSurv, diff --git a/man-roxygen/example_scoring_rules.R b/man-roxygen/example_scoring_rules.R new file mode 100644 index 000000000..4701385b4 --- /dev/null +++ b/man-roxygen/example_scoring_rules.R @@ -0,0 +1,45 @@ +#' <% measure = suppressWarnings(get(fullname)$new()) %> +#' +#' @examples +#' library(mlr3) +#' +#' # Define a survival Task +#' task = tsk("lung") +#' +#' # Create train and test set +#' part = partition(task) +#' +#' # Train Cox learner on the train set +#' cox = lrn("surv.coxph") +#' cox$train(task, row_ids = part$train) +#' +#' # Make predictions for the test set +#' p = cox$predict(task, row_ids = part$test) +#' +#' # <%=improper_id%>, G(t) calculated using the test set +#' p$score(msr("<%=measure$id%>")) +#' +#' # <%=improper_id%>, G(t) calculated using the train set (always recommended) +#' p$score(msr("<%=measure$id%>"), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, ERV score (comparing with KM baseline) +#' 
p$score(msr("<%=measure$id%>", ERV = TRUE), task = task, train_set = part$train) +#' +#' # <%=improper_id%> at specific time point +#' p$score(msr("<%=measure$id%>", times = 365), task = task, train_set = part$train) +#' +#' # <%=improper_id%> at multiple time points (integrated) +#' p$score(msr("<%=measure$id%>", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, use time cutoff +#' p$score(msr("<%=measure$id%>", t_max = 700), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, use time cutoff and also remove observations +#' p$score(msr("<%=measure$id%>", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, use time cutoff corresponding to specific proportion of censoring on the test set +#' p$score(msr("<%=measure$id%>", p_max = 0.8), task = task, train_set = part$train) +#' +#' # <%=proper_id%>, G(t) calculated using the train set +#' p$score(msr("<%=measure$id%>", proper = TRUE), task = task, train_set = part$train) +#' diff --git a/man/mlr_measures_surv.cindex.Rd b/man/mlr_measures_surv.cindex.Rd index 5697b9764..9fba1e787 100644 --- a/man/mlr_measures_surv.cindex.Rd +++ b/man/mlr_measures_surv.cindex.Rd @@ -105,6 +105,7 @@ p$score(msr("surv.cindex", weight_meth = "G2"), # Harrell's C-index evaluated up to a specific time horizon p$score(msr("surv.cindex", t_max = 97)) + # Harrell's C-index evaluated up to the time corresponding to 30\% of censoring p$score(msr("surv.cindex", p_max = 0.3)) diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index d2e35b91f..37b26f853 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -243,6 +243,50 @@ with the definitions presented in the original papers, it can lead to misleading evaluation and poor optimization outcomes when using this score for model tuning. 
} +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# ISBS, G(t) calculated using the test set +p$score(msr("surv.graf")) + +# ISBS, G(t) calculated using the train set (always recommended) +p$score(msr("surv.graf"), task = task, train_set = part$train) + +# ISBS, ERV score (comparing with KM baseline) +p$score(msr("surv.graf", ERV = TRUE), task = task, train_set = part$train) + +# ISBS at specific time point +p$score(msr("surv.graf", times = 365), task = task, train_set = part$train) + +# ISBS at multiple time points (integrated) +p$score(msr("surv.graf", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) + +# ISBS, use time cutoff +p$score(msr("surv.graf", t_max = 700), task = task, train_set = part$train) + +# ISBS, use time cutoff and also remove observations +p$score(msr("surv.graf", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) + +# ISBS, use time cutoff corresponding to specific proportion of censoring on the test set +p$score(msr("surv.graf", p_max = 0.8), task = task, train_set = part$train) + +# RISBS, G(t) calculated using the train set +p$score(msr("surv.graf", proper = TRUE), task = task, train_set = part$train) + +} \references{ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). 
\dQuote{Assessment and comparison of prognostic classification schemes for survival data.} diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index 45c3791b8..9498d433c 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -241,6 +241,50 @@ with the definitions presented in the original papers, it can lead to misleading evaluation and poor optimization outcomes when using this score for model tuning. } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# ISLL, G(t) calculated using the test set +p$score(msr("surv.intlogloss")) + +# ISLL, G(t) calculated using the train set (always recommended) +p$score(msr("surv.intlogloss"), task = task, train_set = part$train) + +# ISLL, ERV score (comparing with KM baseline) +p$score(msr("surv.intlogloss", ERV = TRUE), task = task, train_set = part$train) + +# ISLL at specific time point +p$score(msr("surv.intlogloss", times = 365), task = task, train_set = part$train) + +# ISLL at multiple time points (integrated) +p$score(msr("surv.intlogloss", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) + +# ISLL, use time cutoff +p$score(msr("surv.intlogloss", t_max = 700), task = task, train_set = part$train) + +# ISLL, use time cutoff and also remove observations +p$score(msr("surv.intlogloss", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) + +# ISLL, use time cutoff corresponding to specific proportion of censoring on the test set +p$score(msr("surv.intlogloss", p_max = 0.8), task = task, train_set = part$train) + +# RISLL, G(t) calculated using the train set +p$score(msr("surv.intlogloss", proper = TRUE), task = task, train_set = 
part$train) + +} \references{ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). \dQuote{Assessment and comparison of prognostic classification schemes for survival data.} diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index 6adceed86..03723f90a 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -240,6 +240,50 @@ with the definitions presented in the original papers, it can lead to misleading evaluation and poor optimization outcomes when using this score for model tuning. } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# ISS, G(t) calculated using the test set +p$score(msr("surv.schmid")) + +# ISS, G(t) calculated using the train set (always recommended) +p$score(msr("surv.schmid"), task = task, train_set = part$train) + +# ISS, ERV score (comparing with KM baseline) +p$score(msr("surv.schmid", ERV = TRUE), task = task, train_set = part$train) + +# ISS at specific time point +p$score(msr("surv.schmid", times = 365), task = task, train_set = part$train) + +# ISS at multiple time points (integrated) +p$score(msr("surv.schmid", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) + +# ISS, use time cutoff +p$score(msr("surv.schmid", t_max = 700), task = task, train_set = part$train) + +# ISS, use time cutoff and also remove observations +p$score(msr("surv.schmid", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) + +# ISS, use time cutoff corresponding to specific proportion of censoring on the test set +p$score(msr("surv.schmid", p_max = 0.8), task = task, train_set = part$train) + +# RISS, G(t) calculated using the train set +p$score(msr("surv.schmid", proper = TRUE), task = 
task, train_set = part$train) + +} \references{ Schemper, Michael, Henderson, Robin (2000). \dQuote{Predictive Accuracy and Explained Variation in Cox Regression.} From 14cf2b282c4a18eb783632983b28747e851f5e40 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 18 Dec 2024 20:05:21 +0100 Subject: [PATCH 23/24] add examples for AUC measures --- R/MeasureSurvChamblessAUC.R | 1 + R/MeasureSurvHungAUC.R | 1 + R/MeasureSurvSongAUC.R | 1 + R/MeasureSurvUnoAUC.R | 1 + man-roxygen/example_auc_measures.R | 27 ++++++++++++++++++++++++++ man/mlr_measures_surv.chambless_auc.Rd | 26 +++++++++++++++++++++++++ man/mlr_measures_surv.hung_auc.Rd | 26 +++++++++++++++++++++++++ man/mlr_measures_surv.song_auc.Rd | 26 +++++++++++++++++++++++++ man/mlr_measures_surv.uno_auc.Rd | 26 +++++++++++++++++++++++++ 9 files changed, 135 insertions(+) create mode 100644 man-roxygen/example_auc_measures.R diff --git a/R/MeasureSurvChamblessAUC.R b/R/MeasureSurvChamblessAUC.R index 15071a89d..3d469ac62 100644 --- a/R/MeasureSurvChamblessAUC.R +++ b/R/MeasureSurvChamblessAUC.R @@ -15,6 +15,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvChamblessAUC = R6Class("MeasureSurvChamblessAUC", inherit = MeasureSurvAUC, diff --git a/R/MeasureSurvHungAUC.R b/R/MeasureSurvHungAUC.R index 91141cd9b..c8cf41ce1 100644 --- a/R/MeasureSurvHungAUC.R +++ b/R/MeasureSurvHungAUC.R @@ -15,6 +15,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvHungAUC = R6Class("MeasureSurvHungAUC", inherit = MeasureSurvAUC, diff --git a/R/MeasureSurvSongAUC.R b/R/MeasureSurvSongAUC.R index 401a18883..ee79ec939 100644 --- a/R/MeasureSurvSongAUC.R +++ b/R/MeasureSurvSongAUC.R @@ -16,6 +16,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvSongAUC = R6Class("MeasureSurvSongAUC", inherit = MeasureSurvAUC, 
diff --git a/R/MeasureSurvUnoAUC.R b/R/MeasureSurvUnoAUC.R index b1565944f..9c5c3ffe4 100644 --- a/R/MeasureSurvUnoAUC.R +++ b/R/MeasureSurvUnoAUC.R @@ -16,6 +16,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvUnoAUC = R6Class("MeasureSurvUnoAUC", inherit = MeasureSurvAUC, diff --git a/man-roxygen/example_auc_measures.R b/man-roxygen/example_auc_measures.R new file mode 100644 index 000000000..c229fa965 --- /dev/null +++ b/man-roxygen/example_auc_measures.R @@ -0,0 +1,27 @@ +#' <% measure = suppressWarnings(get(fullname)$new()) %> +#' +#' @examples +#' library(mlr3) +#' +#' # Define a survival Task +#' task = tsk("lung") +#' +#' # Create train and test set +#' part = partition(task) +#' +#' # Train Cox learner on the train set +#' cox = lrn("surv.coxph") +#' cox$train(task, row_ids = part$train) +#' +#' # Make predictions for the test set +#' p = cox$predict(task, row_ids = part$test) +#' +#' # Integrated AUC score +#' p$score(msr("<%=measure$id%>"), task = task, train_set = part$train, learner = cox) +#' +#' # AUC at specific time point +#' p$score(msr("<%=measure$id%>", times = 600), task = task, train_set = part$train, learner = cox) +#' +#' # Integrated AUC at specific time points +#' p$score(msr("<%=measure$id%>", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) +#' diff --git a/man/mlr_measures_surv.chambless_auc.Rd b/man/mlr_measures_surv.chambless_auc.Rd index 6b8476ace..88e68bca9 100644 --- a/man/mlr_measures_surv.chambless_auc.Rd +++ b/man/mlr_measures_surv.chambless_auc.Rd @@ -61,6 +61,32 @@ If \code{integrated == FALSE} then a single time point at which to return the sc } } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the 
test set +p = cox$predict(task, row_ids = part$test) + +# Integrated AUC score +p$score(msr("surv.chambless_auc"), task = task, train_set = part$train, learner = cox) + +# AUC at specific time point +p$score(msr("surv.chambless_auc", times = 600), task = task, train_set = part$train, learner = cox) + +# Integrated AUC at specific time points +p$score(msr("surv.chambless_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) + +} \references{ Chambless LE, Diao G (2006). \dQuote{Estimation of time-dependent area under the ROC curve for long-term risk prediction.} diff --git a/man/mlr_measures_surv.hung_auc.Rd b/man/mlr_measures_surv.hung_auc.Rd index e5207db4e..3f05b93ce 100644 --- a/man/mlr_measures_surv.hung_auc.Rd +++ b/man/mlr_measures_surv.hung_auc.Rd @@ -61,6 +61,32 @@ If \code{integrated == FALSE} then a single time point at which to return the sc } } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# Integrated AUC score +p$score(msr("surv.hung_auc"), task = task, train_set = part$train, learner = cox) + +# AUC at specific time point +p$score(msr("surv.hung_auc", times = 600), task = task, train_set = part$train, learner = cox) + +# Integrated AUC at specific time points +p$score(msr("surv.hung_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) + +} \references{ Hung H, Chiang C (2010). 
\dQuote{Estimation methods for time-dependent AUC models with survival data.} diff --git a/man/mlr_measures_surv.song_auc.Rd b/man/mlr_measures_surv.song_auc.Rd index f57df5545..deceab503 100644 --- a/man/mlr_measures_surv.song_auc.Rd +++ b/man/mlr_measures_surv.song_auc.Rd @@ -69,6 +69,32 @@ incident TPR, \code{cumulative} refers to cumulative TPR. } } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# Integrated AUC score +p$score(msr("surv.song_auc"), task = task, train_set = part$train, learner = cox) + +# AUC at specific time point +p$score(msr("surv.song_auc", times = 600), task = task, train_set = part$train, learner = cox) + +# Integrated AUC at specific time points +p$score(msr("surv.song_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) + +} \references{ Song, Xiao, Zhou, Xiao-Hua (2008). 
\dQuote{A semiparametric approach for the covariate specific ROC curve with survival outcome.} diff --git a/man/mlr_measures_surv.uno_auc.Rd b/man/mlr_measures_surv.uno_auc.Rd index 5536ad7ad..b4206ac29 100644 --- a/man/mlr_measures_surv.uno_auc.Rd +++ b/man/mlr_measures_surv.uno_auc.Rd @@ -61,6 +61,32 @@ If \code{integrated == FALSE} then a single time point at which to return the sc } } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# Integrated AUC score +p$score(msr("surv.uno_auc"), task = task, train_set = part$train, learner = cox) + +# AUC at specific time point +p$score(msr("surv.uno_auc", times = 600), task = task, train_set = part$train, learner = cox) + +# Integrated AUC at specific time points +p$score(msr("surv.uno_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) + +} \references{ Uno H, Cai T, Tian L, Wei LJ (2007). 
\dQuote{Evaluating Prediction Rules fort-Year Survivors With Censored Regression Models.} From e3766e80367e45ad922438620130713b7b317d3d Mon Sep 17 00:00:00 2001 From: john Date: Wed, 18 Dec 2024 20:07:50 +0100 Subject: [PATCH 24/24] update news --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index cf916ff88..375a4d6ac 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,7 +3,7 @@ * cleanup: removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) * fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) * fix: G(t) is not filtered when `t_max|p_max` is specified in scoring rules (didn't influence evaluation at all) -* docs: Clarified the use and impact of using `t_max` in scoring rules +* docs: Clarified the use and impact of using `t_max` in scoring rules, added examples in scoring rules and AUC scores * feat: Added new argument `remove_obs` in scoring rules to remove observations with observed time `t > t_max` as a processing step to alleviate IPCW issues. This was before 'hard-coded' which made the Integrated Brier Score (`msr("surv.graf")`) differ minimally from other implementations and the original definition.