From aea626f1e457accc9e7737cba4712245a98b0dff Mon Sep 17 00:00:00 2001 From: john Date: Wed, 11 Dec 2024 23:28:35 +0100 Subject: [PATCH 01/24] fix styling --- src/survival_scores.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/survival_scores.cpp b/src/survival_scores.cpp index 52c2f396b..6d569f9bb 100644 --- a/src/survival_scores.cpp +++ b/src/survival_scores.cpp @@ -91,13 +91,12 @@ NumericMatrix c_weight_survival_score(const NumericMatrix& score, k = 1; break; } else if (times[i] >= cens_times[l] && - (l == cens_times.length() - 1 || - times[i] < cens_times[l + 1])) { + (l == cens_times.length() - 1 || times[i] < cens_times[l + 1])) { k = cens_surv[l]; - // k == 0 only if last obsv censored, therefore mat is set to 0 - // anyway This division by eps can cause inflation of the score, - // due to a very large value for a particular (i-obs, j-time) use - // 't_max' to filter 'cens' in that case + // k == 0 only if last obs censored, therefore mat is set to 0 anyway + // This division by eps can cause inflation of the score, + // due to a very large value for a particular (i-obs, j-time) + // Use 't_max' to filter 'cens' in that case if (k == 0) { k = eps; } From 0cf1567e1cc8226f2705e084ebcdd806fb5224cb Mon Sep 17 00:00:00 2001 From: john Date: Wed, 11 Dec 2024 23:32:22 +0100 Subject: [PATCH 02/24] fix: t_max does not remove observations for ISBS --- R/integrated_scores.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/integrated_scores.R b/R/integrated_scores.R index 2e6b1ba2d..eeda03f6b 100644 --- a/R/integrated_scores.R +++ b/R/integrated_scores.R @@ -90,8 +90,8 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, rownames(cdf) = unique_times # times x obs } - # apply `t_max` cutoff to remove observations - if (tmax_apply) { + # apply `t_max` cutoff to remove observations for RISBS + if (tmax_apply && proper) { true_times = test_times[test_times <= t_max] true_status = 
test_status[test_times <= t_max] cdf = cdf[, test_times <= t_max, drop = FALSE] @@ -128,8 +128,8 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, # G(t): KM estimate of the censoring distribution cens = matrix(c(cens$time, cens$surv), ncol = 2L) - # filter G(t) time points based on `t_max` cutoff - if (tmax_apply) { + # filter G(t) time points based on `t_max` cutoff for ISBS + if (tmax_apply && proper) { cens = cens[cens[, 1L] <= t_max, , drop = FALSE] } From c4eb657ef00526c6da88bfb53c08d8c1bd4714d2 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 11 Dec 2024 23:32:56 +0100 Subject: [PATCH 03/24] fix test --- tests/testthat/test_mlr_measures.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test_mlr_measures.R b/tests/testthat/test_mlr_measures.R index c40630bf5..e04abf5eb 100644 --- a/tests/testthat/test_mlr_measures.R +++ b/tests/testthat/test_mlr_measures.R @@ -170,8 +170,8 @@ test_that("graf: t_max, p_max, times", { # different time points considered expect_true(m0 != m1) - # same time points are used, but `t_max` also removes observations - expect_true(m1 != m2) + # same time points are used (`t_max` does NOT remove observations) + expect_equal(m1, m2) # different `t_max` => different time points used expect_true(m2 != m3) # different `t_max` but after the max evaluation time point, so result stays the same From b297f7996beb4174c60f4aa3669108531c37603e Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 10:21:13 +0100 Subject: [PATCH 04/24] small comment fix --- R/integrated_scores.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/integrated_scores.R b/R/integrated_scores.R index eeda03f6b..018b372cd 100644 --- a/R/integrated_scores.R +++ b/R/integrated_scores.R @@ -128,7 +128,7 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, # G(t): KM estimate of the censoring distribution cens = matrix(c(cens$time, cens$surv), ncol = 2L) - # filter 
G(t) time points based on `t_max` cutoff for ISBS + # filter G(t) time points based on `t_max` cutoff for RISBS if (tmax_apply && proper) { cens = cens[cens[, 1L] <= t_max, , drop = FALSE] } From 565b31a4a2e28b362a6433d897b66690b7fd307c Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 11:15:11 +0100 Subject: [PATCH 05/24] add scoring rules paper --- R/bibentries.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/R/bibentries.R b/R/bibentries.R index 7b4dccd19..779e04438 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -741,5 +741,14 @@ bibentries = c( # nolint start title = "Simulating Survival Data Using the simsurv R Package", volume = "97", year = "2021" + ), + sonabend2024 = bibentry("misc", + archivePrefix = "arXiv", + arxivId = "2212.05260", + author = "Sonabend, Raphael and Zobolas, John and Kopper, Philipp and Burk, Lukas and Bender, Andreas", + month = "dec", + title = "Examining properness in the external validation of survival models with squared and logarithmic losses", + url = "https://arxiv.org/abs/2212.05260v2", + year = "2024" ) ) From 7dfcc610caa8fdcd7139b74de291a8685159c414 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 11:22:20 +0100 Subject: [PATCH 06/24] update measure docs --- R/MeasureSurvGraf.R | 6 ++-- R/MeasureSurvIntLogloss.R | 6 ++-- R/MeasureSurvLogloss.R | 5 ++- R/MeasureSurvSchmid.R | 13 ++------ man-roxygen/details_tmax.R | 14 ++++----- man-roxygen/details_trainG.R | 12 ++++--- man-roxygen/param_proper.R | 4 ++- man-roxygen/param_tmax.R | 4 +-- man-roxygen/properness.R | 4 +++ man/mlr_measures_surv.graf.Rd | 43 +++++++++++++++---------- man/mlr_measures_surv.intlogloss.Rd | 43 +++++++++++++++---------- man/mlr_measures_surv.logloss.Rd | 18 ++++++++--- man/mlr_measures_surv.schmid.Rd | 49 ++++++++++++++++------------- 13 files changed, 131 insertions(+), 90 deletions(-) diff --git a/R/MeasureSurvGraf.R b/R/MeasureSurvGraf.R index c062b495e..53d5b5c18 100644 --- a/R/MeasureSurvGraf.R +++ 
b/R/MeasureSurvGraf.R @@ -25,13 +25,13 @@ #' survival function \eqn{S_i(t)}, the *observation-wise* loss integrated across #' the time dimension up to the time cutoff \eqn{\tau^*}, is: #' -#' \deqn{L_{ISBS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +#' \deqn{L_{ISBS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} #' #' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. #' #' The **re-weighted ISBS** (RISBS) is: #' -#' \deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +#' \deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. 
#' @@ -48,7 +48,7 @@ #' @template details_tmax #' #' @references -#' `r format_bib("graf_1999")` +#' `r format_bib("graf_1999", "sonabend2024")` #' #' @family Probabilistic survival measures #' @family distr survival measures diff --git a/R/MeasureSurvIntLogloss.R b/R/MeasureSurvIntLogloss.R index 3768f2a9c..138626317 100644 --- a/R/MeasureSurvIntLogloss.R +++ b/R/MeasureSurvIntLogloss.R @@ -23,13 +23,13 @@ #' survival function \eqn{S_i(t)}, the *observation-wise* loss integrated across #' the time dimension up to the time cutoff \eqn{\tau^*}, is: #' -#' \deqn{L_{ISLL}(S_i, t_i, \delta_i) = -\text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +#' \deqn{L_{ISLL}(S_i, t_i, \delta_i) = - \int^{\tau^*}_0 \frac{log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} #' #' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. #' #' The **re-weighted ISLL** (RISLL) is: #' -#' \deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +#' \deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. 
#' @@ -46,7 +46,7 @@ #' @template details_tmax #' #' @references -#' `r format_bib("graf_1999")` +#' `r format_bib("graf_1999", "sonabend2024")` #' #' @family Probabilistic survival measures #' @family distr survival measures diff --git a/R/MeasureSurvLogloss.R b/R/MeasureSurvLogloss.R index ceb57dc68..a510c649e 100644 --- a/R/MeasureSurvLogloss.R +++ b/R/MeasureSurvLogloss.R @@ -10,7 +10,7 @@ #' Calculates the cross-entropy, or negative log-likelihood (NLL) or logarithmic (log), loss. #' @section Parameter details: #' - `IPCW` (`logical(1)`)\cr -#' If `TRUE` (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). +#' If `TRUE` (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). See Sonabend et al. (2024) for more details. #' #' @details #' The Log Loss, in the context of probabilistic predictions, is defined as the @@ -33,6 +33,9 @@ #' #' @template details_trainG #' +#' @references +#' `r format_bib("sonabend2024")` +#' #' @family Probabilistic survival measures #' @family distr survival measures #' @export diff --git a/R/MeasureSurvSchmid.R b/R/MeasureSurvSchmid.R index 22c574a91..2fd886def 100644 --- a/R/MeasureSurvSchmid.R +++ b/R/MeasureSurvSchmid.R @@ -22,13 +22,13 @@ #' survival function \eqn{S_i(t)}, the *observation-wise* loss integrated across #' the time dimension up to the time cutoff \eqn{\tau^*}, is: #' -#' \deqn{L_{ISS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +#' \deqn{L_{ISS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} #' #' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. 
#' #' The **re-weighted ISS** (RISS) is: #' -#' \deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +#' \deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. #' @@ -36,13 +36,6 @@ #' return the average of the time-integrated observation-wise scores: #' \deqn{\sum_{i=1}^N L(S_i, t_i, \delta_i) / N} #' -#' -#' \deqn{L_{ISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t^*))]} -#' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. -#' -#' The re-weighted ISS, RISS is given by -#' \deqn{L_{RISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t))]} -#' #' @template properness #' @templateVar improper_id ISS #' @templateVar proper_id RISS @@ -52,7 +45,7 @@ #' @template details_tmax #' #' @references -#' `r format_bib("schemper_2000", "schmid_2011")` +#' `r format_bib("schemper_2000", "schmid_2011", "sonabend2024")` #' #' @family Probabilistic survival measures #' @family distr survival measures diff --git a/man-roxygen/details_tmax.R b/man-roxygen/details_tmax.R index 0d8fc023f..5e9065e37 100644 --- a/man-roxygen/details_tmax.R +++ b/man-roxygen/details_tmax.R @@ -1,9 +1,9 @@ #' @section Time Cutoff Details: #' -#' If `t_max` or `p_max` is given, then \eqn{G(t)} will be fitted using **all observations** from the -#' train set (or test set) and only then the cutoff time will be applied. -#' This is to ensure that more data is used for fitting the censoring distribution via the -#' Kaplan-Meier. 
-#' Setting the `t_max` can help alleviate inflation of the score when `proper` is `TRUE`, -#' in cases where an observation is censored at the last observed time point. -#' This results in \eqn{G(t_{max}) = 0} and the use of `eps` instead (when `t_max` is `NULL`). +#' If `t_max` or `p_max` is given, then the predicted survival function \eqn{S(t)} is +#' filtered up to the time cutoff for all observations. +#' Also, when `proper = TRUE`, \eqn{G(t)} will be filtered up to the cutoff time as well. +#' This helps alleviate inflation of the score in cases where an observation is +#' censored at the last observed time point and no time cutoff is given, which results in +#' \eqn{G(t_{max}) = 0} and the use of `eps` instead. +#' diff --git a/man-roxygen/details_trainG.R b/man-roxygen/details_trainG.R index 2a131c29c..8c19e2a6c 100644 --- a/man-roxygen/details_trainG.R +++ b/man-roxygen/details_trainG.R @@ -1,6 +1,10 @@ #' @section Data used for Estimating Censoring Distribution: #' -#' If `task` and `train_set` are passed to `$score` then \eqn{G(t)} is fit on training data, -#' otherwise testing data. The first is likely to reduce any bias caused by calculating -#' parts of the measure on the test data it is evaluating. The training data is automatically -#' used in scoring resamplings. +#' If `task` and `train_set` are passed to `$score` then \eqn{G(t)} is fit using +#' **all observations** from the train set, otherwise the test set is used. +#' Using the train set is likely to reduce any bias caused by calculating parts of the +#' measure on the test data it is evaluating. +#' Also usually it means that more data is used for fitting the censoring +#' distribution \eqn{G(t)} via the Kaplan-Meier. +#' The training data is automatically used in scoring resamplings. 
+#' diff --git a/man-roxygen/param_proper.R b/man-roxygen/param_proper.R index 1e130026b..84205f74e 100644 --- a/man-roxygen/param_proper.R +++ b/man-roxygen/param_proper.R @@ -3,6 +3,8 @@ #' If `TRUE` then weights scores by the censoring distribution at #' the observed event time, which results in a strictly proper scoring #' rule if censoring and survival time distributions are independent -#' and a sufficiently large dataset is used. +#' and a sufficiently large dataset is used, see Sonabend et al. (2024). #' If `FALSE` then weights scores by the Graf method which is the #' more common usage but the loss is not proper. +#' See "Properness" section for more details. +#' diff --git a/man-roxygen/param_tmax.R b/man-roxygen/param_tmax.R index 268605756..7f8761de7 100644 --- a/man-roxygen/param_tmax.R +++ b/man-roxygen/param_tmax.R @@ -3,6 +3,6 @@ #' Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. #' Mutually exclusive with `p_max` or `times`. #' This will effectively remove test observations for which the observed time -#' (event or censoring) is strictly more than `t_max`. -#' It's recommended to set `t_max` to avoid division by `eps`, see Details. +#' (event or censoring) is strictly more than `t_max` in the case of `proper = TRUE`. +#' It's recommended to set `t_max` to avoid division by `eps`, see "Time Cutoff Details" section. #' If `t_max` is not specified, an `Inf` time horizon is assumed. diff --git a/man-roxygen/properness.R b/man-roxygen/properness.R index cb1c6d8f0..d4b149149 100644 --- a/man-roxygen/properness.R +++ b/man-roxygen/properness.R @@ -1,4 +1,5 @@ #' @section Properness: +#' `r lifecycle::badge("experimental")` #' #' <%=proper_id%> is strictly proper when the censoring distribution is independent #' of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. 
@@ -7,3 +8,6 @@ #' Results may be very different if many observations are censored at the last #' observed time due to division by \eqn{1/eps} in `proper = TRUE`. #' +#' See Sonabend et al. (2024) for more details. +#' `proper = TRUE` should be used with caution as it is still considered an experimental metric. +#' diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index 4d2924dfc..af2ba6f01 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -17,13 +17,13 @@ outcome \eqn{(t_i, \delta_i)} (time and censoring indicator) and predicted survival function \eqn{S_i(t)}, the \emph{observation-wise} loss integrated across the time dimension up to the time cutoff \eqn{\tau^*}, is: -\deqn{L_{ISBS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +\deqn{L_{ISBS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. The \strong{re-weighted ISBS} (RISBS) is: -\deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +\deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -88,8 +88,8 @@ If \code{integrated == FALSE} then a single time point at which to return the sc Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. Mutually exclusive with \code{p_max} or \code{times}.
This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max}. -It's recommended to set \code{t_max} to avoid division by \code{eps}, see Details. +(event or censoring) is strictly more than \code{t_max} in the case of \code{proper = TRUE}. +It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -126,9 +126,10 @@ Default is \code{FALSE} (returns the mean). If \code{TRUE} then weights scores by the censoring distribution at the observed event time, which results in a strictly proper scoring rule if censoring and survival time distributions are independent -and a sufficiently large dataset is used. +and a sufficiently large dataset is used, see Sonabend et al. (2024). If \code{FALSE} then weights scores by the Graf method which is the more common usage but the loss is not proper. +See "Properness" section for more details. } @@ -150,6 +151,7 @@ Default is \code{FALSE}. \section{Properness}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} RISBS is strictly proper when the censoring distribution is independent of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. @@ -157,6 +159,9 @@ ISBS is never proper. Use \code{proper = FALSE} for ISBS and \code{proper = TRUE} for RISBS. Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. + +See Sonabend et al. (2024) for more details. +The use of \code{proper = TRUE} should be used with caution and it still considered as an experimental metric. } \section{Time points used for evaluation}{ @@ -196,22 +201,24 @@ not used). 
\section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. } \section{Time Cutoff Details}{ -If \code{t_max} or \code{p_max} is given, then \eqn{G(t)} will be fitted using \strong{all observations} from the -train set (or test set) and only then the cutoff time will be applied. -This is to ensure that more data is used for fitting the censoring distribution via the -Kaplan-Meier. -Setting the \code{t_max} can help alleviate inflation of the score when \code{proper} is \code{TRUE}, -in cases where an observation is censored at the last observed time point. -This results in \eqn{G(t_{max}) = 0} and the use of \code{eps} instead (when \code{t_max} is \code{NULL}). +If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is +filtered up to the time cutoff for all observations. +Also, when \code{proper = TRUE}, \eqn{G(t)} will be filtered up to the cutoff time as well. +This helps alleviate inflation of the score in cases where an observation is +censored at the last observed time point and no time cutoff is given, which results in +\eqn{G(t_{max}) = 0} and the use of \code{eps} instead. 
} \references{ @@ -219,6 +226,10 @@ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). \dQuote{Assessment and comparison of prognostic classification schemes for survival data.} \emph{Statistics in Medicine}, \bold{18}(17-18), 2529--2545. \doi{10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5}. + +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. } \seealso{ Other survival measures: diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index 5a6d7af85..36fcb8b56 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -15,13 +15,13 @@ outcome \eqn{(t_i, \delta_i)} (time and censoring indicator) and predicted survival function \eqn{S_i(t)}, the \emph{observation-wise} loss integrated across the time dimension up to the time cutoff \eqn{\tau^*}, is: -\deqn{L_{ISLL}(S_i, t_i, \delta_i) = -\text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +\deqn{L_{ISLL}(S_i, t_i, \delta_i) = - \int^{\tau^*}_0 \frac{log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. 
The \strong{re-weighted ISLL} (RISLL) is: -\deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +\deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -86,8 +86,8 @@ If \code{integrated == FALSE} then a single time point at which to return the sc Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. Mutually exclusive with \code{p_max} or \code{times}. This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max}. -It's recommended to set \code{t_max} to avoid division by \code{eps}, see Details. +(event or censoring) is strictly more than \code{t_max} in the case of \code{proper = TRUE}. +It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -124,9 +124,10 @@ Default is \code{FALSE} (returns the mean). If \code{TRUE} then weights scores by the censoring distribution at the observed event time, which results in a strictly proper scoring rule if censoring and survival time distributions are independent -and a sufficiently large dataset is used. +and a sufficiently large dataset is used, see Sonabend et al. (2024). If \code{FALSE} then weights scores by the Graf method which is the more common usage but the loss is not proper. +See "Properness" section for more details. } @@ -148,6 +149,7 @@ Default is \code{FALSE}. 
\section{Properness}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} RISLL is strictly proper when the censoring distribution is independent of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. @@ -155,6 +157,9 @@ ISLL is never proper. Use \code{proper = FALSE} for ISLL and \code{proper = TRUE} for RISLL. Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. + +See Sonabend et al. (2024) for more details. +The use of \code{proper = TRUE} should be used with caution and it still considered as an experimental metric. } \section{Time points used for evaluation}{ @@ -194,22 +199,24 @@ not used). \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. } \section{Time Cutoff Details}{ -If \code{t_max} or \code{p_max} is given, then \eqn{G(t)} will be fitted using \strong{all observations} from the -train set (or test set) and only then the cutoff time will be applied. 
-This is to ensure that more data is used for fitting the censoring distribution via the -Kaplan-Meier. -Setting the \code{t_max} can help alleviate inflation of the score when \code{proper} is \code{TRUE}, -in cases where an observation is censored at the last observed time point. -This results in \eqn{G(t_{max}) = 0} and the use of \code{eps} instead (when \code{t_max} is \code{NULL}). +If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is +filtered up to the time cutoff for all observations. +Also, when \code{proper = TRUE}, \eqn{G(t)} will be filtered up to the cutoff time as well. +This helps alleviate inflation of the score in cases where an observation is +censored at the last observed time point and no time cutoff is given, which results in +\eqn{G(t_{max}) = 0} and the use of \code{eps} instead. } \references{ @@ -217,6 +224,10 @@ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). \dQuote{Assessment and comparison of prognostic classification schemes for survival data.} \emph{Statistics in Medicine}, \bold{18}(17-18), 2529--2545. \doi{10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5}. + +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. } \seealso{ Other survival measures: diff --git a/man/mlr_measures_surv.logloss.Rd b/man/mlr_measures_surv.logloss.Rd index 245364758..bc7241647 100644 --- a/man/mlr_measures_surv.logloss.Rd +++ b/man/mlr_measures_surv.logloss.Rd @@ -86,19 +86,27 @@ Default is \code{FALSE}. \itemize{ \item \code{IPCW} (\code{logical(1)})\cr -If \code{TRUE} (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). 
+If \code{TRUE} (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). See Sonabend et al. (2024) for more details. } } \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. } +\references{ +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. 
+} \seealso{ Other survival measures: \code{\link{mlr_measures_surv.calib_alpha}}, diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index d01cd26ea..ff64b953b 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -14,25 +14,19 @@ outcome \eqn{(t_i, \delta_i)} (time and censoring indicator) and predicted survival function \eqn{S_i(t)}, the \emph{observation-wise} loss integrated across the time dimension up to the time cutoff \eqn{\tau^*}, is: -\deqn{L_{ISS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +\deqn{L_{ISS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. The \strong{re-weighted ISS} (RISS) is: -\deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +\deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. To get a single score across all \eqn{N} observations of the test set, we return the average of the time-integrated observation-wise scores: \deqn{\sum_{i=1}^N L(S_i, t_i, \delta_i) / N} - -\deqn{L_{ISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t^*))]} -where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. 
- -The re-weighted ISS, RISS is given by -\deqn{L_{RISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t))]} } \section{Dictionary}{ @@ -91,8 +85,8 @@ If \code{integrated == FALSE} then a single time point at which to return the sc Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. Mutually exclusive with \code{p_max} or \code{times}. This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max}. -It's recommended to set \code{t_max} to avoid division by \code{eps}, see Details. +(event or censoring) is strictly more than \code{t_max} in the case of \code{proper = TRUE}. +It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -129,9 +123,10 @@ Default is \code{FALSE} (returns the mean). If \code{TRUE} then weights scores by the censoring distribution at the observed event time, which results in a strictly proper scoring rule if censoring and survival time distributions are independent -and a sufficiently large dataset is used. +and a sufficiently large dataset is used, see Sonabend et al. (2024). If \code{FALSE} then weights scores by the Graf method which is the more common usage but the loss is not proper. +See "Properness" section for more details. } @@ -153,6 +148,7 @@ Default is \code{FALSE}. \section{Properness}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} RISS is strictly proper when the censoring distribution is independent of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. @@ -160,6 +156,9 @@ ISS is never proper. Use \code{proper = FALSE} for ISS and \code{proper = TRUE} for RISS. 
Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. + +See Sonabend et al. (2024) for more details. +The use of \code{proper = TRUE} should be used with caution and it still considered as an experimental metric. } \section{Time points used for evaluation}{ @@ -199,22 +198,24 @@ not used). \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. } \section{Time Cutoff Details}{ -If \code{t_max} or \code{p_max} is given, then \eqn{G(t)} will be fitted using \strong{all observations} from the -train set (or test set) and only then the cutoff time will be applied. -This is to ensure that more data is used for fitting the censoring distribution via the -Kaplan-Meier. -Setting the \code{t_max} can help alleviate inflation of the score when \code{proper} is \code{TRUE}, -in cases where an observation is censored at the last observed time point. -This results in \eqn{G(t_{max}) = 0} and the use of \code{eps} instead (when \code{t_max} is \code{NULL}). 
+If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is +filtered up to the time cutoff for all observations. +Also, when \code{proper = TRUE}, \eqn{G(t)} will be filtered up to the cutoff time as well. +This helps alleviate inflation of the score in cases where an observation is +censored at the last observed time point and no time cutoff is given, which results in +\eqn{G(t_{max}) = 0} and the use of \code{eps} instead. } \references{ @@ -227,6 +228,10 @@ Schmid, Matthias, Hielscher, Thomas, Augustin, Thomas, Gefeller, Olaf (2011). \dQuote{A Robust Alternative to the Schemper-Henderson Estimator of Prediction Error.} \emph{Biometrics}, \bold{67}(2), 524--535. \doi{10.1111/j.1541-0420.2010.01459.x}. + +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. } \seealso{ Other survival measures: From e0bd34a1062914fce5a756603e2e256173113ed8 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 11:27:41 +0100 Subject: [PATCH 07/24] update news --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 6ba93133a..81ebafbb2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,7 @@ * Removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) * Bug fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) +* Bug fix: `msr("surv.ibrier", proper = FALSE)` was removing observations when `t_max` or `p_max` was used, which resulted in a bit lower scores # mlr3proba 0.7.0 From 30df58f94402869861d38a59e2685056c7997643 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 12:42:27 +0100 Subject: [PATCH 08/24] update news --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md 
b/NEWS.md index 81ebafbb2..8cb3f40e6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ * Removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) * Bug fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) -* Bug fix: `msr("surv.ibrier", proper = FALSE)` was removing observations when `t_max` or `p_max` was used, which resulted in a bit lower scores +* Bug fix: `msr("surv.ibrier", proper = FALSE)` was removing observations when `t_max` or `p_max` was used, which resulted in a bit lower scores. Updated scoring rules formulas and docs accordingly. # mlr3proba 0.7.0 From a2ed3caba98290a3095a3af2a446380719e5a700 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 13:08:06 +0100 Subject: [PATCH 09/24] better doc for d-calibration measure --- R/MeasureSurvDCalibration.R | 19 +++++++++++-------- man/mlr_measures_surv.dcalib.Rd | 19 +++++++++++-------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/R/MeasureSurvDCalibration.R b/R/MeasureSurvDCalibration.R index cc0ad9ae7..053afde95 100644 --- a/R/MeasureSurvDCalibration.R +++ b/R/MeasureSurvDCalibration.R @@ -3,6 +3,8 @@ #' @templateVar fullname MeasureSurvDCalibration #' #' @description +#' `r lifecycle::badge("experimental")` +#' #' This calibration method is defined by calculating the following statistic: #' \deqn{s = B/n \sum_i (P_i - n/B)^2} #' where \eqn{B} is number of 'buckets' (that equally divide \eqn{[0,1]} into intervals), @@ -12,8 +14,8 @@ #' falls within the corresponding interval. #' This statistic assumes that censoring time is independent of death time. #' -#' A model is well-calibrated if \eqn{s \sim Unif(B)}, tested with `chisq.test` -#' (\eqn{p > 0.05} if well-calibrated). +#' A model is well D-calibrated if \eqn{s \sim Unif(B)}, tested with `chisq.test` +#' (\eqn{p > 0.05} if well-calibrated, i.e. higher p-values are preferred). 
#' Model \eqn{i} is better calibrated than model \eqn{j} if \eqn{s(i) < s(j)}, #' meaning that *lower values* of this measure are preferred. #' @@ -23,7 +25,7 @@ #' is well-calibrated. If `chisq = FALSE` and `s` is the predicted value then you can manually #' compute the p.value with `pchisq(s, B - 1, lower.tail = FALSE)`. #' -#' NOTE: This measure is still experimental both theoretically and in implementation. Results +#' **NOTE**: This measure is still experimental both theoretically and in implementation. Results #' should therefore only be taken as an indicator of performance and not for #' conclusive judgements about model calibration. #' @@ -38,11 +40,12 @@ #' You can manually get the p-value by executing `pchisq(s, B - 1, lower.tail = FALSE)`. #' The null hypothesis is that the model is D-calibrated. #' - `truncate` (`double(1)`) \cr -#' This parameter controls the upper bound of the output statistic, -#' when `chisq` is `FALSE`. We use `truncate = Inf` by default but \eqn{10} may be sufficient -#' for most purposes, which corresponds to a p-value of 0.35 for the chisq.test using -#' \eqn{B = 10} buckets. Values \eqn{>10} translate to even lower p-values and thus -#' less calibrated models. If the number of buckets \eqn{B} changes, you probably will want to +#' This parameter controls the upper bound of the output statistic, when `chisq` is `FALSE`. +#' We use `truncate = Inf` by default but values between \eqn{10-16} are sufficient +#' for most purposes, which correspond to p-values of \eqn{0.35-0.06} for the `chisq.test` using +#' the default \eqn{B = 10} buckets. +#' Values \eqn{B > 10} translate to even lower p-values and thus less D-calibrated models. +#' If the number of buckets \eqn{B} changes, you probably will want to #' change the `truncate` value as well to correspond to the same p-value significance. #' Note that truncation may severely limit automated tuning with this measure. 
#' diff --git a/man/mlr_measures_surv.dcalib.Rd b/man/mlr_measures_surv.dcalib.Rd index e0a7779e0..d096a8c35 100644 --- a/man/mlr_measures_surv.dcalib.Rd +++ b/man/mlr_measures_surv.dcalib.Rd @@ -5,6 +5,8 @@ \alias{MeasureSurvDCalibration} \title{D-Calibration Survival Measure} \description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + This calibration method is defined by calculating the following statistic: \deqn{s = B/n \sum_i (P_i - n/B)^2} where \eqn{B} is number of 'buckets' (that equally divide \eqn{[0,1]} into intervals), @@ -14,8 +16,8 @@ of observations in the \eqn{i}th interval. An observation is assigned to the falls within the corresponding interval. This statistic assumes that censoring time is independent of death time. -A model is well-calibrated if \eqn{s \sim Unif(B)}, tested with \code{chisq.test} -(\eqn{p > 0.05} if well-calibrated). +A model is well D-calibrated if \eqn{s \sim Unif(B)}, tested with \code{chisq.test} +(\eqn{p > 0.05} if well-calibrated, i.e. higher p-values are preferred). Model \eqn{i} is better calibrated than model \eqn{j} if \eqn{s(i) < s(j)}, meaning that \emph{lower values} of this measure are preferred. } @@ -25,7 +27,7 @@ The former is useful for model comparison whereas the latter is useful for deter is well-calibrated. If \code{chisq = FALSE} and \code{s} is the predicted value then you can manually compute the p.value with \code{pchisq(s, B - 1, lower.tail = FALSE)}. -NOTE: This measure is still experimental both theoretically and in implementation. Results +\strong{NOTE}: This measure is still experimental both theoretically and in implementation. Results should therefore only be taken as an indicator of performance and not for conclusive judgements about model calibration. } @@ -72,11 +74,12 @@ Default is \code{FALSE} and returns the statistic \code{s}. 
You can manually get the p-value by executing \code{pchisq(s, B - 1, lower.tail = FALSE)}. The null hypothesis is that the model is D-calibrated. \item \code{truncate} (\code{double(1)}) \cr -This parameter controls the upper bound of the output statistic, -when \code{chisq} is \code{FALSE}. We use \code{truncate = Inf} by default but \eqn{10} may be sufficient -for most purposes, which corresponds to a p-value of 0.35 for the chisq.test using -\eqn{B = 10} buckets. Values \eqn{>10} translate to even lower p-values and thus -less calibrated models. If the number of buckets \eqn{B} changes, you probably will want to +This parameter controls the upper bound of the output statistic, when \code{chisq} is \code{FALSE}. +We use \code{truncate = Inf} by default but values between \eqn{10-16} are sufficient +for most purposes, which correspond to p-values of \eqn{0.35-0.06} for the \code{chisq.test} using +the default \eqn{B = 10} buckets. +Values \eqn{B > 10} translate to even lower p-values and thus less D-calibrated models. +If the number of buckets \eqn{B} changes, you probably will want to change the \code{truncate} value as well to correspond to the same p-value significance. Note that truncation may severely limit automated tuning with this measure. 
} From 06b1d891bf81f693313a8a25fac81f962bb27290 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 11:56:20 +0100 Subject: [PATCH 10/24] add Kvamme paper --- R/bibentries.R | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/R/bibentries.R b/R/bibentries.R index 779e04438..1d670cffa 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -750,5 +750,16 @@ bibentries = c( # nolint start title = "Examining properness in the external validation of survival models with squared and logarithmic losses", url = "https://arxiv.org/abs/2212.05260v2", year = "2024" + ), + kvamme2023 = bibentry("article", + author = "Kvamme, Håvard and Borgan, Ørnulf", + issn = "1533-7928", + journal = "Journal of Machine Learning Research", + number = "2", + pages = "1--26", + title = "The Brier Score under Administrative Censoring: Problems and a Solution", + url = "http://jmlr.org/papers/v24/19-1030.html", + volume = "24", + year = "2023" ) ) From bd196c3089510f6749bedadc292b59a8f9e6ea32 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 12:05:05 +0100 Subject: [PATCH 11/24] fix: always remove obs when t_max is applied + do not filter G(t) --- R/integrated_scores.R | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/R/integrated_scores.R b/R/integrated_scores.R index 018b372cd..92b861ab1 100644 --- a/R/integrated_scores.R +++ b/R/integrated_scores.R @@ -90,8 +90,8 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, rownames(cdf) = unique_times # times x obs } - # apply `t_max` cutoff to remove observations for RISBS - if (tmax_apply && proper) { + # apply `t_max` cutoff to remove observations + if (tmax_apply) { true_times = test_times[test_times <= t_max] true_status = test_status[test_times <= t_max] cdf = cdf[, test_times <= t_max, drop = FALSE] @@ -128,11 +128,6 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, # G(t): KM estimate of the censoring distribution cens = matrix(c(cens$time, 
cens$surv), ncol = 2L) - # filter G(t) time points based on `t_max` cutoff for RISBS - if (tmax_apply && proper) { - cens = cens[cens[, 1L] <= t_max, , drop = FALSE] - } - score = .c_weight_survival_score(score, true_truth, unique_times, cens, proper, eps) colnames(score) = unique_times From a868c54fb7d3be26a7e8ec9b3957aa4290d8ff27 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 12:05:47 +0100 Subject: [PATCH 12/24] remove indicator function from score formulas + add citation --- R/MeasureSurvGraf.R | 4 ++-- R/MeasureSurvIntLogloss.R | 4 ++-- R/MeasureSurvSchmid.R | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/MeasureSurvGraf.R b/R/MeasureSurvGraf.R index 53d5b5c18..d44d72747 100644 --- a/R/MeasureSurvGraf.R +++ b/R/MeasureSurvGraf.R @@ -31,7 +31,7 @@ #' #' The **re-weighted ISBS** (RISBS) is: #' -#' \deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} +#' \deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. 
#' @@ -48,7 +48,7 @@ #' @template details_tmax #' #' @references -#' `r format_bib("graf_1999", "sonabend2024")` +#' `r format_bib("graf_1999", "sonabend2024", "kvamme2023")` #' #' @family Probabilistic survival measures #' @family distr survival measures diff --git a/R/MeasureSurvIntLogloss.R b/R/MeasureSurvIntLogloss.R index 138626317..66d74afd5 100644 --- a/R/MeasureSurvIntLogloss.R +++ b/R/MeasureSurvIntLogloss.R @@ -29,7 +29,7 @@ #' #' The **re-weighted ISLL** (RISLL) is: #' -#' \deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} +#' \deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. #' @@ -46,7 +46,7 @@ #' @template details_tmax #' #' @references -#' `r format_bib("graf_1999", "sonabend2024")` +#' `r format_bib("graf_1999", "sonabend2024", "kvamme2023")` #' #' @family Probabilistic survival measures #' @family distr survival measures diff --git a/R/MeasureSurvSchmid.R b/R/MeasureSurvSchmid.R index 2fd886def..119e3fef9 100644 --- a/R/MeasureSurvSchmid.R +++ b/R/MeasureSurvSchmid.R @@ -28,7 +28,7 @@ #' #' The **re-weighted ISS** (RISS) is: #' -#' \deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} +#' \deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. 
#' @@ -45,7 +45,7 @@ #' @template details_tmax #' #' @references -#' `r format_bib("schemper_2000", "schmid_2011", "sonabend2024")` +#' `r format_bib("schemper_2000", "schmid_2011", "sonabend2024", "kvamme2023")` #' #' @family Probabilistic survival measures #' @family distr survival measures From 1bd54df31e3327ca9378222ecdaca2a8394f59b2 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 12:06:24 +0100 Subject: [PATCH 13/24] doc update --- man-roxygen/details_tmax.R | 10 +++++++--- man-roxygen/param_tmax.R | 4 ++-- man-roxygen/properness.R | 2 +- man/mlr_measures_surv.graf.Rd | 23 ++++++++++++++++------- man/mlr_measures_surv.intlogloss.Rd | 23 ++++++++++++++++------- man/mlr_measures_surv.schmid.Rd | 23 ++++++++++++++++------- 6 files changed, 58 insertions(+), 27 deletions(-) diff --git a/man-roxygen/details_tmax.R b/man-roxygen/details_tmax.R index 5e9065e37..12504bc80 100644 --- a/man-roxygen/details_tmax.R +++ b/man-roxygen/details_tmax.R @@ -2,8 +2,12 @@ #' #' If `t_max` or `p_max` is given, then the predicted survival function \eqn{S(t)} is #' filtered up to the time cutoff for all observations. -#' Also, when `proper = TRUE`, \eqn{G(t)} will be filtered up to the cutoff time as well. -#' This helps alleviate inflation of the score in cases where an observation is +#' Also, **observations with observed times** \eqn{t > t_{max}} **are removed**. +#' This is a data processing step to alleviate the problems that arise when using IPCW +#' in cases of administrative censoring, see Kvamme et al. (2023). +#' It also helps alleviate **inflation of the score** in cases where an observation is #' censored at the last observed time point and no time cutoff is given, which results in -#' \eqn{G(t_{max}) = 0} and the use of `eps` instead. +#' \eqn{G(t) = 0} and the use of `eps` instead. +#' The proper version of this score is more affected by this issue, see Sonabend +#' et al. (2024) for more details. 
#' diff --git a/man-roxygen/param_tmax.R b/man-roxygen/param_tmax.R index 7f8761de7..c4a1c091c 100644 --- a/man-roxygen/param_tmax.R +++ b/man-roxygen/param_tmax.R @@ -2,7 +2,7 @@ #' - `t_max` (`numeric(1)`)\cr #' Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. #' Mutually exclusive with `p_max` or `times`. -#' This will effectively remove test observations for which the observed time -#' (event or censoring) is strictly more than `t_max` in the case of `proper = TRUE`. +#' This will effectively **remove test observations** for which the observed time +#' (event or censoring) is strictly more than `t_max`. #' It's recommended to set `t_max` to avoid division by `eps`, see "Time Cutoff Details" section. #' If `t_max` is not specified, an `Inf` time horizon is assumed. diff --git a/man-roxygen/properness.R b/man-roxygen/properness.R index d4b149149..b0c956818 100644 --- a/man-roxygen/properness.R +++ b/man-roxygen/properness.R @@ -9,5 +9,5 @@ #' observed time due to division by \eqn{1/eps} in `proper = TRUE`. #' #' See Sonabend et al. (2024) for more details. -#' The use of `proper = TRUE` should be used with caution and it still considered as an experimental metric. +#' The use of `proper = TRUE` is considered experimental and should be used with caution. #' diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index af2ba6f01..658adda48 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -23,7 +23,7 @@ where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. 
The \strong{re-weighted ISBS} (RISBS) is: -\deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} +\deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -87,8 +87,8 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \item \code{t_max} (\code{numeric(1)})\cr Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. Mutually exclusive with \code{p_max} or \code{times}. -This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max} in the case of \code{proper = TRUE}. +This will effectively \strong{remove test observations} for which the observed time +(event or censoring) is strictly more than \code{t_max}. It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -161,7 +161,7 @@ Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. See Sonabend et al. (2024) for more details. -The use of \code{proper = TRUE} should be used with caution and it still considered as an experimental metric. +The use of \code{proper = TRUE} is considered experimental and should be used with caution. } \section{Time points used for evaluation}{ @@ -215,10 +215,14 @@ The training data is automatically used in scoring resamplings. If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is filtered up to the time cutoff for all observations. 
-Also, when \code{proper = TRUE}, \eqn{G(t)} will be filtered up to the cutoff time as well. -This helps alleviate inflation of the score in cases where an observation is +Also, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This is a data processing step to alleviate the problems that arise when using IPCW +in cases of administrative censoring, see Kvamme et al. (2023). +It also helps alleviate \strong{inflation of the score} in cases where an observation is censored at the last observed time point and no time cutoff is given, which results in -\eqn{G(t_{max}) = 0} and the use of \code{eps} instead. +\eqn{G(t) = 0} and the use of \code{eps} instead. +The proper version of this score is more affected by this issue, see Sonabend +et al. (2024) for more details. } \references{ @@ -230,6 +234,11 @@ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). \dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} \url{https://arxiv.org/abs/2212.05260v2}. + +Kvamme, Håvard, Borgan, Ørnulf (2023). +\dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} +\emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. +ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. } \seealso{ Other survival measures: diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index 36fcb8b56..d355521b0 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -21,7 +21,7 @@ where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. 
The \strong{re-weighted ISLL} (RISLL) is: -\deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} +\deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -85,8 +85,8 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \item \code{t_max} (\code{numeric(1)})\cr Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. Mutually exclusive with \code{p_max} or \code{times}. -This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max} in the case of \code{proper = TRUE}. +This will effectively \strong{remove test observations} for which the observed time +(event or censoring) is strictly more than \code{t_max}. It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -159,7 +159,7 @@ Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. See Sonabend et al. (2024) for more details. -The use of \code{proper = TRUE} should be used with caution and it still considered as an experimental metric. +The use of \code{proper = TRUE} is considered experimental and should be used with caution. } \section{Time points used for evaluation}{ @@ -213,10 +213,14 @@ The training data is automatically used in scoring resamplings. If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is filtered up to the time cutoff for all observations. 
-Also, when \code{proper = TRUE}, \eqn{G(t)} will be filtered up to the cutoff time as well. -This helps alleviate inflation of the score in cases where an observation is +Also, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This is a data processing step to alleviate the problems that arise when using IPCW +in cases of administrative censoring, see Kvamme et al. (2023). +It also helps alleviate \strong{inflation of the score} in cases where an observation is censored at the last observed time point and no time cutoff is given, which results in -\eqn{G(t_{max}) = 0} and the use of \code{eps} instead. +\eqn{G(t) = 0} and the use of \code{eps} instead. +The proper version of this score is more affected by this issue, see Sonabend +et al. (2024) for more details. } \references{ @@ -228,6 +232,11 @@ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). \dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} \url{https://arxiv.org/abs/2212.05260v2}. + +Kvamme, Håvard, Borgan, Ørnulf (2023). +\dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} +\emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. +ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. } \seealso{ Other survival measures: diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index ff64b953b..4785d5605 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -20,7 +20,7 @@ where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. 
The \strong{re-weighted ISS} (RISS) is: -\deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} +\deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -84,8 +84,8 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \item \code{t_max} (\code{numeric(1)})\cr Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. Mutually exclusive with \code{p_max} or \code{times}. -This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max} in the case of \code{proper = TRUE}. +This will effectively \strong{remove test observations} for which the observed time +(event or censoring) is strictly more than \code{t_max}. It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -158,7 +158,7 @@ Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. See Sonabend et al. (2024) for more details. -The use of \code{proper = TRUE} should be used with caution and it still considered as an experimental metric. +The use of \code{proper = TRUE} is considered experimental and should be used with caution. } \section{Time points used for evaluation}{ @@ -212,10 +212,14 @@ The training data is automatically used in scoring resamplings. If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is filtered up to the time cutoff for all observations. 
-Also, when \code{proper = TRUE}, \eqn{G(t)} will be filtered up to the cutoff time as well. -This helps alleviate inflation of the score in cases where an observation is +Also, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This is a data processing step to alleviate the problems that arise when using IPCW +in cases of administrative censoring, see Kvamme et al. (2023). +It also helps alleviate \strong{inflation of the score} in cases where an observation is censored at the last observed time point and no time cutoff is given, which results in -\eqn{G(t_{max}) = 0} and the use of \code{eps} instead. +\eqn{G(t) = 0} and the use of \code{eps} instead. +The proper version of this score is more affected by this issue, see Sonabend +et al. (2024) for more details. } \references{ @@ -232,6 +236,11 @@ Schmid, Matthias, Hielscher, Thomas, Augustin, Thomas, Gefeller, Olaf (2011). Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). \dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} \url{https://arxiv.org/abs/2212.05260v2}. + +Kvamme, Håvard, Borgan, Ørnulf (2023). +\dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} +\emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. +ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. 
} \seealso{ Other survival measures: From d76d0929e53263448b209bb1bc8e8e2a53d9a147 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 12:08:04 +0100 Subject: [PATCH 14/24] revert test --- tests/testthat/test_mlr_measures.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test_mlr_measures.R b/tests/testthat/test_mlr_measures.R index e04abf5eb..c40630bf5 100644 --- a/tests/testthat/test_mlr_measures.R +++ b/tests/testthat/test_mlr_measures.R @@ -170,8 +170,8 @@ test_that("graf: t_max, p_max, times", { # different time points considered expect_true(m0 != m1) - # same time points are used (`t_max` does NOT remove observations) - expect_equal(m1, m2) + # same time points are used, but `t_max` also removes observations + expect_true(m1 != m2) # different `t_max` => different time points used expect_true(m2 != m3) # different `t_max` but after the max evaluation time point, so result stays the same From 31928405c5d79256568b8aebe81b919e99a71425 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 12:13:51 +0100 Subject: [PATCH 15/24] update news --- NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 8cb3f40e6..5370a0a74 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,8 @@ * Removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) * Bug fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) -* Bug fix: `msr("surv.ibrier", proper = FALSE)` was removing observations when `t_max` or `p_max` was used, which resulted in a bit lower scores. Updated scoring rules formulas and docs accordingly. +* fix: G(t) is not filtered when `t_max|p_max` is specified in scoring rules (didn't influence evalution at all) +* docs: Clarified the use and impact of using `t_max` in scoring rules (removing observations as a processing step to alleviate IPCW issues). 
# mlr3proba 0.7.0 From c60f7c73f3199ca7375c4b80bf87fab2d2421dc0 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 13 Dec 2024 12:26:08 +0100 Subject: [PATCH 16/24] update norsk names --- R/bibentries.R | 2 +- man/mlr_measures_surv.graf.Rd | 2 +- man/mlr_measures_surv.intlogloss.Rd | 2 +- man/mlr_measures_surv.schmid.Rd | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/bibentries.R b/R/bibentries.R index 1d670cffa..f747e6ce1 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -752,7 +752,7 @@ bibentries = c( # nolint start year = "2024" ), kvamme2023 = bibentry("article", - author = "Kvamme, Håvard and Borgan, Ørnulf", + author = "Kvamme, Havard and Borgan, Ornulf", issn = "1533-7928", journal = "Journal of Machine Learning Research", number = "2", diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index 658adda48..fc1dce752 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -235,7 +235,7 @@ Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas \dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} \url{https://arxiv.org/abs/2212.05260v2}. -Kvamme, Håvard, Borgan, Ørnulf (2023). +Kvamme, Havard, Borgan, Ornulf (2023). \dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} \emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index d355521b0..ca46a278f 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -233,7 +233,7 @@ Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas \dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} \url{https://arxiv.org/abs/2212.05260v2}. 
-Kvamme, Håvard, Borgan, Ørnulf (2023). +Kvamme, Havard, Borgan, Ornulf (2023). \dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} \emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index 4785d5605..92edb33ec 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -237,7 +237,7 @@ Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas \dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} \url{https://arxiv.org/abs/2212.05260v2}. -Kvamme, Håvard, Borgan, Ørnulf (2023). +Kvamme, Havard, Borgan, Ornulf (2023). \dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} \emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. 
From f1afdcb79fe46c44135bd7895e70368370e2041d Mon Sep 17 00:00:00 2001 From: john Date: Tue, 17 Dec 2024 20:21:15 +0100 Subject: [PATCH 17/24] add `remove_obs` parameter for scoring rules --- R/MeasureSurvGraf.R | 8 +++++--- R/MeasureSurvIntLogloss.R | 8 +++++--- R/MeasureSurvSchmid.R | 8 +++++--- R/integrated_scores.R | 7 ++++--- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/R/MeasureSurvGraf.R b/R/MeasureSurvGraf.R index d44d72747..3bf36a881 100644 --- a/R/MeasureSurvGraf.R +++ b/R/MeasureSurvGraf.R @@ -11,6 +11,7 @@ #' @templateVar eps 1e-3 #' @template param_eps #' @template param_erv +#' @template param_remove_obs #' #' @aliases MeasureSurvBrier mlr_measures_surv.brier #' @@ -73,11 +74,12 @@ MeasureSurvGraf = R6Class("MeasureSurvGraf", se = p_lgl(default = FALSE), proper = p_lgl(default = FALSE), eps = p_dbl(0, 1, default = 1e-3), - ERV = p_lgl(default = FALSE) + ERV = p_lgl(default = FALSE), + remove_obs = p_lgl(default = FALSE) ) ps$set_values( integrated = TRUE, method = 2L, se = FALSE, - proper = FALSE, eps = 1e-3, ERV = ERV + proper = FALSE, eps = 1e-3, ERV = ERV, remove_obs = FALSE ) range = if (ERV) c(-Inf, 1) else c(0, Inf) @@ -132,7 +134,7 @@ MeasureSurvGraf = R6Class("MeasureSurvGraf", truth = prediction$truth, distribution = prediction$data$distr, times = times, t_max = ps$t_max, p_max = ps$p_max, proper = ps$proper, train = train, - eps = ps$eps + eps = ps$eps, remove_obs = ps$remove_obs ) if (ps$se) { diff --git a/R/MeasureSurvIntLogloss.R b/R/MeasureSurvIntLogloss.R index 66d74afd5..7053e770b 100644 --- a/R/MeasureSurvIntLogloss.R +++ b/R/MeasureSurvIntLogloss.R @@ -11,6 +11,7 @@ #' @templateVar eps 1e-3 #' @template param_eps #' @template param_erv +#' @template param_remove_obs #' #' @description #' Calculates the **Integrated Survival Log-Likelihood** (ISLL) or Integrated @@ -71,11 +72,12 @@ MeasureSurvIntLogloss = R6Class("MeasureSurvIntLogloss", se = p_lgl(default = FALSE), proper = p_lgl(default = FALSE), eps = p_dbl(0, 1, 
default = 1e-3), - ERV = p_lgl(default = FALSE) + ERV = p_lgl(default = FALSE), + remove_obs = p_lgl(default = FALSE) ) ps$set_values( integrated = TRUE, method = 2L, se = FALSE, - proper = FALSE, eps = 1e-3, ERV = ERV + proper = FALSE, eps = 1e-3, ERV = ERV, remove_obs = FALSE ) range = if (ERV) c(-Inf, 1) else c(0, Inf) @@ -130,7 +132,7 @@ MeasureSurvIntLogloss = R6Class("MeasureSurvIntLogloss", truth = prediction$truth, distribution = prediction$data$distr, times = times, t_max = ps$t_max, p_max = ps$p_max, proper = ps$proper, train = train, - eps = ps$eps + eps = ps$eps, remove_obs = ps$remove_obs ) if (ps$se) { diff --git a/R/MeasureSurvSchmid.R b/R/MeasureSurvSchmid.R index 119e3fef9..0b75e37a4 100644 --- a/R/MeasureSurvSchmid.R +++ b/R/MeasureSurvSchmid.R @@ -11,6 +11,7 @@ #' @templateVar eps 1e-3 #' @template param_eps #' @template param_erv +#' @template param_remove_obs #' #' @description #' Calculates the **Integrated Schmid Score** (ISS), aka integrated absolute loss. @@ -70,11 +71,12 @@ MeasureSurvSchmid = R6Class("MeasureSurvSchmid", se = p_lgl(default = FALSE), proper = p_lgl(default = FALSE), eps = p_dbl(0, 1, default = 1e-3), - ERV = p_lgl(default = FALSE) + ERV = p_lgl(default = FALSE), + remove_obs = p_lgl(default = FALSE) ) ps$set_values( integrated = TRUE, method = 2L, se = FALSE, - proper = FALSE, eps = 1e-3, ERV = ERV + proper = FALSE, eps = 1e-3, ERV = ERV, remove_obs = FALSE ) range = if (ERV) c(-Inf, 1) else c(0, Inf) @@ -128,7 +130,7 @@ MeasureSurvSchmid = R6Class("MeasureSurvSchmid", truth = prediction$truth, distribution = prediction$data$distr, times = times, t_max = ps$t_max, p_max = ps$p_max, proper = ps$proper, train = train, - eps = ps$eps + eps = ps$eps, remove_obs = ps$remove_obs ) if (ps$se) { diff --git a/R/integrated_scores.R b/R/integrated_scores.R index 92b861ab1..2d7768e32 100644 --- a/R/integrated_scores.R +++ b/R/integrated_scores.R @@ -14,7 +14,7 @@ score_graf_schmid = function(true_times, unique_times, cdf, power = 2) { 
# - `t_max` > 0 # - `p_max` in [0,1] weighted_survival_score = function(loss, truth, distribution, times = NULL, - t_max = NULL, p_max = NULL, proper, train = NULL, eps, ...) { + t_max = NULL, p_max = NULL, proper, train = NULL, eps, remove_obs = FALSE) { assert_surv(truth) # test set's (times, status) test_times = truth[, "time"] @@ -90,8 +90,8 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, rownames(cdf) = unique_times # times x obs } - # apply `t_max` cutoff to remove observations - if (tmax_apply) { + # apply `t_max` cutoff to remove observations as a preprocessing step to alleviate inflation + if (tmax_apply && remove_obs) { true_times = test_times[test_times <= t_max] true_status = test_status[test_times <= t_max] cdf = cdf[, test_times <= t_max, drop = FALSE] @@ -118,6 +118,7 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, # use the `truth` (time, status) information from the train or test set if (is.null(train)) { + # no filtering of observations from test data: use ALL cens = survival::survfit(Surv(test_times, 1 - test_status) ~ 1) } else { # no filtering of observations from train data: use ALL From 1bfb836df9ae7fc456d343c0ccb3475ae6f7be11 Mon Sep 17 00:00:00 2001 From: john Date: Tue, 17 Dec 2024 20:50:02 +0100 Subject: [PATCH 18/24] update docs --- man-roxygen/details_tmax.R | 25 ++++++++++++------ man-roxygen/param_remove_obs.R | 6 +++++ man-roxygen/param_tmax.R | 6 ++--- man/mlr_measures_surv.graf.Rd | 39 +++++++++++++++++++++-------- man/mlr_measures_surv.intlogloss.Rd | 39 +++++++++++++++++++++-------- man/mlr_measures_surv.schmid.Rd | 39 +++++++++++++++++++++-------- 6 files changed, 110 insertions(+), 44 deletions(-) create mode 100644 man-roxygen/param_remove_obs.R diff --git a/man-roxygen/details_tmax.R b/man-roxygen/details_tmax.R index 12504bc80..0086500a8 100644 --- a/man-roxygen/details_tmax.R +++ b/man-roxygen/details_tmax.R @@ -1,13 +1,22 @@ #' @section Time Cutoff Details: +#' 
`r lifecycle::badge("experimental")` #' #' If `t_max` or `p_max` is given, then the predicted survival function \eqn{S(t)} is -#' filtered up to the time cutoff for all observations. -#' Also, **observations with observed times** \eqn{t > t_{max}} **are removed**. -#' This is a data processing step to alleviate the problems that arise when using IPCW +#' truncated at the time cutoff for all observations. +#' +#' Also, if `remove_obs = TRUE`, **observations with observed times** \eqn{t > t_{max}} **are removed**. +#' This data preprocessing step mitigates issues that arise when using IPCW #' in cases of administrative censoring, see Kvamme et al. (2023). -#' It also helps alleviate **inflation of the score** in cases where an observation is -#' censored at the last observed time point and no time cutoff is given, which results in -#' \eqn{G(t) = 0} and the use of `eps` instead. -#' The proper version of this score is more affected by this issue, see Sonabend -#' et al. (2024) for more details. +#' Practically, this step, along with setting a time cutoff `t_max`, helps mitigate +#' the **inflation of the score** observed when an observation is censored at the +#' final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +#' small constant `eps` instead. +#' This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +#' for more details. +#' Note that the `t_max` and `remove_obs` parameters do not affect the estimation +#' of the censoring distribution, i.e. **always all the observations are used for estimating** \eqn{G(t)}. +#' +#' If `remove_obs = FALSE`, inflated scores may occur. While this aligns more closely +#' with the definitions presented in the original papers, it can lead to misleading +#' evaluation and poor optimization outcomes when using this score for model tuning. 
#' diff --git a/man-roxygen/param_remove_obs.R b/man-roxygen/param_remove_obs.R new file mode 100644 index 000000000..bd7c7d5ca --- /dev/null +++ b/man-roxygen/param_remove_obs.R @@ -0,0 +1,6 @@ +#' @section Parameter details: +#' - `remove_obs` (`logical(1)`)\cr +#' Only effective when `t_max` or `p_max` is provided. Default is `FALSE`. +#' If `TRUE`, then we **remove test observations** for which the observed time (event or censoring) is strictly larger than `t_max`. +#' See "Time Cutoff Details" section for more details. +#' diff --git a/man-roxygen/param_tmax.R b/man-roxygen/param_tmax.R index c4a1c091c..a706e2442 100644 --- a/man-roxygen/param_tmax.R +++ b/man-roxygen/param_tmax.R @@ -1,8 +1,8 @@ #' @section Parameter details: #' - `t_max` (`numeric(1)`)\cr -#' Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. +#' Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +#' (truncate \eqn{S(t)}). #' Mutually exclusive with `p_max` or `times`. -#' This will effectively **remove test observations** for which the observed time -#' (event or censoring) is strictly more than `t_max`. #' It's recommended to set `t_max` to avoid division by `eps`, see "Time Cutoff Details" section. #' If `t_max` is not specified, an `Inf` time horizon is assumed. +#' diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index fc1dce752..7f070941f 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -54,6 +54,7 @@ msr("surv.graf") proper \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr eps \tab numeric \tab 0.001 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr ERV \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr + remove_obs \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr } } @@ -85,10 +86,9 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \itemize{ \item \code{t_max} (\code{numeric(1)})\cr -Cutoff time \eqn{\tau^*} (i.e. 
time horizon) to evaluate the measure up to. +Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +(truncate \eqn{S(t)}). Mutually exclusive with \code{p_max} or \code{times}. -This will effectively \strong{remove test observations} for which the observed time -(event or censoring) is strictly more than \code{t_max}. It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -147,6 +147,14 @@ If \code{TRUE} then the Explained Residual Variation method is applied, which means the score is standardized against a Kaplan-Meier baseline. Default is \code{FALSE}. } + + +\itemize{ +\item \code{remove_obs} (\code{logical(1)})\cr +Only effective when \code{t_max} or \code{p_max} is provided. Default is \code{FALSE}. +If \code{TRUE}, then we \strong{remove test observations} for which the observed time (event or censoring) is strictly larger than \code{t_max}. +See "Time Cutoff Details" section for more details. +} } \section{Properness}{ @@ -212,17 +220,26 @@ The training data is automatically used in scoring resamplings. \section{Time Cutoff Details}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is -filtered up to the time cutoff for all observations. -Also, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. -This is a data processing step to alleviate the problems that arise when using IPCW +truncated at the time cutoff for all observations. + +Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. 
+This data preprocessing step mitigates issues that arise when using IPCW in cases of administrative censoring, see Kvamme et al. (2023). -It also helps alleviate \strong{inflation of the score} in cases where an observation is -censored at the last observed time point and no time cutoff is given, which results in -\eqn{G(t) = 0} and the use of \code{eps} instead. -The proper version of this score is more affected by this issue, see Sonabend -et al. (2024) for more details. +Practically, this step, along with setting a time cutoff \code{t_max}, helps mitigate +the \strong{inflation of the score} observed when an observation is censored at the +final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +small constant \code{eps} instead. +This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +for more details. +Note that the \code{t_max} and \code{remove_obs} parameters do not affect the estimation +of the censoring distribution, i.e. \strong{always all the observations are used for estimating} \eqn{G(t)}. + +If \code{remove_obs = FALSE}, inflated scores may occur. While this aligns more closely +with the definitions presented in the original papers, it can lead to misleading +evaluation and poor optimization outcomes when using this score for model tuning. 
} \references{ diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index ca46a278f..425d32869 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -52,6 +52,7 @@ msr("surv.intlogloss") proper \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr eps \tab numeric \tab 0.001 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr ERV \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr + remove_obs \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr } } @@ -83,10 +84,9 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \itemize{ \item \code{t_max} (\code{numeric(1)})\cr -Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. +Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +(truncate \eqn{S(t)}). Mutually exclusive with \code{p_max} or \code{times}. -This will effectively \strong{remove test observations} for which the observed time -(event or censoring) is strictly more than \code{t_max}. It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -145,6 +145,14 @@ If \code{TRUE} then the Explained Residual Variation method is applied, which means the score is standardized against a Kaplan-Meier baseline. Default is \code{FALSE}. } + + +\itemize{ +\item \code{remove_obs} (\code{logical(1)})\cr +Only effective when \code{t_max} or \code{p_max} is provided. Default is \code{FALSE}. +If \code{TRUE}, then we \strong{remove test observations} for which the observed time (event or censoring) is strictly larger than \code{t_max}. +See "Time Cutoff Details" section for more details. +} } \section{Properness}{ @@ -210,17 +218,26 @@ The training data is automatically used in scoring resamplings. 
\section{Time Cutoff Details}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is -filtered up to the time cutoff for all observations. -Also, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. -This is a data processing step to alleviate the problems that arise when using IPCW +truncated at the time cutoff for all observations. + +Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This data preprocessing step mitigates issues that arise when using IPCW in cases of administrative censoring, see Kvamme et al. (2023). -It also helps alleviate \strong{inflation of the score} in cases where an observation is -censored at the last observed time point and no time cutoff is given, which results in -\eqn{G(t) = 0} and the use of \code{eps} instead. -The proper version of this score is more affected by this issue, see Sonabend -et al. (2024) for more details. +Practically, this step, along with setting a time cutoff \code{t_max}, helps mitigate +the \strong{inflation of the score} observed when an observation is censored at the +final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +small constant \code{eps} instead. +This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +for more details. +Note that the \code{t_max} and \code{remove_obs} parameters do not affect the estimation +of the censoring distribution, i.e. \strong{always all the observations are used for estimating} \eqn{G(t)}. + +If \code{remove_obs = FALSE}, inflated scores may occur. 
While this aligns more closely +with the definitions presented in the original papers, it can lead to misleading +evaluation and poor optimization outcomes when using this score for model tuning. } \references{ diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index 92edb33ec..ca25fe310 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -51,6 +51,7 @@ msr("surv.schmid") proper \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr eps \tab numeric \tab 0.001 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr ERV \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr + remove_obs \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr } } @@ -82,10 +83,9 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \itemize{ \item \code{t_max} (\code{numeric(1)})\cr -Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. +Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +(truncate \eqn{S(t)}). Mutually exclusive with \code{p_max} or \code{times}. -This will effectively \strong{remove test observations} for which the observed time -(event or censoring) is strictly more than \code{t_max}. It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -144,6 +144,14 @@ If \code{TRUE} then the Explained Residual Variation method is applied, which means the score is standardized against a Kaplan-Meier baseline. Default is \code{FALSE}. } + + +\itemize{ +\item \code{remove_obs} (\code{logical(1)})\cr +Only effective when \code{t_max} or \code{p_max} is provided. Default is \code{FALSE}. +If \code{TRUE}, then we \strong{remove test observations} for which the observed time (event or censoring) is strictly larger than \code{t_max}. +See "Time Cutoff Details" section for more details. 
+} } \section{Properness}{ @@ -209,17 +217,26 @@ The training data is automatically used in scoring resamplings. \section{Time Cutoff Details}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is -filtered up to the time cutoff for all observations. -Also, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. -This is a data processing step to alleviate the problems that arise when using IPCW +truncated at the time cutoff for all observations. + +Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This data preprocessing step mitigates issues that arise when using IPCW in cases of administrative censoring, see Kvamme et al. (2023). -It also helps alleviate \strong{inflation of the score} in cases where an observation is -censored at the last observed time point and no time cutoff is given, which results in -\eqn{G(t) = 0} and the use of \code{eps} instead. -The proper version of this score is more affected by this issue, see Sonabend -et al. (2024) for more details. +Practically, this step, along with setting a time cutoff \code{t_max}, helps mitigate +the \strong{inflation of the score} observed when an observation is censored at the +final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +small constant \code{eps} instead. +This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +for more details. +Note that the \code{t_max} and \code{remove_obs} parameters do not affect the estimation +of the censoring distribution, i.e. \strong{always all the observations are used for estimating} \eqn{G(t)}. + +If \code{remove_obs = FALSE}, inflated scores may occur. 
While this aligns more closely +with the definitions presented in the original papers, it can lead to misleading +evaluation and poor optimization outcomes when using this score for model tuning. } \references{ From f290c968d86d8a78f06e764d3ec53a9201e696ac Mon Sep 17 00:00:00 2001 From: john Date: Tue, 17 Dec 2024 20:50:25 +0100 Subject: [PATCH 19/24] update tests --- tests/testthat/test_mlr_measures.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test_mlr_measures.R b/tests/testthat/test_mlr_measures.R index c40630bf5..486e3fd3a 100644 --- a/tests/testthat/test_mlr_measures.R +++ b/tests/testthat/test_mlr_measures.R @@ -163,15 +163,18 @@ test_that("graf: t_max, p_max, times", { t_max = 100 times_flt = times[times <= t_max] # keep only times until the `t_max` m0 = p$score(msr("surv.graf")) # uses all test time points - m1 = p$score(msr("surv.graf", times = times_flt)) # uses times_flt + m1 = p$score(msr("surv.graf", times = times_flt)) # uses `times_flt` m2 = p$score(msr("surv.graf", t_max = t_max)) # 100 + m22 = p$score(msr("surv.graf", t_max = t_max, remove_obs = TRUE)) # 100 m3 = p$score(msr("surv.graf", t_max = max(times))) # 104 - m4 = p$score(msr("surv.graf", t_max = max(times) + 1)) # 105 + m4 = p$score(msr("surv.graf", t_max = max(times) + 10)) # 105 # different time points considered expect_true(m0 != m1) - # same time points are used, but `t_max` also removes observations - expect_true(m1 != m2) + # same time points are used, and no removal of observations (original Graf score) + expect_equal(m1, m2) + # same time points are used, but observations with `t > t_max` are removed + expect_true(m2 != m22) # different `t_max` => different time points used expect_true(m2 != m3) # different `t_max` but after the max evaluation time point, so result stays the same From 37d11a70fdce66c455a546df51db9e9cc5dfcdbe Mon Sep 17 00:00:00 2001 From: john Date: Tue, 17 Dec 2024 20:51:52 +0100 Subject: [PATCH 20/24] update news 
--- NEWS.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5370a0a74..cf916ff88 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,11 @@ # mlr3proba 0.7.1 -* Removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) -* Bug fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) -* fix: G(t) is not filtered when `t_max|p_max` is specified in scoring rules (didn't influence evalution at all) -* docs: Clarified the use and impact of using `t_max` in scoring rules (removing observations as a processing step to alleviate IPCW issues). +* cleanup: removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) +* fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) +* fix: G(t) is not filtered when `t_max|p_max` is specified in scoring rules (didn't influence evaluation at all) +* docs: Clarified the use and impact of using `t_max` in scoring rules +* feat: Added new argument `remove_obs` in scoring rules to remove observations with observed time `t > t_max` as a processing step to alleviate IPCW issues. +This was before 'hard-coded' which made the Integrated Brier Score (`msr("surv.graf")`) differ minimally from other implementations and the original definition. 
# mlr3proba 0.7.0 From 1ddba16ad14895d104fde7483f6d5e0f0fc0b113 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 18 Dec 2024 18:31:04 +0100 Subject: [PATCH 21/24] change experimental badge position --- man-roxygen/details_tmax.R | 3 ++- man/mlr_measures_surv.graf.Rd | 3 ++- man/mlr_measures_surv.intlogloss.Rd | 3 ++- man/mlr_measures_surv.schmid.Rd | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/man-roxygen/details_tmax.R b/man-roxygen/details_tmax.R index 0086500a8..20425fc23 100644 --- a/man-roxygen/details_tmax.R +++ b/man-roxygen/details_tmax.R @@ -1,9 +1,10 @@ #' @section Time Cutoff Details: -#' `r lifecycle::badge("experimental")` #' #' If `t_max` or `p_max` is given, then the predicted survival function \eqn{S(t)} is #' truncated at the time cutoff for all observations. #' +#' `r lifecycle::badge("experimental")` +#' #' Also, if `remove_obs = TRUE`, **observations with observed times** \eqn{t > t_{max}} **are removed**. #' This data preprocessing step mitigates issues that arise when using IPCW #' in cases of administrative censoring, see Kvamme et al. (2023). diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index 7f070941f..d2e35b91f 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -220,11 +220,12 @@ The training data is automatically used in scoring resamplings. \section{Time Cutoff Details}{ -\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is truncated at the time cutoff for all observations. 
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. This data preprocessing step mitigates issues that arise when using IPCW in cases of administrative censoring, see Kvamme et al. (2023). diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index 425d32869..45c3791b8 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -218,11 +218,12 @@ The training data is automatically used in scoring resamplings. \section{Time Cutoff Details}{ -\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is truncated at the time cutoff for all observations. +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. This data preprocessing step mitigates issues that arise when using IPCW in cases of administrative censoring, see Kvamme et al. (2023). diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index ca25fe310..6adceed86 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -217,11 +217,12 @@ The training data is automatically used in scoring resamplings. 
\section{Time Cutoff Details}{ -\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is truncated at the time cutoff for all observations. +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. This data preprocessing step mitigates issues that arise when using IPCW in cases of administrative censoring, see Kvamme et al. (2023). From 20aa3601b3426d06e5f1bcd89c99a4ac9a142891 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 18 Dec 2024 19:26:21 +0100 Subject: [PATCH 22/24] add examples for scoring rules --- R/MeasureSurvCindex.R | 1 + R/MeasureSurvGraf.R | 1 + R/MeasureSurvIntLogloss.R | 1 + R/MeasureSurvSchmid.R | 1 + man-roxygen/example_scoring_rules.R | 45 +++++++++++++++++++++++++++++ man/mlr_measures_surv.cindex.Rd | 1 + man/mlr_measures_surv.graf.Rd | 44 ++++++++++++++++++++++++++++ man/mlr_measures_surv.intlogloss.Rd | 44 ++++++++++++++++++++++++++++ man/mlr_measures_surv.schmid.Rd | 44 ++++++++++++++++++++++++++++ 9 files changed, 182 insertions(+) create mode 100644 man-roxygen/example_scoring_rules.R diff --git a/R/MeasureSurvCindex.R b/R/MeasureSurvCindex.R index 65cd8134c..2fbe08eb6 100644 --- a/R/MeasureSurvCindex.R +++ b/R/MeasureSurvCindex.R @@ -69,6 +69,7 @@ #' #' # Harrell's C-index evaluated up to a specific time horizon #' p$score(msr("surv.cindex", t_max = 97)) +#' #' # Harrell's C-index evaluated up to the time corresponding to 30% of censoring #' p$score(msr("surv.cindex", p_max = 0.3)) #' diff --git a/R/MeasureSurvGraf.R b/R/MeasureSurvGraf.R index 3bf36a881..da5b577f7 100644 --- 
a/R/MeasureSurvGraf.R +++ b/R/MeasureSurvGraf.R @@ -53,6 +53,7 @@ #' #' @family Probabilistic survival measures #' @family distr survival measures +#' @template example_scoring_rules #' @export MeasureSurvGraf = R6Class("MeasureSurvGraf", inherit = MeasureSurv, diff --git a/R/MeasureSurvIntLogloss.R b/R/MeasureSurvIntLogloss.R index 7053e770b..dc5e9492b 100644 --- a/R/MeasureSurvIntLogloss.R +++ b/R/MeasureSurvIntLogloss.R @@ -51,6 +51,7 @@ #' #' @family Probabilistic survival measures #' @family distr survival measures +#' @template example_scoring_rules #' @export MeasureSurvIntLogloss = R6Class("MeasureSurvIntLogloss", inherit = MeasureSurv, diff --git a/R/MeasureSurvSchmid.R b/R/MeasureSurvSchmid.R index 0b75e37a4..428fae41e 100644 --- a/R/MeasureSurvSchmid.R +++ b/R/MeasureSurvSchmid.R @@ -50,6 +50,7 @@ #' #' @family Probabilistic survival measures #' @family distr survival measures +#' @template example_scoring_rules #' @export MeasureSurvSchmid = R6Class("MeasureSurvSchmid", inherit = MeasureSurv, diff --git a/man-roxygen/example_scoring_rules.R b/man-roxygen/example_scoring_rules.R new file mode 100644 index 000000000..4701385b4 --- /dev/null +++ b/man-roxygen/example_scoring_rules.R @@ -0,0 +1,45 @@ +#' <% measure = suppressWarnings(get(fullname)$new()) %> +#' +#' @examples +#' library(mlr3) +#' +#' # Define a survival Task +#' task = tsk("lung") +#' +#' # Create train and test set +#' part = partition(task) +#' +#' # Train Cox learner on the train set +#' cox = lrn("surv.coxph") +#' cox$train(task, row_ids = part$train) +#' +#' # Make predictions for the test set +#' p = cox$predict(task, row_ids = part$test) +#' +#' # <%=improper_id%>, G(t) calculated using the test set +#' p$score(msr("<%=measure$id%>")) +#' +#' # <%=improper_id%>, G(t) calculated using the train set (always recommended) +#' p$score(msr("<%=measure$id%>"), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, ERV score (comparing with KM baseline) +#' 
p$score(msr("<%=measure$id%>", ERV = TRUE), task = task, train_set = part$train) +#' +#' # <%=improper_id%> at specific time point +#' p$score(msr("<%=measure$id%>", times = 365), task = task, train_set = part$train) +#' +#' # <%=improper_id%> at multiple time points (integrated) +#' p$score(msr("<%=measure$id%>", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, use time cutoff +#' p$score(msr("<%=measure$id%>", t_max = 700), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, use time cutoff and also remove observations +#' p$score(msr("<%=measure$id%>", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, use time cutoff corresponding to specific proportion of censoring on the test set +#' p$score(msr("<%=measure$id%>", p_max = 0.8), task = task, train_set = part$train) +#' +#' # <%=proper_id%>, G(t) calculated using the train set +#' p$score(msr("<%=measure$id%>", proper = TRUE), task = task, train_set = part$train) +#' diff --git a/man/mlr_measures_surv.cindex.Rd b/man/mlr_measures_surv.cindex.Rd index 5697b9764..9fba1e787 100644 --- a/man/mlr_measures_surv.cindex.Rd +++ b/man/mlr_measures_surv.cindex.Rd @@ -105,6 +105,7 @@ p$score(msr("surv.cindex", weight_meth = "G2"), # Harrell's C-index evaluated up to a specific time horizon p$score(msr("surv.cindex", t_max = 97)) + # Harrell's C-index evaluated up to the time corresponding to 30\% of censoring p$score(msr("surv.cindex", p_max = 0.3)) diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index d2e35b91f..37b26f853 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -243,6 +243,50 @@ with the definitions presented in the original papers, it can lead to misleading evaluation and poor optimization outcomes when using this score for model tuning. 
} +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# ISBS, G(t) calculated using the test set +p$score(msr("surv.graf")) + +# ISBS, G(t) calculated using the train set (always recommended) +p$score(msr("surv.graf"), task = task, train_set = part$train) + +# ISBS, ERV score (comparing with KM baseline) +p$score(msr("surv.graf", ERV = TRUE), task = task, train_set = part$train) + +# ISBS at specific time point +p$score(msr("surv.graf", times = 365), task = task, train_set = part$train) + +# ISBS at multiple time points (integrated) +p$score(msr("surv.graf", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) + +# ISBS, use time cutoff +p$score(msr("surv.graf", t_max = 700), task = task, train_set = part$train) + +# ISBS, use time cutoff and also remove observations +p$score(msr("surv.graf", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) + +# ISBS, use time cutoff corresponding to specific proportion of censoring on the test set +p$score(msr("surv.graf", p_max = 0.8), task = task, train_set = part$train) + +# RISBS, G(t) calculated using the train set +p$score(msr("surv.graf", proper = TRUE), task = task, train_set = part$train) + +} \references{ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). 
\dQuote{Assessment and comparison of prognostic classification schemes for survival data.} diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index 45c3791b8..9498d433c 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -241,6 +241,50 @@ with the definitions presented in the original papers, it can lead to misleading evaluation and poor optimization outcomes when using this score for model tuning. } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# ISLL, G(t) calculated using the test set +p$score(msr("surv.intlogloss")) + +# ISLL, G(t) calculated using the train set (always recommended) +p$score(msr("surv.intlogloss"), task = task, train_set = part$train) + +# ISLL, ERV score (comparing with KM baseline) +p$score(msr("surv.intlogloss", ERV = TRUE), task = task, train_set = part$train) + +# ISLL at specific time point +p$score(msr("surv.intlogloss", times = 365), task = task, train_set = part$train) + +# ISLL at multiple time points (integrated) +p$score(msr("surv.intlogloss", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) + +# ISLL, use time cutoff +p$score(msr("surv.intlogloss", t_max = 700), task = task, train_set = part$train) + +# ISLL, use time cutoff and also remove observations +p$score(msr("surv.intlogloss", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) + +# ISLL, use time cutoff corresponding to specific proportion of censoring on the test set +p$score(msr("surv.intlogloss", p_max = 0.8), task = task, train_set = part$train) + +# RISLL, G(t) calculated using the train set +p$score(msr("surv.intlogloss", proper = TRUE), task = task, train_set = 
part$train) + +} \references{ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). \dQuote{Assessment and comparison of prognostic classification schemes for survival data.} diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index 6adceed86..03723f90a 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -240,6 +240,50 @@ with the definitions presented in the original papers, it can lead to misleading evaluation and poor optimization outcomes when using this score for model tuning. } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# ISS, G(t) calculated using the test set +p$score(msr("surv.schmid")) + +# ISS, G(t) calculated using the train set (always recommended) +p$score(msr("surv.schmid"), task = task, train_set = part$train) + +# ISS, ERV score (comparing with KM baseline) +p$score(msr("surv.schmid", ERV = TRUE), task = task, train_set = part$train) + +# ISS at specific time point +p$score(msr("surv.schmid", times = 365), task = task, train_set = part$train) + +# ISS at multiple time points (integrated) +p$score(msr("surv.schmid", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) + +# ISS, use time cutoff +p$score(msr("surv.schmid", t_max = 700), task = task, train_set = part$train) + +# ISS, use time cutoff and also remove observations +p$score(msr("surv.schmid", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) + +# ISS, use time cutoff corresponding to specific proportion of censoring on the test set +p$score(msr("surv.schmid", p_max = 0.8), task = task, train_set = part$train) + +# RISS, G(t) calculated using the train set +p$score(msr("surv.schmid", proper = TRUE), task = 
task, train_set = part$train) + +} \references{ Schemper, Michael, Henderson, Robin (2000). \dQuote{Predictive Accuracy and Explained Variation in Cox Regression.} From 14cf2b282c4a18eb783632983b28747e851f5e40 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 18 Dec 2024 20:05:21 +0100 Subject: [PATCH 23/24] add examples for AUC measures --- R/MeasureSurvChamblessAUC.R | 1 + R/MeasureSurvHungAUC.R | 1 + R/MeasureSurvSongAUC.R | 1 + R/MeasureSurvUnoAUC.R | 1 + man-roxygen/example_auc_measures.R | 27 ++++++++++++++++++++++++++ man/mlr_measures_surv.chambless_auc.Rd | 26 +++++++++++++++++++++++++ man/mlr_measures_surv.hung_auc.Rd | 26 +++++++++++++++++++++++++ man/mlr_measures_surv.song_auc.Rd | 26 +++++++++++++++++++++++++ man/mlr_measures_surv.uno_auc.Rd | 26 +++++++++++++++++++++++++ 9 files changed, 135 insertions(+) create mode 100644 man-roxygen/example_auc_measures.R diff --git a/R/MeasureSurvChamblessAUC.R b/R/MeasureSurvChamblessAUC.R index 15071a89d..3d469ac62 100644 --- a/R/MeasureSurvChamblessAUC.R +++ b/R/MeasureSurvChamblessAUC.R @@ -15,6 +15,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvChamblessAUC = R6Class("MeasureSurvChamblessAUC", inherit = MeasureSurvAUC, diff --git a/R/MeasureSurvHungAUC.R b/R/MeasureSurvHungAUC.R index 91141cd9b..c8cf41ce1 100644 --- a/R/MeasureSurvHungAUC.R +++ b/R/MeasureSurvHungAUC.R @@ -15,6 +15,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvHungAUC = R6Class("MeasureSurvHungAUC", inherit = MeasureSurvAUC, diff --git a/R/MeasureSurvSongAUC.R b/R/MeasureSurvSongAUC.R index 401a18883..ee79ec939 100644 --- a/R/MeasureSurvSongAUC.R +++ b/R/MeasureSurvSongAUC.R @@ -16,6 +16,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvSongAUC = R6Class("MeasureSurvSongAUC", inherit = MeasureSurvAUC, 
diff --git a/R/MeasureSurvUnoAUC.R b/R/MeasureSurvUnoAUC.R index b1565944f..9c5c3ffe4 100644 --- a/R/MeasureSurvUnoAUC.R +++ b/R/MeasureSurvUnoAUC.R @@ -16,6 +16,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvUnoAUC = R6Class("MeasureSurvUnoAUC", inherit = MeasureSurvAUC, diff --git a/man-roxygen/example_auc_measures.R b/man-roxygen/example_auc_measures.R new file mode 100644 index 000000000..c229fa965 --- /dev/null +++ b/man-roxygen/example_auc_measures.R @@ -0,0 +1,27 @@ +#' <% measure = suppressWarnings(get(fullname)$new()) %> +#' +#' @examples +#' library(mlr3) +#' +#' # Define a survival Task +#' task = tsk("lung") +#' +#' # Create train and test set +#' part = partition(task) +#' +#' # Train Cox learner on the train set +#' cox = lrn("surv.coxph") +#' cox$train(task, row_ids = part$train) +#' +#' # Make predictions for the test set +#' p = cox$predict(task, row_ids = part$test) +#' +#' # Integrated AUC score +#' p$score(msr("<%=measure$id%>"), task = task, train_set = part$train, learner = cox) +#' +#' # AUC at specific time point +#' p$score(msr("<%=measure$id%>", times = 600), task = task, train_set = part$train, learner = cox) +#' +#' # Integrated AUC at specific time points +#' p$score(msr("<%=measure$id%>", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) +#' diff --git a/man/mlr_measures_surv.chambless_auc.Rd b/man/mlr_measures_surv.chambless_auc.Rd index 6b8476ace..88e68bca9 100644 --- a/man/mlr_measures_surv.chambless_auc.Rd +++ b/man/mlr_measures_surv.chambless_auc.Rd @@ -61,6 +61,32 @@ If \code{integrated == FALSE} then a single time point at which to return the sc } } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the 
test set +p = cox$predict(task, row_ids = part$test) + +# Integrated AUC score +p$score(msr("surv.chambless_auc"), task = task, train_set = part$train, learner = cox) + +# AUC at specific time point +p$score(msr("surv.chambless_auc", times = 600), task = task, train_set = part$train, learner = cox) + +# Integrated AUC at specific time points +p$score(msr("surv.chambless_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) + +} \references{ Chambless LE, Diao G (2006). \dQuote{Estimation of time-dependent area under the ROC curve for long-term risk prediction.} diff --git a/man/mlr_measures_surv.hung_auc.Rd b/man/mlr_measures_surv.hung_auc.Rd index e5207db4e..3f05b93ce 100644 --- a/man/mlr_measures_surv.hung_auc.Rd +++ b/man/mlr_measures_surv.hung_auc.Rd @@ -61,6 +61,32 @@ If \code{integrated == FALSE} then a single time point at which to return the sc } } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# Integrated AUC score +p$score(msr("surv.hung_auc"), task = task, train_set = part$train, learner = cox) + +# AUC at specific time point +p$score(msr("surv.hung_auc", times = 600), task = task, train_set = part$train, learner = cox) + +# Integrated AUC at specific time points +p$score(msr("surv.hung_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) + +} \references{ Hung H, Chiang C (2010). 
\dQuote{Estimation methods for time-dependent AUC models with survival data.} diff --git a/man/mlr_measures_surv.song_auc.Rd b/man/mlr_measures_surv.song_auc.Rd index f57df5545..deceab503 100644 --- a/man/mlr_measures_surv.song_auc.Rd +++ b/man/mlr_measures_surv.song_auc.Rd @@ -69,6 +69,32 @@ incident TPR, \code{cumulative} refers to cumulative TPR. } } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# Integrated AUC score +p$score(msr("surv.song_auc"), task = task, train_set = part$train, learner = cox) + +# AUC at specific time point +p$score(msr("surv.song_auc", times = 600), task = task, train_set = part$train, learner = cox) + +# Integrated AUC at specific time points +p$score(msr("surv.song_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) + +} \references{ Song, Xiao, Zhou, Xiao-Hua (2008). 
\dQuote{A semiparametric approach for the covariate specific ROC curve with survival outcome.} diff --git a/man/mlr_measures_surv.uno_auc.Rd b/man/mlr_measures_surv.uno_auc.Rd index 5536ad7ad..b4206ac29 100644 --- a/man/mlr_measures_surv.uno_auc.Rd +++ b/man/mlr_measures_surv.uno_auc.Rd @@ -61,6 +61,32 @@ If \code{integrated == FALSE} then a single time point at which to return the sc } } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# Integrated AUC score +p$score(msr("surv.uno_auc"), task = task, train_set = part$train, learner = cox) + +# AUC at specific time point +p$score(msr("surv.uno_auc", times = 600), task = task, train_set = part$train, learner = cox) + +# Integrated AUC at specific time points +p$score(msr("surv.uno_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) + +} \references{ Uno H, Cai T, Tian L, Wei LJ (2007). 
\dQuote{Evaluating Prediction Rules fort-Year Survivors With Censored Regression Models.} From e3766e80367e45ad922438620130713b7b317d3d Mon Sep 17 00:00:00 2001 From: john Date: Wed, 18 Dec 2024 20:07:50 +0100 Subject: [PATCH 24/24] update news --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index cf916ff88..375a4d6ac 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,7 +3,7 @@ * cleanup: removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) * fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) * fix: G(t) is not filtered when `t_max|p_max` is specified in scoring rules (didn't influence evaluation at all) -* docs: Clarified the use and impact of using `t_max` in scoring rules +* docs: Clarified the use and impact of using `t_max` in scoring rules, added examples in scoring rules and AUC scores * feat: Added new argument `remove_obs` in scoring rules to remove observations with observed time `t > t_max` as a processing step to alleviate IPCW issues. This was before 'hard-coded' which made the Integrated Brier Score (`msr("surv.graf")`) differ minimally from other implementations and the original definition.