diff --git a/NEWS.md b/NEWS.md index 6ba93133a..375a4d6ac 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,11 @@ # mlr3proba 0.7.1 -* Removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) -* Bug fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) +* cleanup: removed all `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414) +* fix: `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type) +* fix: G(t) is no longer filtered when `t_max|p_max` is specified in scoring rules (this filtering had no influence on evaluation) +* docs: clarified the use and impact of `t_max` in scoring rules; added examples for scoring rules and AUC scores +* feat: added new argument `remove_obs` in scoring rules to remove observations with observed time `t > t_max` as a preprocessing step to alleviate IPCW issues. +Previously this removal was hard-coded, which made the Integrated Brier Score (`msr("surv.graf")`) differ slightly from other implementations and the original definition. # mlr3proba 0.7.0 diff --git a/R/MeasureSurvChamblessAUC.R b/R/MeasureSurvChamblessAUC.R index 15071a89d..3d469ac62 100644 --- a/R/MeasureSurvChamblessAUC.R +++ b/R/MeasureSurvChamblessAUC.R @@ -15,6 +15,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvChamblessAUC = R6Class("MeasureSurvChamblessAUC", inherit = MeasureSurvAUC, diff --git a/R/MeasureSurvCindex.R b/R/MeasureSurvCindex.R index 65cd8134c..2fbe08eb6 100644 --- a/R/MeasureSurvCindex.R +++ b/R/MeasureSurvCindex.R @@ -69,6 +69,7 @@ #' #' # Harrell's C-index evaluated up to a specific time horizon #' p$score(msr("surv.cindex", t_max = 97)) +#' #' # Harrell's C-index evaluated up to the time corresponding to 30% of censoring #' p$score(msr("surv.cindex", p_max = 0.3)) #' diff --git a/R/MeasureSurvDCalibration.R b/R/MeasureSurvDCalibration.R index cc0ad9ae7..053afde95 100644 --- a/R/MeasureSurvDCalibration.R +++ b/R/MeasureSurvDCalibration.R @@ -3,6 +3,8 @@ #' @templateVar fullname MeasureSurvDCalibration #' #' @description +#' `r lifecycle::badge("experimental")` +#' #' This calibration method is defined by calculating the following statistic: #' \deqn{s = B/n \sum_i (P_i - n/B)^2} #' where \eqn{B} is number of 'buckets' (that equally divide \eqn{[0,1]} into intervals), #' falls within the corresponding interval. #' This statistic assumes that censoring time is independent of death time. #' -#' A model is well-calibrated if \eqn{s \sim Unif(B)}, tested with `chisq.test` -#' (\eqn{p > 0.05} if well-calibrated). +#' A model is well D-calibrated if \eqn{s \sim Unif(B)}, tested with `chisq.test` +#' (\eqn{p > 0.05} if well-calibrated, i.e. higher p-values are preferred). #' Model \eqn{i} is better calibrated than model \eqn{j} if \eqn{s(i) < s(j)}, #' meaning that *lower values* of this measure are preferred. #' @@ -23,7 +25,7 @@ #' is well-calibrated. If `chisq = FALSE` and `s` is the predicted value then you can manually #' compute the p.value with `pchisq(s, B - 1, lower.tail = FALSE)`. #' -#' NOTE: This measure is still experimental both theoretically and in implementation. Results +#' **NOTE**: This measure is still experimental both theoretically and in implementation.
Results #' should therefore only be taken as an indicator of performance and not for #' conclusive judgements about model calibration. #' @@ -38,11 +40,12 @@ #' You can manually get the p-value by executing `pchisq(s, B - 1, lower.tail = FALSE)`. #' The null hypothesis is that the model is D-calibrated. #' - `truncate` (`double(1)`) \cr -#' This parameter controls the upper bound of the output statistic, -#' when `chisq` is `FALSE`. We use `truncate = Inf` by default but \eqn{10} may be sufficient -#' for most purposes, which corresponds to a p-value of 0.35 for the chisq.test using -#' \eqn{B = 10} buckets. Values \eqn{>10} translate to even lower p-values and thus -#' less calibrated models. If the number of buckets \eqn{B} changes, you probably will want to +#' This parameter controls the upper bound of the output statistic, when `chisq` is `FALSE`. +#' We use `truncate = Inf` by default but values between \eqn{10-16} are sufficient +#' for most purposes, which correspond to p-values of \eqn{0.35-0.06} for the `chisq.test` using +#' the default \eqn{B = 10} buckets. +#' Values \eqn{s > 10} translate to even lower p-values and thus less D-calibrated models. +#' If the number of buckets \eqn{B} changes, you probably will want to #' change the `truncate` value as well to correspond to the same p-value significance. #' Note that truncation may severely limit automated tuning with this measure. #' diff --git a/R/MeasureSurvGraf.R b/R/MeasureSurvGraf.R index c062b495e..da5b577f7 100644 --- a/R/MeasureSurvGraf.R +++ b/R/MeasureSurvGraf.R @@ -11,6 +11,7 @@ #' @templateVar eps 1e-3 #' @template param_eps #' @template param_erv +#' @template param_remove_obs #' #' @aliases MeasureSurvBrier mlr_measures_surv.brier #' @@ -25,13 +26,13 @@ #' survival function \eqn{S_i(t)}, the *observation-wise* loss integrated across #' the time dimension up to the time cutoff \eqn{\tau^*}, is: #' -#' \deqn{L_{ISBS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +#' \deqn{L_{ISBS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} #' #' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. #' #' The **re-weighted ISBS** (RISBS) is: #' -#' \deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +#' \deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject.
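To make the ISBS definition above concrete, here is a minimal R sketch of the observation-wise integrand on a discrete time grid; `isbs_obs` and the exponential `S_i`/`G` below are invented for illustration and are not the package's implementation (the package computes this in compiled code via `weighted_survival_score()`, see `R/integrated_scores.R` further down):

```r
# Sketch of the observation-wise ISBS: approximate the integral on a time grid.
# S_i: predicted survival function of observation i; G: censoring distribution.
isbs_obs = function(tau_grid, t_i, delta_i, S_i, G) {
  S = S_i(tau_grid)
  after_event = as.integer(tau_grid >= t_i & delta_i == 1) / G(t_i)  # I(t_i <= tau, delta_i = 1) / G(t_i)
  still_alive = as.integer(tau_grid < t_i) / G(tau_grid)             # I(t_i > tau) / G(tau)
  integrand = S^2 * after_event + (1 - S)^2 * still_alive
  mean(integrand) * diff(range(tau_grid))  # crude numerical integration
}

# toy usage with exponential survival and censoring distributions
isbs_obs(
  tau_grid = seq(0, 10, by = 0.1), t_i = 5, delta_i = 1,
  S_i = function(t) exp(-0.1 * t), G = function(t) exp(-0.05 * t)
)
```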
#' @@ -48,10 +49,11 @@ #' @template details_tmax #' #' @references -#' `r format_bib("graf_1999")` +#' `r format_bib("graf_1999", "sonabend2024", "kvamme2023")` #' #' @family Probabilistic survival measures #' @family distr survival measures +#' @template example_scoring_rules #' @export MeasureSurvGraf = R6Class("MeasureSurvGraf", inherit = MeasureSurv, @@ -73,11 +75,12 @@ MeasureSurvGraf = R6Class("MeasureSurvGraf", se = p_lgl(default = FALSE), proper = p_lgl(default = FALSE), eps = p_dbl(0, 1, default = 1e-3), - ERV = p_lgl(default = FALSE) + ERV = p_lgl(default = FALSE), + remove_obs = p_lgl(default = FALSE) ) ps$set_values( integrated = TRUE, method = 2L, se = FALSE, - proper = FALSE, eps = 1e-3, ERV = ERV + proper = FALSE, eps = 1e-3, ERV = ERV, remove_obs = FALSE ) range = if (ERV) c(-Inf, 1) else c(0, Inf) @@ -132,7 +135,7 @@ MeasureSurvGraf = R6Class("MeasureSurvGraf", truth = prediction$truth, distribution = prediction$data$distr, times = times, t_max = ps$t_max, p_max = ps$p_max, proper = ps$proper, train = train, - eps = ps$eps + eps = ps$eps, remove_obs = ps$remove_obs ) if (ps$se) { diff --git a/R/MeasureSurvHungAUC.R b/R/MeasureSurvHungAUC.R index 91141cd9b..c8cf41ce1 100644 --- a/R/MeasureSurvHungAUC.R +++ b/R/MeasureSurvHungAUC.R @@ -15,6 +15,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvHungAUC = R6Class("MeasureSurvHungAUC", inherit = MeasureSurvAUC, diff --git a/R/MeasureSurvIntLogloss.R b/R/MeasureSurvIntLogloss.R index 3768f2a9c..dc5e9492b 100644 --- a/R/MeasureSurvIntLogloss.R +++ b/R/MeasureSurvIntLogloss.R @@ -11,6 +11,7 @@ #' @templateVar eps 1e-3 #' @template param_eps #' @template param_erv +#' @template param_remove_obs #' #' @description #' Calculates the **Integrated Survival Log-Likelihood** (ISLL) or Integrated #' @@ -23,13 +24,13 @@ #' survival function \eqn{S_i(t)}, the *observation-wise* loss integrated across #' the time dimension up to the time cutoff \eqn{\tau^*}, is: #' -#' \deqn{L_{ISLL}(S_i, t_i, \delta_i) = -\text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +#' \deqn{L_{ISLL}(S_i, t_i, \delta_i) = - \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} #' #' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. #' #' The **re-weighted ISLL** (RISLL) is: #' -#' \deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +#' \deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)] \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject.
#' @@ -46,10 +47,11 @@ #' @template details_tmax #' #' @references -#' `r format_bib("graf_1999")` +#' `r format_bib("graf_1999", "sonabend2024", "kvamme2023")` #' #' @family Probabilistic survival measures #' @family distr survival measures +#' @template example_scoring_rules #' @export MeasureSurvIntLogloss = R6Class("MeasureSurvIntLogloss", inherit = MeasureSurv, @@ -71,11 +73,12 @@ MeasureSurvIntLogloss = R6Class("MeasureSurvIntLogloss", se = p_lgl(default = FALSE), proper = p_lgl(default = FALSE), eps = p_dbl(0, 1, default = 1e-3), - ERV = p_lgl(default = FALSE) + ERV = p_lgl(default = FALSE), + remove_obs = p_lgl(default = FALSE) ) ps$set_values( integrated = TRUE, method = 2L, se = FALSE, - proper = FALSE, eps = 1e-3, ERV = ERV + proper = FALSE, eps = 1e-3, ERV = ERV, remove_obs = FALSE ) range = if (ERV) c(-Inf, 1) else c(0, Inf) @@ -130,7 +133,7 @@ MeasureSurvIntLogloss = R6Class("MeasureSurvIntLogloss", truth = prediction$truth, distribution = prediction$data$distr, times = times, t_max = ps$t_max, p_max = ps$p_max, proper = ps$proper, train = train, - eps = ps$eps + eps = ps$eps, remove_obs = ps$remove_obs ) if (ps$se) { diff --git a/R/MeasureSurvLogloss.R b/R/MeasureSurvLogloss.R index ceb57dc68..a510c649e 100644 --- a/R/MeasureSurvLogloss.R +++ b/R/MeasureSurvLogloss.R @@ -10,7 +10,7 @@ #' Calculates the cross-entropy, or negative log-likelihood (NLL) or logarithmic (log), loss. #' @section Parameter details: #' - `IPCW` (`logical(1)`)\cr -#' If `TRUE` (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). +#' If `TRUE` (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). See Sonabend et al. (2024) for more details. #' #' @details #' The Log Loss, in the context of probabilistic predictions, is defined as the #' @@ -33,6 +33,9 @@ #' #' @template details_trainG #' +#' @references +#' `r format_bib("sonabend2024")` +#' #' @family Probabilistic survival measures #' @family distr survival measures #' @export diff --git a/R/MeasureSurvSchmid.R b/R/MeasureSurvSchmid.R index 22c574a91..428fae41e 100644 --- a/R/MeasureSurvSchmid.R +++ b/R/MeasureSurvSchmid.R @@ -11,6 +11,7 @@ #' @templateVar eps 1e-3 #' @template param_eps #' @template param_erv +#' @template param_remove_obs #' #' @description #' Calculates the **Integrated Schmid Score** (ISS), aka integrated absolute loss. @@ -22,13 +23,13 @@ #' survival function \eqn{S_i(t)}, the *observation-wise* loss integrated across #' the time dimension up to the time cutoff \eqn{\tau^*}, is: #' -#' \deqn{L_{ISS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +#' \deqn{L_{ISS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} #' #' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. #' #' The **re-weighted ISS** (RISS) is: #' -#' \deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +#' \deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} #' #' which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject.
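The ISS above is the absolute-loss analogue of the squared-loss ISBS; in the package source the two share one implementation that differs only in the `power` argument of `score_graf_schmid()` (visible in `R/integrated_scores.R` below). A toy sketch of that shared residual, with invented inputs:

```r
# Shared residual of the Graf (power = 2) and Schmid (power = 1) scores at one
# time point, assuming `surv` is the predicted survival probability S_i(tau).
toy_residual = function(surv, died, power = 2) {
  if (died) surv^power else (1 - surv)^power
}
toy_residual(0.7, died = TRUE, power = 2)  # Graf-style squared residual: 0.49
toy_residual(0.7, died = TRUE, power = 1)  # Schmid-style absolute residual: 0.7
```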
#' @@ -36,13 +37,6 @@ #' return the average of the time-integrated observation-wise scores: #' \deqn{\sum_{i=1}^N L(S_i, t_i, \delta_i) / N} #' -#' -#' \deqn{L_{ISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t^*))]} -#' where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. -#' -#' The re-weighted ISS, RISS is given by -#' \deqn{L_{RISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t))]} -#' #' @template properness #' @templateVar improper_id ISS #' @templateVar proper_id RISS @@ -52,10 +46,11 @@ #' @template details_tmax #' #' @references -#' `r format_bib("schemper_2000", "schmid_2011")` +#' `r format_bib("schemper_2000", "schmid_2011", "sonabend2024", "kvamme2023")` #' #' @family Probabilistic survival measures #' @family distr survival measures +#' @template example_scoring_rules #' @export MeasureSurvSchmid = R6Class("MeasureSurvSchmid", inherit = MeasureSurv, @@ -77,11 +72,12 @@ MeasureSurvSchmid = R6Class("MeasureSurvSchmid", se = p_lgl(default = FALSE), proper = p_lgl(default = FALSE), eps = p_dbl(0, 1, default = 1e-3), - ERV = p_lgl(default = FALSE) + ERV = p_lgl(default = FALSE), + remove_obs = p_lgl(default = FALSE) ) ps$set_values( integrated = TRUE, method = 2L, se = FALSE, - proper = FALSE, eps = 1e-3, ERV = ERV + proper = FALSE, eps = 1e-3, ERV = ERV, remove_obs = FALSE ) range = if (ERV) c(-Inf, 1) else c(0, Inf) @@ -135,7 +131,7 @@ MeasureSurvSchmid = R6Class("MeasureSurvSchmid", truth = prediction$truth, distribution = prediction$data$distr, times = times, t_max = ps$t_max, p_max = ps$p_max, proper = ps$proper, train = train, - eps = ps$eps + eps = ps$eps, remove_obs = ps$remove_obs ) if (ps$se) { diff --git a/R/MeasureSurvSongAUC.R b/R/MeasureSurvSongAUC.R index 401a18883..ee79ec939 100644 --- a/R/MeasureSurvSongAUC.R +++ b/R/MeasureSurvSongAUC.R @@ -16,6 +16,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvSongAUC = R6Class("MeasureSurvSongAUC", inherit = MeasureSurvAUC, diff --git a/R/MeasureSurvUnoAUC.R b/R/MeasureSurvUnoAUC.R index b1565944f..9c5c3ffe4 100644 --- a/R/MeasureSurvUnoAUC.R +++ b/R/MeasureSurvUnoAUC.R @@ -16,6 +16,7 @@ #' #' @family AUC survival measures #' @family lp survival measures +#' @template example_auc_measures #' @export MeasureSurvUnoAUC = R6Class("MeasureSurvUnoAUC", inherit = MeasureSurvAUC, diff --git a/R/bibentries.R b/R/bibentries.R index 7b4dccd19..f747e6ce1 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -741,5 +741,25 @@ bibentries = c( # nolint start title = "Simulating Survival Data Using the simsurv R Package", volume = "97", year = "2021" + ), + sonabend2024 = bibentry("misc", + archivePrefix = "arXiv", + arxivId = "2212.05260", + author = "Sonabend, Raphael and Zobolas, John and Kopper, Philipp and Burk, Lukas and Bender, Andreas", + month = "dec", + title = "Examining properness in the external validation of survival models with squared and logarithmic losses", + url = "https://arxiv.org/abs/2212.05260v2", + year = "2024" + ), + kvamme2023 = bibentry("article", + author = "Kvamme, Havard and Borgan, Ornulf", + issn = "1533-7928", + journal = "Journal of Machine Learning Research", + number = "2", + pages = "1--26", + title = "The Brier Score under Administrative Censoring: Problems and a Solution", + url = "http://jmlr.org/papers/v24/19-1030.html", + volume = "24", + year = "2023" ) ) diff --git a/R/integrated_scores.R b/R/integrated_scores.R 
index 2e6b1ba2d..2d7768e32 100644 --- a/R/integrated_scores.R +++ b/R/integrated_scores.R @@ -14,7 +14,7 @@ score_graf_schmid = function(true_times, unique_times, cdf, power = 2) { # - `t_max` > 0 # - `p_max` in [0,1] weighted_survival_score = function(loss, truth, distribution, times = NULL, - t_max = NULL, p_max = NULL, proper, train = NULL, eps, ...) { + t_max = NULL, p_max = NULL, proper, train = NULL, eps, remove_obs = FALSE) { assert_surv(truth) # test set's (times, status) test_times = truth[, "time"] @@ -90,8 +90,8 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, rownames(cdf) = unique_times # times x obs } - # apply `t_max` cutoff to remove observations - if (tmax_apply) { + # apply `t_max` cutoff to remove observations as a preprocessing step to alleviate inflation + if (tmax_apply && remove_obs) { true_times = test_times[test_times <= t_max] true_status = test_status[test_times <= t_max] cdf = cdf[, test_times <= t_max, drop = FALSE] @@ -118,6 +118,7 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, # use the `truth` (time, status) information from the train or test set if (is.null(train)) { + # no filtering of observations from test data: use ALL cens = survival::survfit(Surv(test_times, 1 - test_status) ~ 1) } else { # no filtering of observations from train data: use ALL @@ -128,11 +129,6 @@ weighted_survival_score = function(loss, truth, distribution, times = NULL, # G(t): KM estimate of the censoring distribution cens = matrix(c(cens$time, cens$surv), ncol = 2L) - # filter G(t) time points based on `t_max` cutoff - if (tmax_apply) { - cens = cens[cens[, 1L] <= t_max, , drop = FALSE] - } - score = .c_weight_survival_score(score, true_truth, unique_times, cens, proper, eps) colnames(score) = unique_times diff --git a/man-roxygen/details_tmax.R b/man-roxygen/details_tmax.R index 0d8fc023f..20425fc23 100644 --- a/man-roxygen/details_tmax.R +++ b/man-roxygen/details_tmax.R @@ -1,9 +1,23 @@ #' @section Time Cutoff Details: #' -#' If `t_max` or `p_max` is given, then \eqn{G(t)} will be fitted using **all observations** from the -#' train set (or test set) and only then the cutoff time will be applied. -#' This is to ensure that more data is used for fitting the censoring distribution via the -#' Kaplan-Meier. -#' Setting the `t_max` can help alleviate inflation of the score when `proper` is `TRUE`, -#' in cases where an observation is censored at the last observed time point. -#' This results in \eqn{G(t_{max}) = 0} and the use of `eps` instead (when `t_max` is `NULL`). +#' If `t_max` or `p_max` is given, then the predicted survival function \eqn{S(t)} is +#' truncated at the time cutoff for all observations. +#' +#' `r lifecycle::badge("experimental")` +#' +#' Also, if `remove_obs = TRUE`, **observations with observed times** \eqn{t > t_{max}} **are removed**. +#' This data preprocessing step mitigates issues that arise when using IPCW +#' in cases of administrative censoring, see Kvamme et al. (2023). +#' Practically, this step, along with setting a time cutoff `t_max`, helps mitigate +#' the **inflation of the score** observed when an observation is censored at the +#' final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +#' small constant `eps` instead. +#' This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +#' for more details. +#' Note that the `t_max` and `remove_obs` parameters do not affect the estimation +#' of the censoring distribution, i.e. 
**all observations are always used for estimating** \eqn{G(t)}. +#' +#' If `remove_obs = FALSE`, inflated scores may occur. While this aligns more closely +#' with the definitions presented in the original papers, it can lead to misleading +#' evaluation and poor optimization outcomes when using this score for model tuning. +#' diff --git a/man-roxygen/details_trainG.R b/man-roxygen/details_trainG.R index 2a131c29c..8c19e2a6c 100644 --- a/man-roxygen/details_trainG.R +++ b/man-roxygen/details_trainG.R @@ -1,6 +1,10 @@ #' @section Data used for Estimating Censoring Distribution: #' -#' If `task` and `train_set` are passed to `$score` then \eqn{G(t)} is fit on training data, -#' otherwise testing data. The first is likely to reduce any bias caused by calculating -#' parts of the measure on the test data it is evaluating. The training data is automatically -#' used in scoring resamplings. +#' If `task` and `train_set` are passed to `$score` then \eqn{G(t)} is fit using +#' **all observations** from the train set, otherwise the test set is used. +#' Using the train set is likely to reduce any bias caused by calculating parts of the +#' measure on the test data it is evaluating. +#' Also usually it means that more data is used for fitting the censoring +#' distribution \eqn{G(t)} via the Kaplan-Meier. +#' The training data is automatically used in scoring resamplings. +#' diff --git a/man-roxygen/example_auc_measures.R b/man-roxygen/example_auc_measures.R new file mode 100644 index 000000000..c229fa965 --- /dev/null +++ b/man-roxygen/example_auc_measures.R @@ -0,0 +1,27 @@ +#' <% measure = suppressWarnings(get(fullname)$new()) %> +#' +#' @examples +#' library(mlr3) +#' +#' # Define a survival Task +#' task = tsk("lung") +#' +#' # Create train and test set +#' part = partition(task) +#' +#' # Train Cox learner on the train set +#' cox = lrn("surv.coxph") +#' cox$train(task, row_ids = part$train) +#' +#' # Make predictions for the test set +#' p = cox$predict(task, row_ids = part$test) +#' +#' # Integrated AUC score +#' p$score(msr("<%=measure$id%>"), task = task, train_set = part$train, learner = cox) +#' +#' # AUC at specific time point +#' p$score(msr("<%=measure$id%>", times = 600), task = task, train_set = part$train, learner = cox) +#' +#' # Integrated AUC at specific time points +#' p$score(msr("<%=measure$id%>", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) +#' diff --git a/man-roxygen/example_scoring_rules.R b/man-roxygen/example_scoring_rules.R new file mode 100644 index 000000000..4701385b4 --- /dev/null +++ b/man-roxygen/example_scoring_rules.R @@ -0,0 +1,45 @@ +#' <% measure = suppressWarnings(get(fullname)$new()) %> +#' +#' @examples +#' library(mlr3) +#' +#' # Define a survival Task +#' task = tsk("lung") +#' +#' # Create train and test set +#' part = partition(task) +#' +#' # Train Cox learner on the train set +#' cox = lrn("surv.coxph") +#' cox$train(task, row_ids = part$train) +#' +#' # Make predictions for the test set +#' p = cox$predict(task, row_ids = part$test) +#' +#' # <%=improper_id%>, G(t) calculated using the test set +#' p$score(msr("<%=measure$id%>")) +#' +#' # <%=improper_id%>, G(t) calculated using the train set (always recommended) +#' p$score(msr("<%=measure$id%>"), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, ERV score (comparing with KM baseline) +#' p$score(msr("<%=measure$id%>", ERV = TRUE), task = task, train_set = part$train) +#' +#' # <%=improper_id%> at specific time point +#'
p$score(msr("<%=measure$id%>", times = 365), task = task, train_set = part$train) +#' +#' # <%=improper_id%> at multiple time points (integrated) +#' p$score(msr("<%=measure$id%>", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, use time cutoff +#' p$score(msr("<%=measure$id%>", t_max = 700), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, use time cutoff and also remove observations +#' p$score(msr("<%=measure$id%>", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) +#' +#' # <%=improper_id%>, use time cutoff corresponding to specific proportion of censoring on the test set +#' p$score(msr("<%=measure$id%>", p_max = 0.8), task = task, train_set = part$train) +#' +#' # <%=proper_id%>, G(t) calculated using the train set +#' p$score(msr("<%=measure$id%>", proper = TRUE), task = task, train_set = part$train) +#' diff --git a/man-roxygen/param_proper.R b/man-roxygen/param_proper.R index 1e130026b..84205f74e 100644 --- a/man-roxygen/param_proper.R +++ b/man-roxygen/param_proper.R @@ -3,6 +3,8 @@ #' If `TRUE` then weights scores by the censoring distribution at #' the observed event time, which results in a strictly proper scoring #' rule if censoring and survival time distributions are independent -#' and a sufficiently large dataset is used. +#' and a sufficiently large dataset is used, see Sonabend et al. (2024). #' If `FALSE` then weights scores by the Graf method which is the #' more common usage but the loss is not proper. +#' See "Properness" section for more details. +#' diff --git a/man-roxygen/param_remove_obs.R b/man-roxygen/param_remove_obs.R new file mode 100644 index 000000000..bd7c7d5ca --- /dev/null +++ b/man-roxygen/param_remove_obs.R @@ -0,0 +1,6 @@ +#' @section Parameter details: +#' - `remove_obs` (`logical(1)`)\cr +#' Only effective when `t_max` or `p_max` is provided. Default is `FALSE`. +#' If `TRUE`, then we **remove test observations** for which the observed time (event or censoring) is strictly larger than `t_max`. +#' See "Time Cutoff Details" section for more details. +#' diff --git a/man-roxygen/param_tmax.R b/man-roxygen/param_tmax.R index 268605756..a706e2442 100644 --- a/man-roxygen/param_tmax.R +++ b/man-roxygen/param_tmax.R @@ -1,8 +1,8 @@ #' @section Parameter details: #' - `t_max` (`numeric(1)`)\cr -#' Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. +#' Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +#' (truncate \eqn{S(t)}). #' Mutually exclusive with `p_max` or `times`. -#' This will effectively remove test observations for which the observed time -#' (event or censoring) is strictly more than `t_max`. -#' It's recommended to set `t_max` to avoid division by `eps`, see Details. +#' It's recommended to set `t_max` to avoid division by `eps`, see "Time Cutoff Details" section. #' If `t_max` is not specified, an `Inf` time horizon is assumed. +#' diff --git a/man-roxygen/properness.R b/man-roxygen/properness.R index cb1c6d8f0..b0c956818 100644 --- a/man-roxygen/properness.R +++ b/man-roxygen/properness.R @@ -1,4 +1,5 @@ #' @section Properness: +#' `r lifecycle::badge("experimental")` #' #' <%=proper_id%> is strictly proper when the censoring distribution is independent #' of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. 
@@ -7,3 +8,6 @@ #' Results may be very different if many observations are censored at the last #' observed time due to division by \eqn{1/eps} in `proper = TRUE`. #' +#' See Sonabend et al. (2024) for more details. +#' The use of `proper = TRUE` is considered experimental and should be used with caution. +#' diff --git a/man/mlr_measures_surv.chambless_auc.Rd b/man/mlr_measures_surv.chambless_auc.Rd index 6b8476ace..88e68bca9 100644 --- a/man/mlr_measures_surv.chambless_auc.Rd +++ b/man/mlr_measures_surv.chambless_auc.Rd @@ -61,6 +61,32 @@ If \code{integrated == FALSE} then a single time point at which to return the sc } } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# Integrated AUC score +p$score(msr("surv.chambless_auc"), task = task, train_set = part$train, learner = cox) + +# AUC at specific time point +p$score(msr("surv.chambless_auc", times = 600), task = task, train_set = part$train, learner = cox) + +# Integrated AUC at specific time points +p$score(msr("surv.chambless_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) + +} \references{ Chambless LE, Diao G (2006). \dQuote{Estimation of time-dependent area under the ROC curve for long-term risk prediction.} diff --git a/man/mlr_measures_surv.cindex.Rd b/man/mlr_measures_surv.cindex.Rd index 5697b9764..9fba1e787 100644 --- a/man/mlr_measures_surv.cindex.Rd +++ b/man/mlr_measures_surv.cindex.Rd @@ -105,6 +105,7 @@ p$score(msr("surv.cindex", weight_meth = "G2"), # Harrell's C-index evaluated up to a specific time horizon p$score(msr("surv.cindex", t_max = 97)) + # Harrell's C-index evaluated up to the time corresponding to 30\% of censoring p$score(msr("surv.cindex", p_max = 0.3)) diff --git a/man/mlr_measures_surv.dcalib.Rd b/man/mlr_measures_surv.dcalib.Rd index e0a7779e0..d096a8c35 100644 --- a/man/mlr_measures_surv.dcalib.Rd +++ b/man/mlr_measures_surv.dcalib.Rd @@ -5,6 +5,8 @@ \alias{MeasureSurvDCalibration} \title{D-Calibration Survival Measure} \description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + This calibration method is defined by calculating the following statistic: \deqn{s = B/n \sum_i (P_i - n/B)^2} where \eqn{B} is number of 'buckets' (that equally divide \eqn{[0,1]} into intervals), @@ -14,8 +16,8 @@ of observations in the \eqn{i}th interval. An observation is assigned to the falls within the corresponding interval. This statistic assumes that censoring time is independent of death time. -A model is well-calibrated if \eqn{s \sim Unif(B)}, tested with \code{chisq.test} -(\eqn{p > 0.05} if well-calibrated). +A model is well D-calibrated if \eqn{s \sim Unif(B)}, tested with \code{chisq.test} +(\eqn{p > 0.05} if well-calibrated, i.e. higher p-values are preferred). Model \eqn{i} is better calibrated than model \eqn{j} if \eqn{s(i) < s(j)}, meaning that \emph{lower values} of this measure are preferred. } @@ -25,7 +27,7 @@ The former is useful for model comparison whereas the latter is useful for deter is well-calibrated. If \code{chisq = FALSE} and \code{s} is the predicted value then you can manually compute the p.value with \code{pchisq(s, B - 1, lower.tail = FALSE)}. 
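The statistic-to-p-value correspondence used in the `truncate` docs below can be checked directly; with the default B = 10 buckets the test has 9 degrees of freedom:

```r
# p-values for statistic values s = 10 and s = 16 with B = 10 buckets (df = 9)
pchisq(c(10, 16), df = 9, lower.tail = FALSE)
# approximately 0.35 and 0.06, the values quoted for `truncate` below
```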
-NOTE: This measure is still experimental both theoretically and in implementation. Results +\strong{NOTE}: This measure is still experimental both theoretically and in implementation. Results should therefore only be taken as an indicator of performance and not for conclusive judgements about model calibration. } @@ -72,11 +74,12 @@ Default is \code{FALSE} and returns the statistic \code{s}. You can manually get the p-value by executing \code{pchisq(s, B - 1, lower.tail = FALSE)}. The null hypothesis is that the model is D-calibrated. \item \code{truncate} (\code{double(1)}) \cr -This parameter controls the upper bound of the output statistic, -when \code{chisq} is \code{FALSE}. We use \code{truncate = Inf} by default but \eqn{10} may be sufficient -for most purposes, which corresponds to a p-value of 0.35 for the chisq.test using -\eqn{B = 10} buckets. Values \eqn{>10} translate to even lower p-values and thus -less calibrated models. If the number of buckets \eqn{B} changes, you probably will want to +This parameter controls the upper bound of the output statistic, when \code{chisq} is \code{FALSE}. +We use \code{truncate = Inf} by default but values between \eqn{10-16} are sufficient +for most purposes, which correspond to p-values of \eqn{0.35-0.06} for the \code{chisq.test} using +the default \eqn{B = 10} buckets. +Values \eqn{s > 10} translate to even lower p-values and thus less D-calibrated models. +If the number of buckets \eqn{B} changes, you probably will want to change the \code{truncate} value as well to correspond to the same p-value significance. Note that truncation may severely limit automated tuning with this measure. } diff --git a/man/mlr_measures_surv.graf.Rd b/man/mlr_measures_surv.graf.Rd index 4d2924dfc..37b26f853 100644 --- a/man/mlr_measures_surv.graf.Rd +++ b/man/mlr_measures_surv.graf.Rd @@ -17,13 +17,13 @@ outcome \eqn{(t_i, \delta_i)} (time and censoring indicator) and predicted survival function \eqn{S_i(t)}, the \emph{observation-wise} loss integrated across the time dimension up to the time cutoff \eqn{\tau^*}, is: -\deqn{L_{ISBS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +\deqn{L_{ISBS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. The \strong{re-weighted ISBS} (RISBS) is: -\deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +\deqn{L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -54,6 +54,7 @@ msr("surv.graf") proper \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr eps \tab numeric \tab 0.001 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr ERV \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr + remove_obs \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr } } @@ -85,11 +86,10 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \itemize{ \item \code{t_max} (\code{numeric(1)})\cr -Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to.
+Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +(truncate \eqn{S(t)}). Mutually exclusive with \code{p_max} or \code{times}. -This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max}. -It's recommended to set \code{t_max} to avoid division by \code{eps}, see Details. +It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -126,9 +126,10 @@ Default is \code{FALSE} (returns the mean). If \code{TRUE} then weights scores by the censoring distribution at the observed event time, which results in a strictly proper scoring rule if censoring and survival time distributions are independent -and a sufficiently large dataset is used. +and a sufficiently large dataset is used, see Sonabend et al. (2024). If \code{FALSE} then weights scores by the Graf method which is the more common usage but the loss is not proper. +See "Properness" section for more details. } @@ -146,10 +147,19 @@ If \code{TRUE} then the Explained Residual Variation method is applied, which means the score is standardized against a Kaplan-Meier baseline. Default is \code{FALSE}. } + + +\itemize{ +\item \code{remove_obs} (\code{logical(1)})\cr +Only effective when \code{t_max} or \code{p_max} is provided. Default is \code{FALSE}. +If \code{TRUE}, then we \strong{remove test observations} for which the observed time (event or censoring) is strictly larger than \code{t_max}. +See "Time Cutoff Details" section for more details. +} } \section{Properness}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} RISBS is strictly proper when the censoring distribution is independent of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. @@ -157,6 +167,9 @@ ISBS is never proper. Use \code{proper = FALSE} for ISBS and \code{proper = TRUE} for RISBS. Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. + +See Sonabend et al. (2024) for more details. +The use of \code{proper = TRUE} is considered experimental and should be used with caution. } \section{Time points used for evaluation}{ @@ -196,29 +209,98 @@ not used). \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. 
} \section{Time Cutoff Details}{ -If \code{t_max} or \code{p_max} is given, then \eqn{G(t)} will be fitted using \strong{all observations} from the -train set (or test set) and only then the cutoff time will be applied. -This is to ensure that more data is used for fitting the censoring distribution via the -Kaplan-Meier. -Setting the \code{t_max} can help alleviate inflation of the score when \code{proper} is \code{TRUE}, -in cases where an observation is censored at the last observed time point. -This results in \eqn{G(t_{max}) = 0} and the use of \code{eps} instead (when \code{t_max} is \code{NULL}). +If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is +truncated at the time cutoff for all observations. + +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This data preprocessing step mitigates issues that arise when using IPCW +in cases of administrative censoring, see Kvamme et al. (2023). +Practically, this step, along with setting a time cutoff \code{t_max}, helps mitigate +the \strong{inflation of the score} observed when an observation is censored at the +final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +small constant \code{eps} instead. +This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +for more details. +Note that the \code{t_max} and \code{remove_obs} parameters do not affect the estimation +of the censoring distribution, i.e. \strong{all observations are always used for estimating} \eqn{G(t)}. + +If \code{remove_obs = FALSE}, inflated scores may occur. While this aligns more closely +with the definitions presented in the original papers, it can lead to misleading +evaluation and poor optimization outcomes when using this score for model tuning.
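Schematically, the preprocessing that `remove_obs = TRUE` enables is the filtering step added to `weighted_survival_score()` in `R/integrated_scores.R` above, paraphrased here with a helper `keep` index for readability:

```r
# paraphrase of the remove_obs step in weighted_survival_score()
if (tmax_apply && remove_obs) {
  keep = test_times <= t_max       # drop observations observed after t_max
  true_times = test_times[keep]
  true_status = test_status[keep]
  cdf = cdf[, keep, drop = FALSE]  # keep the matching prediction columns
}
```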
} +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# ISBS, G(t) calculated using the test set +p$score(msr("surv.graf")) + +# ISBS, G(t) calculated using the train set (always recommended) +p$score(msr("surv.graf"), task = task, train_set = part$train) + +# ISBS, ERV score (comparing with KM baseline) +p$score(msr("surv.graf", ERV = TRUE), task = task, train_set = part$train) + +# ISBS at specific time point +p$score(msr("surv.graf", times = 365), task = task, train_set = part$train) + +# ISBS at multiple time points (integrated) +p$score(msr("surv.graf", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) + +# ISBS, use time cutoff +p$score(msr("surv.graf", t_max = 700), task = task, train_set = part$train) + +# ISBS, use time cutoff and also remove observations +p$score(msr("surv.graf", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) + +# ISBS, use time cutoff corresponding to specific proportion of censoring on the test set +p$score(msr("surv.graf", p_max = 0.8), task = task, train_set = part$train) + +# RISBS, G(t) calculated using the train set +p$score(msr("surv.graf", proper = TRUE), task = task, train_set = part$train) + +} \references{ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). \dQuote{Assessment and comparison of prognostic classification schemes for survival data.} \emph{Statistics in Medicine}, \bold{18}(17-18), 2529--2545. \doi{10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5}. + +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. + +Kvamme, Havard, Borgan, Ornulf (2023). +\dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} +\emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. +ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. } \seealso{ Other survival measures: diff --git a/man/mlr_measures_surv.hung_auc.Rd b/man/mlr_measures_surv.hung_auc.Rd index e5207db4e..3f05b93ce 100644 --- a/man/mlr_measures_surv.hung_auc.Rd +++ b/man/mlr_measures_surv.hung_auc.Rd @@ -61,6 +61,32 @@ If \code{integrated == FALSE} then a single time point at which to return the sc } } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# Integrated AUC score +p$score(msr("surv.hung_auc"), task = task, train_set = part$train, learner = cox) + +# AUC at specific time point +p$score(msr("surv.hung_auc", times = 600), task = task, train_set = part$train, learner = cox) + +# Integrated AUC at specific time points +p$score(msr("surv.hung_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox) + +} \references{ Hung H, Chiang C (2010). 
\dQuote{Estimation methods for time-dependent AUC models with survival data.} diff --git a/man/mlr_measures_surv.intlogloss.Rd b/man/mlr_measures_surv.intlogloss.Rd index 5a6d7af85..9498d433c 100644 --- a/man/mlr_measures_surv.intlogloss.Rd +++ b/man/mlr_measures_surv.intlogloss.Rd @@ -15,13 +15,13 @@ outcome \eqn{(t_i, \delta_i)} (time and censoring indicator) and predicted survival function \eqn{S_i(t)}, the \emph{observation-wise} loss integrated across the time dimension up to the time cutoff \eqn{\tau^*}, is: -\deqn{L_{ISLL}(S_i, t_i, \delta_i) = -\text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +\deqn{L_{ISLL}(S_i, t_i, \delta_i) = - \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. The \strong{re-weighted ISLL} (RISLL) is: -\deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +\deqn{L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)] \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. @@ -52,6 +52,7 @@ msr("surv.intlogloss") proper \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr eps \tab numeric \tab 0.001 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr ERV \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr + remove_obs \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr } } @@ -83,11 +84,10 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \itemize{ \item \code{t_max} (\code{numeric(1)})\cr -Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. +Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +(truncate \eqn{S(t)}). Mutually exclusive with \code{p_max} or \code{times}. -This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max}. -It's recommended to set \code{t_max} to avoid division by \code{eps}, see Details. +It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -124,9 +124,10 @@ Default is \code{FALSE} (returns the mean). If \code{TRUE} then weights scores by the censoring distribution at the observed event time, which results in a strictly proper scoring rule if censoring and survival time distributions are independent -and a sufficiently large dataset is used. +and a sufficiently large dataset is used, see Sonabend et al. (2024). If \code{FALSE} then weights scores by the Graf method which is the more common usage but the loss is not proper. +See "Properness" section for more details. } @@ -144,10 +145,19 @@ If \code{TRUE} then the Explained Residual Variation method is applied, which means the score is standardized against a Kaplan-Meier baseline. Default is \code{FALSE}. } + + +\itemize{ +\item \code{remove_obs} (\code{logical(1)})\cr +Only effective when \code{t_max} or \code{p_max} is provided. Default is \code{FALSE}.
+If \code{TRUE}, then we \strong{remove test observations} for which the observed time (event or censoring) is strictly larger than \code{t_max}. +See "Time Cutoff Details" section for more details. +} } \section{Properness}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} RISLL is strictly proper when the censoring distribution is independent of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. @@ -155,6 +165,9 @@ ISLL is never proper. Use \code{proper = FALSE} for ISLL and \code{proper = TRUE} for RISLL. Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. + +See Sonabend et al. (2024) for more details. +The use of \code{proper = TRUE} is considered experimental and should be used with caution. } \section{Time points used for evaluation}{ @@ -194,29 +207,98 @@ not used). \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. } \section{Time Cutoff Details}{ -If \code{t_max} or \code{p_max} is given, then \eqn{G(t)} will be fitted using \strong{all observations} from the -train set (or test set) and only then the cutoff time will be applied. -This is to ensure that more data is used for fitting the censoring distribution via the -Kaplan-Meier. -Setting the \code{t_max} can help alleviate inflation of the score when \code{proper} is \code{TRUE}, -in cases where an observation is censored at the last observed time point. -This results in \eqn{G(t_{max}) = 0} and the use of \code{eps} instead (when \code{t_max} is \code{NULL}). +If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is +truncated at the time cutoff for all observations. + +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}. +This data preprocessing step mitigates issues that arise when using IPCW +in cases of administrative censoring, see Kvamme et al. (2023). +Practically, this step, along with setting a time cutoff \code{t_max}, helps mitigate +the \strong{inflation of the score} observed when an observation is censored at the +final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a +small constant \code{eps} instead. +This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +for more details. 
+Note that the \code{t_max} and \code{remove_obs} parameters do not affect the estimation +of the censoring distribution, i.e. \strong{all observations are always used for estimating} \eqn{G(t)}. + +If \code{remove_obs = FALSE}, inflated scores may occur. While this aligns more closely +with the definitions presented in the original papers, it can lead to misleading +evaluation and poor optimization outcomes when using this score for model tuning. } +\examples{ +library(mlr3) + +# Define a survival Task +task = tsk("lung") + +# Create train and test set +part = partition(task) + +# Train Cox learner on the train set +cox = lrn("surv.coxph") +cox$train(task, row_ids = part$train) + +# Make predictions for the test set +p = cox$predict(task, row_ids = part$test) + +# ISLL, G(t) calculated using the test set +p$score(msr("surv.intlogloss")) + +# ISLL, G(t) calculated using the train set (always recommended) +p$score(msr("surv.intlogloss"), task = task, train_set = part$train) + +# ISLL, ERV score (comparing with KM baseline) +p$score(msr("surv.intlogloss", ERV = TRUE), task = task, train_set = part$train) + +# ISLL at specific time point +p$score(msr("surv.intlogloss", times = 365), task = task, train_set = part$train) + +# ISLL at multiple time points (integrated) +p$score(msr("surv.intlogloss", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train) + +# ISLL, use time cutoff +p$score(msr("surv.intlogloss", t_max = 700), task = task, train_set = part$train) + +# ISLL, use time cutoff and also remove observations +p$score(msr("surv.intlogloss", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train) + +# ISLL, use time cutoff corresponding to specific proportion of censoring on the test set +p$score(msr("surv.intlogloss", p_max = 0.8), task = task, train_set = part$train) + +# RISLL, G(t) calculated using the train set +p$score(msr("surv.intlogloss", proper = TRUE), task = task, train_set = part$train) + +} \references{ Graf E, Schmoor C, Sauerbrei W, Schumacher M (1999). \dQuote{Assessment and comparison of prognostic classification schemes for survival data.} \emph{Statistics in Medicine}, \bold{18}(17-18), 2529--2545. \doi{10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5}. + +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. + +Kvamme, Havard, Borgan, Ornulf (2023). +\dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.} +\emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26. +ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}. } \seealso{ Other survival measures: diff --git a/man/mlr_measures_surv.logloss.Rd b/man/mlr_measures_surv.logloss.Rd index 245364758..bc7241647 100644 --- a/man/mlr_measures_surv.logloss.Rd +++ b/man/mlr_measures_surv.logloss.Rd @@ -86,19 +86,27 @@ Default is \code{FALSE}. \itemize{ \item \code{IPCW} (\code{logical(1)})\cr -If \code{TRUE} (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). +If \code{TRUE} (default) then returns the \eqn{L_{RNLL}} score (which is proper), otherwise the \eqn{L_{NLL}} score (improper). See Sonabend et al. (2024) for more details.
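For the `IPCW` flag just documented, a short usage sketch may help; it follows the pattern of the examples added elsewhere in this PR (`lung` task, Cox learner) and is illustrative only:

```r
library(mlr3)

task = tsk("lung")
part = partition(task)

cox = lrn("surv.coxph")
cox$train(task, row_ids = part$train)
p = cox$predict(task, row_ids = part$test)

# RNLL (IPCW = TRUE, default, proper) with G(t) fit on the train set
p$score(msr("surv.logloss"), task = task, train_set = part$train)

# NLL (improper)
p$score(msr("surv.logloss", IPCW = FALSE))
```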
} } \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. +If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using +\strong{all observations} from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +Also usually it means that more data is used for fitting the censoring +distribution \eqn{G(t)} via the Kaplan-Meier. +The training data is automatically used in scoring resamplings. } +\references{ +Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.} +\url{https://arxiv.org/abs/2212.05260v2}. +} \seealso{ Other survival measures: \code{\link{mlr_measures_surv.calib_alpha}}, diff --git a/man/mlr_measures_surv.schmid.Rd b/man/mlr_measures_surv.schmid.Rd index d01cd26ea..03723f90a 100644 --- a/man/mlr_measures_surv.schmid.Rd +++ b/man/mlr_measures_surv.schmid.Rd @@ -14,25 +14,19 @@ outcome \eqn{(t_i, \delta_i)} (time and censoring indicator) and predicted survival function \eqn{S_i(t)}, the \emph{observation-wise} loss integrated across the time dimension up to the time cutoff \eqn{\tau^*}, is: -\deqn{L_{ISS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} +\deqn{L_{ISS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau} where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution. The \strong{re-weighted ISS} (RISS) is: -\deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau)}{G(t_i)} \ d\tau} +\deqn{L_{RISS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}} which is always weighted by \eqn{G(t_i)} and is equal to zero for a censored subject. To get a single score across all \eqn{N} observations of the test set, we return the average of the time-integrated observation-wise scores: \deqn{\sum_{i=1}^N L(S_i, t_i, \delta_i) / N} - -\deqn{L_{ISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t^*))]} -where \eqn{G} is the Kaplan-Meier estimate of the censoring distribution.
- -The re-weighted ISS, RISS is given by -\deqn{L_{RISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t))]} } \section{Dictionary}{ @@ -57,6 +51,7 @@ msr("surv.schmid") proper \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr eps \tab numeric \tab 0.001 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr ERV \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr + remove_obs \tab logical \tab FALSE \tab TRUE, FALSE \tab - \cr } } @@ -88,11 +83,10 @@ If \code{integrated == FALSE} then a single time point at which to return the sc \itemize{ \item \code{t_max} (\code{numeric(1)})\cr -Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to. +Cutoff time \eqn{\tau^*} (i.e. time horizon) to evaluate the measure up to +(truncate \eqn{S(t)}). Mutually exclusive with \code{p_max} or \code{times}. -This will effectively remove test observations for which the observed time -(event or censoring) is strictly more than \code{t_max}. -It's recommended to set \code{t_max} to avoid division by \code{eps}, see Details. +It's recommended to set \code{t_max} to avoid division by \code{eps}, see "Time Cutoff Details" section. If \code{t_max} is not specified, an \code{Inf} time horizon is assumed. } @@ -129,9 +123,10 @@ Default is \code{FALSE} (returns the mean). If \code{TRUE} then weights scores by the censoring distribution at the observed event time, which results in a strictly proper scoring rule if censoring and survival time distributions are independent -and a sufficiently large dataset is used. +and a sufficiently large dataset is used, see Sonabend et al. (2024). If \code{FALSE} then weights scores by the Graf method which is the more common usage but the loss is not proper. +See "Properness" section for more details. } @@ -149,10 +144,19 @@ If \code{TRUE} then the Explained Residual Variation method is applied, which means the score is standardized against a Kaplan-Meier baseline. Default is \code{FALSE}. } + + +\itemize{ +\item \code{remove_obs} (\code{logical(1)})\cr +Only effective when \code{t_max} or \code{p_max} is provided. Default is \code{FALSE}. +If \code{TRUE}, then we \strong{remove test observations} for which the observed time (event or censoring) is strictly larger than \code{t_max}. +See "Time Cutoff Details" section for more details. +} } \section{Properness}{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} RISS is strictly proper when the censoring distribution is independent of the survival distribution and when \eqn{G(t)} is fit on a sufficiently large dataset. @@ -160,6 +164,9 @@ ISS is never proper. Use \code{proper = FALSE} for ISS and \code{proper = TRUE} for RISS. Results may be very different if many observations are censored at the last observed time due to division by \eqn{1/eps} in \code{proper = TRUE}. + +See Sonabend et al. (2024) for more details. +The use of \code{proper = TRUE} is considered experimental and should be used with caution. } \section{Time points used for evaluation}{ @@ -199,24 +206,84 @@ not used). \section{Data used for Estimating Censoring Distribution}{ -If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings. 
+If \code{task} and \code{train_set} are passed to \verb{$score} then \eqn{G(t)} is fit using
+\strong{all observations} from the train set, otherwise the test set is used.
+Using the train set is likely to reduce any bias caused by calculating parts of the
+measure on the test data it is evaluating.
+It also usually means that more data is used for fitting the censoring
+distribution \eqn{G(t)} via the Kaplan-Meier estimator.
+The training data is automatically used in scoring resamplings.
 }
 
 \section{Time Cutoff Details}{
-If \code{t_max} or \code{p_max} is given, then \eqn{G(t)} will be fitted using \strong{all observations} from the
-train set (or test set) and only then the cutoff time will be applied.
-This is to ensure that more data is used for fitting the censoring distribution via the
-Kaplan-Meier.
-Setting the \code{t_max} can help alleviate inflation of the score when \code{proper} is \code{TRUE},
-in cases where an observation is censored at the last observed time point.
-This results in \eqn{G(t_{max}) = 0} and the use of \code{eps} instead (when \code{t_max} is \code{NULL}).
+If \code{t_max} or \code{p_max} is given, then the predicted survival function \eqn{S(t)} is
+truncated at the time cutoff for all observations.
+
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
+Also, if \code{remove_obs = TRUE}, \strong{observations with observed times} \eqn{t > t_{max}} \strong{are removed}.
+This data preprocessing step mitigates issues that arise when using IPCW
+in cases of administrative censoring, see Kvamme and Borgan (2023).
+Practically, this step, along with setting a time cutoff \code{t_max}, helps mitigate
+the \strong{inflation of the score} observed when an observation is censored at the
+final time point. In such cases, \eqn{G(t) = 0}, triggering the use of a
+small constant \code{eps} instead.
+This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024)
+for more details.
+Note that the \code{t_max} and \code{remove_obs} parameters do not affect the estimation
+of the censoring distribution, i.e. \strong{all observations are always used for estimating} \eqn{G(t)}.
+
+If \code{remove_obs = FALSE}, inflated scores may occur. While this aligns more closely
+with the definitions presented in the original papers, it can lead to misleading
+evaluation and poor optimization outcomes when using this score for model tuning.
 }
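The inflation described above can be checked directly. What follows is a minimal sketch reusing the lung/Cox setup from the examples below; the cutoff value 500 is an arbitrary choice and the printed scores depend on the random train/test split:

library(mlr3)
library(mlr3proba)

task = tsk("lung")
part = partition(task)
cox = lrn("surv.coxph")
cox$train(task, row_ids = part$train)
p = cox$predict(task, row_ids = part$test)

# If the last test observation is censored, G(t) = 0 at that point and the
# small constant eps is used instead, which can inflate the proper score:
p$score(msr("surv.schmid", proper = TRUE), task = task, train_set = part$train)

# Truncating at t_max mitigates this; remove_obs = TRUE additionally drops
# test observations with observed time > t_max (G(t) is still fit on all of them):
p$score(msr("surv.schmid", proper = TRUE, t_max = 500), task = task, train_set = part$train)
p$score(msr("surv.schmid", proper = TRUE, t_max = 500, remove_obs = TRUE), task = task, train_set = part$train)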
+\examples{
+library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# ISS, G(t) calculated using the test set
+p$score(msr("surv.schmid"))
+
+# ISS, G(t) calculated using the train set (always recommended)
+p$score(msr("surv.schmid"), task = task, train_set = part$train)
+
+# ISS, ERV score (comparing with KM baseline)
+p$score(msr("surv.schmid", ERV = TRUE), task = task, train_set = part$train)
+
+# ISS at specific time point
+p$score(msr("surv.schmid", times = 365), task = task, train_set = part$train)
+
+# ISS at multiple time points (integrated)
+p$score(msr("surv.schmid", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train)
+
+# ISS, use time cutoff
+p$score(msr("surv.schmid", t_max = 700), task = task, train_set = part$train)
+
+# ISS, use time cutoff and also remove observations with observed time > t_max
+p$score(msr("surv.schmid", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train)
+
+# ISS, use time cutoff corresponding to a specific proportion of censoring on the test set
+p$score(msr("surv.schmid", p_max = 0.8), task = task, train_set = part$train)
+
+# RISS (proper = TRUE), G(t) calculated using the train set
+p$score(msr("surv.schmid", proper = TRUE), task = task, train_set = part$train)
+
+}
 \references{
 Schemper, Michael, Henderson, Robin (2000).
 \dQuote{Predictive Accuracy and Explained Variation in Cox Regression.}
@@ -227,6 +294,15 @@ Schmid, Matthias, Hielscher, Thomas, Augustin, Thomas, Gefeller, Olaf (2011).
 \dQuote{A Robust Alternative to the Schemper-Henderson Estimator of Prediction Error.}
 \emph{Biometrics}, \bold{67}(2), 524--535.
 \doi{10.1111/j.1541-0420.2010.01459.x}.
+
+Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024).
+\dQuote{Examining properness in the external validation of survival models with squared and logarithmic losses.}
+\url{https://arxiv.org/abs/2212.05260v2}.
+
+Kvamme, Havard, Borgan, Ornulf (2023).
+\dQuote{The Brier Score under Administrative Censoring: Problems and a Solution.}
+\emph{Journal of Machine Learning Research}, \bold{24}(2), 1--26.
+ISSN 1533-7928, \url{http://jmlr.org/papers/v24/19-1030.html}.
 }
 \seealso{
 Other survival measures:
diff --git a/man/mlr_measures_surv.song_auc.Rd b/man/mlr_measures_surv.song_auc.Rd
index f57df5545..deceab503 100644
--- a/man/mlr_measures_surv.song_auc.Rd
+++ b/man/mlr_measures_surv.song_auc.Rd
@@ -69,6 +69,32 @@ incident TPR, \code{cumulative} refers to cumulative TPR.
 }
 }
 
+\examples{
+library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# Integrated AUC score
+p$score(msr("surv.song_auc"), task = task, train_set = part$train, learner = cox)
+
+# AUC at specific time point
+p$score(msr("surv.song_auc", times = 600), task = task, train_set = part$train, learner = cox)
+
+# Integrated AUC at specific time points
+p$score(msr("surv.song_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox)
+
+}
 \references{
 Song, Xiao, Zhou, Xiao-Hua (2008).
 \dQuote{A semiparametric approach for the covariate specific ROC curve with survival outcome.}
diff --git a/man/mlr_measures_surv.uno_auc.Rd b/man/mlr_measures_surv.uno_auc.Rd
index 5536ad7ad..b4206ac29 100644
--- a/man/mlr_measures_surv.uno_auc.Rd
+++ b/man/mlr_measures_surv.uno_auc.Rd
@@ -61,6 +61,32 @@ If \code{integrated == FALSE} then a single time point at which to return the sc
 }
 }
 
+\examples{
+library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# Integrated AUC score
+p$score(msr("surv.uno_auc"), task = task, train_set = part$train, learner = cox)
+
+# AUC at specific time point
+p$score(msr("surv.uno_auc", times = 600), task = task, train_set = part$train, learner = cox)
+
+# Integrated AUC at specific time points
+p$score(msr("surv.uno_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox)
+
+}
 \references{
 Uno H, Cai T, Tian L, Wei LJ (2007).
 \dQuote{Evaluating Prediction Rules for t-Year Survivors With Censored Regression Models.}
diff --git a/src/survival_scores.cpp b/src/survival_scores.cpp
index 52c2f396b..6d569f9bb 100644
--- a/src/survival_scores.cpp
+++ b/src/survival_scores.cpp
@@ -91,13 +91,12 @@ NumericMatrix c_weight_survival_score(const NumericMatrix& score,
         k = 1;
         break;
       } else if (times[i] >= cens_times[l] &&
-                 (l == cens_times.length() - 1 ||
-                  times[i] < cens_times[l + 1])) {
+                 (l == cens_times.length() - 1 || times[i] < cens_times[l + 1])) {
         k = cens_surv[l];
-        // k == 0 only if last obsv censored, therefore mat is set to 0
-        // anyway This division by eps can cause inflation of the score,
-        // due to a very large value for a particular (i-obs, j-time) use
-        // 't_max' to filter 'cens' in that case
+        // k == 0 only if last obs censored, therefore mat is set to 0 anyway
+        // This division by eps can cause inflation of the score,
+        // due to a very large value for a particular (i-obs, j-time)
+        // Use 't_max' to filter 'cens' in that case
         if (k == 0) {
           k = eps;
         }
diff --git a/tests/testthat/test_mlr_measures.R b/tests/testthat/test_mlr_measures.R
index c40630bf5..486e3fd3a 100644
--- a/tests/testthat/test_mlr_measures.R
+++ b/tests/testthat/test_mlr_measures.R
@@ -163,15 +163,18 @@ test_that("graf: t_max, p_max, times", {
   t_max = 100
   times_flt = times[times <= t_max] # keep only times until the `t_max`
   m0 = p$score(msr("surv.graf")) # uses all test time points
-  m1 = p$score(msr("surv.graf", times = times_flt)) # uses times_flt
+  m1 = p$score(msr("surv.graf", times = times_flt)) # uses `times_flt`
   m2 = p$score(msr("surv.graf", t_max = t_max)) # 100
+  m22 = p$score(msr("surv.graf", t_max = t_max, remove_obs = TRUE)) # 100
   m3 = p$score(msr("surv.graf", t_max = max(times))) # 104
-  m4 = p$score(msr("surv.graf", t_max = max(times) + 1)) # 105
+  m4 = p$score(msr("surv.graf", t_max = max(times) + 10)) # 114
   # different time points considered
   expect_true(m0 != m1)
-  # same time points are used, but `t_max` also removes observations
-  expect_true(m1 != m2)
+  # same time points are used, and no removal of observations (original Graf score)
+  expect_equal(m1, m2)
+  # same time points are used, but observations with `t > t_max` are removed
+  expect_true(m2 != m22)
   # different `t_max` => different time points used
   expect_true(m2 != m3)
   # different `t_max` but after the max evaluation time point, so result stays the same
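As a standalone sketch of the mechanism the C++ comment and the test above refer to (toy data made up for illustration; only the survival package is assumed): the censoring distribution G(t) is a Kaplan-Meier fit with the status indicator flipped, and it drops to zero whenever the last observed time is censored.

library(survival)

time = c(2, 3, 5, 7, 9)
status = c(1, 1, 0, 1, 0) # last observation is censored

# Kaplan-Meier estimate of the censoring distribution G(t):
# censoring is treated as the event, so the status indicator is flipped
G = survfit(Surv(time, 1 - status) ~ 1)
summary(G)

# G(t) = 0 at t = 9 because the last observation is censored, so the IPCW
# weight 1/G(t) would divide by the small constant eps there and inflate
# the score; a cutoff t_max < 9 avoids evaluating in that region.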