From 1e4bae6a4b11af73d27d5d34b103efc589bc631a Mon Sep 17 00:00:00 2001
From: bblodfon This calibration method is defined by calculating the following statistic:
+ This calibration method is defined by calculating the following statistic:
$$s = B/n \sum_i (P_i - n/B)^2$$
where \(B\) is the number of 'buckets' (that equally divide \([0,1]\) into intervals),
\(n\) is the number of predictions, and \(P_i\) is the observed proportion
@@ -82,8 +85,8 @@
-A model is well-calibrated if \(s \sim Unif(B)\), tested with
+A model is well D-calibrated if \(s \sim Unif(B)\), tested with

Changelog
mlr3proba 0.7.1

- Fixed the `$predict_type` of `PipeOp`s and pipelines related to survival => regression reduction techniques (see #414): the `$predict_type` of `survtoclassif_disctime` and `survtoclassif_IPCW` was `prob` (classification type) and not `crank` (survival type).
- Fixed a bug where `t_max|p_max` specified in scoring rules didn't influence evaluation at all.
- Improved documentation of `t_max` in scoring rules; added examples in scoring rules and AUC scores.
- Added `remove_obs` in scoring rules to remove observations with observed time t > `t_max` as a processing step to alleviate IPCW issues. This was before 'hard-coded', which made the Integrated Brier Score (`msr("surv.graf")`) differ minimally from other implementations and the original definition.

mlr3proba 0.7.0
diff --git a/pkgdown.yml b/pkgdown.yml
index 1961a056..2ab2f03f 100644
--- a/pkgdown.yml
+++ b/pkgdown.yml
@@ -2,7 +2,7 @@ pandoc: 3.1.11
pkgdown: 2.1.1
pkgdown_sha: ~
articles: {}
-last_built: 2024-12-11T14:20Z
+last_built: 2024-12-18T19:24Z
urls:
reference: https://mlr3proba.mlr-org.com/reference
article: https://mlr3proba.mlr-org.com/articles
diff --git a/reference/index.html b/reference/index.html
index e02795dc..64a7d87b 100644
--- a/reference/index.html
+++ b/reference/index.html
@@ -192,7 +192,7 @@ Survival Measuresmlr_measures_surv.dcalib
MeasureSurvDCalibration
-
+ experimental
Arguments
+
Examples
+ library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# Integrated AUC score
+p$score(msr("surv.chambless_auc"), task = task, train_set = part$train, learner = cox)
+#> surv.chambless_auc
+#> 0.741112
+
+# AUC at specific time point
+p$score(msr("surv.chambless_auc", times = 600), task = task, train_set = part$train, learner = cox)
+#> surv.chambless_auc
+#> 0.7985259
+
+# Integrated AUC at specific time points
+p$score(msr("surv.chambless_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox)
+#> surv.chambless_auc
+#> 0.7276849
+
+
Examples# Harrell's C-index
p$score(msr("surv.cindex")) # same as `p$score()`
#> surv.cindex
-#> 0.775
+#> 0.7987073
# Uno's C-index
p$score(msr("surv.cindex", weight_meth = "G2"),
task = task, train_set = part$train)
#> surv.cindex
-#> 0.7766333
+#> 0.8142707
# Harrell's C-index evaluated up to a specific time horizon
p$score(msr("surv.cindex", t_max = 97))
#> surv.cindex
-#> 0.7730956
+#> 0.7944026
+
# Harrell's C-index evaluated up to the time corresponding to 30% of censoring
p$score(msr("surv.cindex", p_max = 0.3))
#> surv.cindex
-#> 0.762
+#> 0.7766423
diff --git a/reference/mlr_measures_surv.dcalib.html b/reference/mlr_measures_surv.dcalib.html
index b6aa58c6..f22eaad4 100644
--- a/reference/mlr_measures_surv.dcalib.html
+++ b/reference/mlr_measures_surv.dcalib.html
@@ -1,5 +1,6 @@
-
D-Calibration Survival Measure
D-Calibration Survival Measure
\(i\)th bucket, if its predicted survival probability at the time of event
falls within the corresponding interval.
This statistic assumes that censoring time is independent of death time.
-chisq.test
-(\(p > 0.05\) if well-calibrated).
+chisq.test
+(\(p > 0.05\) if well-calibrated, i.e. higher p-values are preferred).
Model \(i\) is better calibrated than model \(j\) if \(s(i) < s(j)\),
meaning that lower values of this measure are preferred.Details
The former is useful for model comparison whereas the latter is useful for determining if a model
is well-calibrated. If
chisq = FALSE
and s
is the predicted value then you can manually
compute the p.value with pchisq(s, B - 1, lower.tail = FALSE)
.
NOTE: This measure is still experimental both theoretically and in implementation. Results +
NOTE: This measure is still experimental both theoretically and in implementation. Results should therefore only be taken as an indicator of performance and not for conclusive judgements about model calibration.
@@ -136,11 +139,12 @@$$L_{ISBS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau$$
+$$L_{ISBS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau$$
where \(G\) is the Kaplan-Meier estimate of the censoring distribution.
The re-weighted ISBS (RISBS) is:
-$$L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(t_i)} \ d\tau$$
+$$L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}$$
which is always weighted by \(G(t_i)\) and is equal to zero for a censored subject.
To get a single score across all \(N\) observations of the test set, we return the average of the time-integrated observation-wise scores: $$\sum_{i=1}^N L(S_i, t_i, \delta_i) / N$$ @@ -88,7 +88,7 @@
Id | Type | Default | Levels | Range |
integrated | logical | TRUE | TRUE, FALSE | - |
times | untyped | - | - | |
t_max | numeric | - | \([0, \infty)\) | |
p_max | numeric | - | \([0, 1]\) | |
method | integer | 2 | \([1, 2]\) | |
se | logical | FALSE | TRUE, FALSE | - |
proper | logical | FALSE | TRUE, FALSE | - |
eps | numeric | 0.001 | \([0, 1]\) | |
ERV | logical | FALSE | TRUE, FALSE | - |
Id | Type | Default | Levels | Range |
integrated | logical | TRUE | TRUE, FALSE | - |
times | untyped | - | - | |
t_max | numeric | - | \([0, \infty)\) | |
p_max | numeric | - | \([0, 1]\) | |
method | integer | 2 | \([1, 2]\) | |
se | logical | FALSE | TRUE, FALSE | - |
proper | logical | FALSE | TRUE, FALSE | - |
eps | numeric | 0.001 | \([0, 1]\) | |
ERV | logical | FALSE | TRUE, FALSE | - |
remove_obs | logical | FALSE | TRUE, FALSE | - |
Inf
time horizon is assumed.
RISBS is strictly proper when the censoring distribution is independent
of the survival distribution and when \(G(t)\) is fit on a sufficiently large dataset.
ISBS is never proper. Use proper = FALSE
for ISBS and
proper = TRUE
for RISBS.
Results may be very different if many observations are censored at the last
observed time due to division by \(eps\) in proper = TRUE
.
See Sonabend et al. (2024) for more details.
+The use of proper = TRUE
is considered experimental and should be used with caution.
If task
and train_set
are passed to $score
then \(G(t)\) is fit on training data,
-otherwise testing data. The first is likely to reduce any bias caused by calculating
-parts of the measure on the test data it is evaluating. The training data is automatically
-used in scoring resamplings.
If task
and train_set
are passed to $score
then \(G(t)\) is fit using
+all observations from the train set, otherwise the test set is used.
+Using the train set is likely to reduce any bias caused by calculating parts of the
+measure on the test data it is evaluating.
+Also usually it means that more data is used for fitting the censoring
+distribution \(G(t)\) via the Kaplan-Meier.
+The training data is automatically used in scoring resamplings.
If t_max
or p_max
is given, then \(G(t)\) will be fitted using all observations from the
-train set (or test set) and only then the cutoff time will be applied.
-This is to ensure that more data is used for fitting the censoring distribution via the
-Kaplan-Meier.
-Setting the t_max
can help alleviate inflation of the score when proper
is TRUE
,
-in cases where an observation is censored at the last observed time point.
-This results in \(G(t_{max}) = 0\) and the use of eps
instead (when t_max
is NULL
).
If t_max
or p_max
is given, then the predicted survival function \(S(t)\) is
+truncated at the time cutoff for all observations.
Also, if remove_obs = TRUE
, observations with observed times \(t > t_{max}\) are removed.
+This data preprocessing step mitigates issues that arise when using IPCW
+in cases of administrative censoring, see Kvamme et al. (2023).
+Practically, this step, along with setting a time cutoff t_max
, helps mitigate
+the inflation of the score observed when an observation is censored at the
+final time point. In such cases, \(G(t) = 0\), triggering the use of a
+small constant eps
instead.
+This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024)
+for more details.
+Note that the t_max
and remove_obs
parameters do not affect the estimation
+of the censoring distribution, i.e. always all the observations are used for estimating \(G(t)\).
If remove_obs = FALSE
, inflated scores may occur. While this aligns more closely
+with the definitions presented in the original papers, it can lead to misleading
+evaluation and poor optimization outcomes when using this score for model tuning.
Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +“Examining properness in the external validation of survival models with squared and logarithmic losses.” +https://arxiv.org/abs/2212.05260v2.
+Kvamme, Havard, Borgan, Ornulf (2023). +“The Brier Score under Administrative Censoring: Problems and a Solution.” +Journal of Machine Learning Research, 24(2), 1–26. +ISSN 1533-7928, http://jmlr.org/papers/v24/19-1030.html.
library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# ISBS, G(t) calculated using the test set
+p$score(msr("surv.graf"))
+#> surv.graf
+#> 0.1660985
+
+# ISBS, G(t) calculated using the train set (always recommended)
+p$score(msr("surv.graf"), task = task, train_set = part$train)
+#> surv.graf
+#> 0.18103
+
+# ISBS, ERV score (comparing with KM baseline)
+p$score(msr("surv.graf", ERV = TRUE), task = task, train_set = part$train)
+#> surv.graf
+#> 0.01022285
+
+# ISBS at specific time point
+p$score(msr("surv.graf", times = 365), task = task, train_set = part$train)
+#> surv.graf
+#> 0.2754057
+
+# ISBS at multiple time points (integrated)
+p$score(msr("surv.graf", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train)
+#> surv.graf
+#> 0.224104
+
+# ISBS, use time cutoff
+p$score(msr("surv.graf", t_max = 700), task = task, train_set = part$train)
+#> surv.graf
+#> 0.2009658
+
+# ISBS, use time cutoff and also remove observations
+p$score(msr("surv.graf", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train)
+#> surv.graf
+#> 0.1851955
+
+# ISBS, use time cutoff corresponding to specific proportion of censoring on the test set
+p$score(msr("surv.graf", p_max = 0.8), task = task, train_set = part$train)
+#> surv.graf
+#> 0.2022914
+
+# RISBS, G(t) calculated using the train set
+p$score(msr("surv.graf", proper = TRUE), task = task, train_set = part$train)
+#> surv.graf
+#> 0.1896551
+
+
library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# Integrated AUC score
+p$score(msr("surv.hung_auc"), task = task, train_set = part$train, learner = cox)
+#> surv.hung_auc
+#> 1.172293
+
+# AUC at specific time point
+p$score(msr("surv.hung_auc", times = 600), task = task, train_set = part$train, learner = cox)
+#> surv.hung_auc
+#> 1.568774
+
+# Integrated AUC at specific time points
+p$score(msr("surv.hung_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox)
+#> surv.hung_auc
+#> 0.9826444
+
+
$$L_{ISLL}(S_i, t_i, \delta_i) = -\text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau$$
+$$L_{ISLL}(S_i, t_i, \delta_i) = - \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau$$
where \(G\) is the Kaplan-Meier estimate of the censoring distribution.
The re-weighted ISLL (RISLL) is:
-$$L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)] \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau)}{G(t_i)} \ d\tau$$
+$$L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)] \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}$$
which is always weighted by \(G(t_i)\) and is equal to zero for a censored subject.
To get a single score across all \(N\) observations of the test set, we return the average of the time-integrated observation-wise scores: $$\sum_{i=1}^N L(S_i, t_i, \delta_i) / N$$ @@ -88,7 +88,7 @@
Id | Type | Default | Levels | Range |
integrated | logical | TRUE | TRUE, FALSE | - |
times | untyped | - | - | |
t_max | numeric | - | \([0, \infty)\) | |
p_max | numeric | - | \([0, 1]\) | |
method | integer | 2 | \([1, 2]\) | |
se | logical | FALSE | TRUE, FALSE | - |
proper | logical | FALSE | TRUE, FALSE | - |
eps | numeric | 0.001 | \([0, 1]\) | |
ERV | logical | FALSE | TRUE, FALSE | - |
Id | Type | Default | Levels | Range |
integrated | logical | TRUE | TRUE, FALSE | - |
times | untyped | - | - | |
t_max | numeric | - | \([0, \infty)\) | |
p_max | numeric | - | \([0, 1]\) | |
method | integer | 2 | \([1, 2]\) | |
se | logical | FALSE | TRUE, FALSE | - |
proper | logical | FALSE | TRUE, FALSE | - |
eps | numeric | 0.001 | \([0, 1]\) | |
ERV | logical | FALSE | TRUE, FALSE | - |
remove_obs | logical | FALSE | TRUE, FALSE | - |
Inf
time horizon is assumed.
RISLL is strictly proper when the censoring distribution is independent
of the survival distribution and when \(G(t)\) is fit on a sufficiently large dataset.
ISLL is never proper. Use proper = FALSE
for ISLL and
proper = TRUE
for RISLL.
Results may be very different if many observations are censored at the last
observed time due to division by \(eps\) in proper = TRUE
.
See Sonabend et al. (2024) for more details.
+The use of proper = TRUE
is considered experimental and should be used with caution.
If task
and train_set
are passed to $score
then \(G(t)\) is fit on training data,
-otherwise testing data. The first is likely to reduce any bias caused by calculating
-parts of the measure on the test data it is evaluating. The training data is automatically
-used in scoring resamplings.
If task
and train_set
are passed to $score
then \(G(t)\) is fit using
+all observations from the train set, otherwise the test set is used.
+Using the train set is likely to reduce any bias caused by calculating parts of the
+measure on the test data it is evaluating.
+Also usually it means that more data is used for fitting the censoring
+distribution \(G(t)\) via the Kaplan-Meier.
+The training data is automatically used in scoring resamplings.
If t_max
or p_max
is given, then \(G(t)\) will be fitted using all observations from the
-train set (or test set) and only then the cutoff time will be applied.
-This is to ensure that more data is used for fitting the censoring distribution via the
-Kaplan-Meier.
-Setting the t_max
can help alleviate inflation of the score when proper
is TRUE
,
-in cases where an observation is censored at the last observed time point.
-This results in \(G(t_{max}) = 0\) and the use of eps
instead (when t_max
is NULL
).
If t_max
or p_max
is given, then the predicted survival function \(S(t)\) is
+truncated at the time cutoff for all observations.
Also, if remove_obs = TRUE
, observations with observed times \(t > t_{max}\) are removed.
+This data preprocessing step mitigates issues that arise when using IPCW
+in cases of administrative censoring, see Kvamme et al. (2023).
+Practically, this step, along with setting a time cutoff t_max
, helps mitigate
+the inflation of the score observed when an observation is censored at the
+final time point. In such cases, \(G(t) = 0\), triggering the use of a
+small constant eps
instead.
+This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024)
+for more details.
+Note that the t_max
and remove_obs
parameters do not affect the estimation
+of the censoring distribution, i.e. always all the observations are used for estimating \(G(t)\).
If remove_obs = FALSE
, inflated scores may occur. While this aligns more closely
+with the definitions presented in the original papers, it can lead to misleading
+evaluation and poor optimization outcomes when using this score for model tuning.
Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +“Examining properness in the external validation of survival models with squared and logarithmic losses.” +https://arxiv.org/abs/2212.05260v2.
+Kvamme, Havard, Borgan, Ornulf (2023). +“The Brier Score under Administrative Censoring: Problems and a Solution.” +Journal of Machine Learning Research, 24(2), 1–26. +ISSN 1533-7928, http://jmlr.org/papers/v24/19-1030.html.
library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# ISLL, G(t) calculated using the test set
+p$score(msr("surv.intlogloss"))
+#> surv.intlogloss
+#> 0.4976977
+
+# ISLL, G(t) calculated using the train set (always recommended)
+p$score(msr("surv.intlogloss"), task = task, train_set = part$train)
+#> surv.intlogloss
+#> 0.5302097
+
+# ISLL, ERV score (comparing with KM baseline)
+p$score(msr("surv.intlogloss", ERV = TRUE), task = task, train_set = part$train)
+#> surv.intlogloss
+#> -0.1576643
+
+# ISLL at specific time point
+p$score(msr("surv.intlogloss", times = 365), task = task, train_set = part$train)
+#> surv.intlogloss
+#> 0.6735551
+
+# ISLL at multiple time points (integrated)
+p$score(msr("surv.intlogloss", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train)
+#> surv.intlogloss
+#> 0.6035742
+
+# ISLL, use time cutoff
+p$score(msr("surv.intlogloss", t_max = 700), task = task, train_set = part$train)
+#> surv.intlogloss
+#> 0.5791972
+
+# ISLL, use time cutoff and also remove observations
+p$score(msr("surv.intlogloss", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train)
+#> surv.intlogloss
+#> 0.5369217
+
+# ISLL, use time cutoff corresponding to specific proportion of censoring on the test set
+p$score(msr("surv.intlogloss", p_max = 0.8), task = task, train_set = part$train)
+#> surv.intlogloss
+#> 0.5808817
+
+# RISLL, G(t) calculated using the train set
+p$score(msr("surv.intlogloss", proper = TRUE), task = task, train_set = part$train)
+#> surv.intlogloss
+#> 0.3776268
+
+
If task
and train_set
are passed to $score
then \(G(t)\) is fit on training data,
-otherwise testing data. The first is likely to reduce any bias caused by calculating
-parts of the measure on the test data it is evaluating. The training data is automatically
-used in scoring resamplings.
If task
and train_set
are passed to $score
then \(G(t)\) is fit using
+all observations from the train set, otherwise the test set is used.
+Using the train set is likely to reduce any bias caused by calculating parts of the
+measure on the test data it is evaluating.
+Also usually it means that more data is used for fitting the censoring
+distribution \(G(t)\) via the Kaplan-Meier.
+The training data is automatically used in scoring resamplings.
Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +“Examining properness in the external validation of survival models with squared and logarithmic losses.” +https://arxiv.org/abs/2212.05260v2.
$$L_{ISS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau$$
+$$L_{ISS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau$$
where \(G\) is the Kaplan-Meier estimate of the censoring distribution.
The re-weighted ISS (RISS) is:
-$$L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau)}{G(t_i)} \ d\tau$$
+$$L_{RISS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}$$
which is always weighted by \(G(t_i)\) and is equal to zero for a censored subject.
To get a single score across all \(N\) observations of the test set, we return the average of the time-integrated observation-wise scores: $$\sum_{i=1}^N L(S_i, t_i, \delta_i) / N$$
-$$L_{ISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t^*))]$$ -where \(G\) is the Kaplan-Meier estimate of the censoring distribution.
-The re-weighted ISS, RISS is given by -$$L_{RISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t))]$$
Id | Type | Default | Levels | Range |
integrated | logical | TRUE | TRUE, FALSE | - |
times | untyped | - | - | |
t_max | numeric | - | \([0, \infty)\) | |
p_max | numeric | - | \([0, 1]\) | |
method | integer | 2 | \([1, 2]\) | |
se | logical | FALSE | TRUE, FALSE | - |
proper | logical | FALSE | TRUE, FALSE | - |
eps | numeric | 0.001 | \([0, 1]\) | |
ERV | logical | FALSE | TRUE, FALSE | - |
Id | Type | Default | Levels | Range |
integrated | logical | TRUE | TRUE, FALSE | - |
times | untyped | - | - | |
t_max | numeric | - | \([0, \infty)\) | |
p_max | numeric | - | \([0, 1]\) | |
method | integer | 2 | \([1, 2]\) | |
se | logical | FALSE | TRUE, FALSE | - |
proper | logical | FALSE | TRUE, FALSE | - |
eps | numeric | 0.001 | \([0, 1]\) | |
ERV | logical | FALSE | TRUE, FALSE | - |
remove_obs | logical | FALSE | TRUE, FALSE | - |
Inf
time horizon is assumed.
RISS is strictly proper when the censoring distribution is independent
of the survival distribution and when \(G(t)\) is fit on a sufficiently large dataset.
ISS is never proper. Use proper = FALSE
for ISS and
proper = TRUE
for RISS.
Results may be very different if many observations are censored at the last
observed time due to division by \(eps\) in proper = TRUE
.
See Sonabend et al. (2024) for more details.
+The use of proper = TRUE
is considered experimental and should be used with caution.
If task
and train_set
are passed to $score
then \(G(t)\) is fit on training data,
-otherwise testing data. The first is likely to reduce any bias caused by calculating
-parts of the measure on the test data it is evaluating. The training data is automatically
-used in scoring resamplings.
If task
and train_set
are passed to $score
then \(G(t)\) is fit using
+all observations from the train set, otherwise the test set is used.
+Using the train set is likely to reduce any bias caused by calculating parts of the
+measure on the test data it is evaluating.
+Also usually it means that more data is used for fitting the censoring
+distribution \(G(t)\) via the Kaplan-Meier.
+The training data is automatically used in scoring resamplings.
If t_max
or p_max
is given, then \(G(t)\) will be fitted using all observations from the
-train set (or test set) and only then the cutoff time will be applied.
-This is to ensure that more data is used for fitting the censoring distribution via the
-Kaplan-Meier.
-Setting the t_max
can help alleviate inflation of the score when proper
is TRUE
,
-in cases where an observation is censored at the last observed time point.
-This results in \(G(t_{max}) = 0\) and the use of eps
instead (when t_max
is NULL
).
If t_max
or p_max
is given, then the predicted survival function \(S(t)\) is
+truncated at the time cutoff for all observations.
Also, if remove_obs = TRUE
, observations with observed times \(t > t_{max}\) are removed.
+This data preprocessing step mitigates issues that arise when using IPCW
+in cases of administrative censoring, see Kvamme et al. (2023).
+Practically, this step, along with setting a time cutoff t_max
, helps mitigate
+the inflation of the score observed when an observation is censored at the
+final time point. In such cases, \(G(t) = 0\), triggering the use of a
+small constant eps
instead.
+This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024)
+for more details.
+Note that the t_max
and remove_obs
parameters do not affect the estimation
+of the censoring distribution, i.e. always all the observations are used for estimating \(G(t)\).
If remove_obs = FALSE
, inflated scores may occur. While this aligns more closely
+with the definitions presented in the original papers, it can lead to misleading
+evaluation and poor optimization outcomes when using this score for model tuning.
Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +“Examining properness in the external validation of survival models with squared and logarithmic losses.” +https://arxiv.org/abs/2212.05260v2.
+Kvamme, Havard, Borgan, Ornulf (2023). +“The Brier Score under Administrative Censoring: Problems and a Solution.” +Journal of Machine Learning Research, 24(2), 1–26. +ISSN 1533-7928, http://jmlr.org/papers/v24/19-1030.html.
library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# ISS, G(t) calculated using the test set
+p$score(msr("surv.schmid"))
+#> surv.schmid
+#> 0.2901703
+
+# ISS, G(t) calculated using the train set (always recommended)
+p$score(msr("surv.schmid"), task = task, train_set = part$train)
+#> surv.schmid
+#> 0.2568166
+
+# ISS, ERV score (comparing with KM baseline)
+p$score(msr("surv.schmid", ERV = TRUE), task = task, train_set = part$train)
+#> surv.schmid
+#> 0.01762828
+
+# ISS at specific time point
+p$score(msr("surv.schmid", times = 365), task = task, train_set = part$train)
+#> surv.schmid
+#> 0.4244478
+
+# ISS at multiple time points (integrated)
+p$score(msr("surv.schmid", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train)
+#> surv.schmid
+#> 0.3551183
+
+# ISS, use time cutoff
+p$score(msr("surv.schmid", t_max = 700), task = task, train_set = part$train)
+#> surv.schmid
+#> 0.3236368
+
+# ISS, use time cutoff and also remove observations
+p$score(msr("surv.schmid", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train)
+#> surv.schmid
+#> 0.2893284
+
+# ISS, use time cutoff corresponding to specific proportion of censoring on the test set
+p$score(msr("surv.schmid", p_max = 0.8), task = task, train_set = part$train)
+#> surv.schmid
+#> 0.3300005
+
+# RISS, G(t) calculated using the train set
+p$score(msr("surv.schmid", proper = TRUE), task = task, train_set = part$train)
+#> surv.schmid
+#> 0.2003059
+
+
library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# Integrated AUC score
+p$score(msr("surv.song_auc"), task = task, train_set = part$train, learner = cox)
+#> surv.song_auc
+#> 0.633294
+
+# AUC at specific time point
+p$score(msr("surv.song_auc", times = 600), task = task, train_set = part$train, learner = cox)
+#> surv.song_auc
+#> 0.6156616
+
+# Integrated AUC at specific time points
+p$score(msr("surv.song_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox)
+#> surv.song_auc
+#> 0.631936
+
+
library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# Integrated AUC score
+p$score(msr("surv.uno_auc"), task = task, train_set = part$train, learner = cox)
+#> surv.uno_auc
+#> 0.7062886
+
+# AUC at specific time point
+p$score(msr("surv.uno_auc", times = 600), task = task, train_set = part$train, learner = cox)
+#> surv.uno_auc
+#> 0.645577
+
+# Integrated AUC at specific time points
+p$score(msr("surv.uno_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox)
+#> surv.uno_auc
+#> 0.6787879
+
+