From 1e4bae6a4b11af73d27d5d34b103efc589bc631a Mon Sep 17 00:00:00 2001
From: bblodfon
Date: Wed, 18 Dec 2024 19:25:30 +0000
Subject: Deploying to gh-pages from @ mlr-org/mlr3proba@a35a4c446c6054f0e681331fc7c6d9d270ba42fe 🚀
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 news/index.html                                |   7 +-
 pkgdown.yml                                    |   2 +-
 reference/index.html                           |   2 +-
 .../mlr_measures_surv.chambless_auc.html       |  34 +++++
 reference/mlr_measures_surv.cindex.html        |   9 +-
 reference/mlr_measures_surv.dcalib.html        |  34 +++--
 reference/mlr_measures_surv.graf.html          | 132 ++++++++++++++---
 reference/mlr_measures_surv.hung_auc.html      |  34 +++++
 reference/mlr_measures_surv.intlogloss.html    | 132 ++++++++++++++---
 reference/mlr_measures_surv.logloss.html       |  19 ++-
 reference/mlr_measures_surv.schmid.html        | 136 ++++++++++++++----
 reference/mlr_measures_surv.song_auc.html      |  34 +++++
 reference/mlr_measures_surv.uno_auc.html       |  34 +++++
 reference/mlr_task_generators_coxed.html       |   2 +-
 reference/mlr_task_generators_simdens.html     |  16 +--
 reference/mlr_task_generators_simsurv.html     |  28 ++--
 search.json                                    |   2 +-
 17 files changed, 538 insertions(+), 119 deletions(-)

diff --git a/news/index.html b/news/index.html
index b151d845..fc3e1efc 100644
--- a/news/index.html
+++ b/news/index.html
@@ -52,8 +52,11 @@

Changelog

mlr3proba 0.7.1

-

mlr3proba 0.7.0

diff --git a/pkgdown.yml b/pkgdown.yml
index 1961a056..2ab2f03f 100644
--- a/pkgdown.yml
+++ b/pkgdown.yml
@@ -2,7 +2,7 @@
 pandoc: 3.1.11
 pkgdown: 2.1.1
 pkgdown_sha: ~
 articles: {}
-last_built: 2024-12-11T14:20Z
+last_built: 2024-12-18T19:24Z
 urls:
   reference: https://mlr3proba.mlr-org.com/reference
   article: https://mlr3proba.mlr-org.com/articles
diff --git a/reference/index.html b/reference/index.html
index e02795dc..64a7d87b 100644
--- a/reference/index.html
+++ b/reference/index.html
@@ -192,7 +192,7 @@

Survival Measures
 mlr_measures_surv.dcalib MeasureSurvDCalibration
-
+ experimental
D-Calibration Survival Measure
diff --git a/reference/mlr_measures_surv.chambless_auc.html b/reference/mlr_measures_surv.chambless_auc.html
index 326ab105..1615cd67 100644
--- a/reference/mlr_measures_surv.chambless_auc.html
+++ b/reference/mlr_measures_surv.chambless_auc.html
@@ -200,6 +200,40 @@

Arguments +

Examples

+
library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# Integrated AUC score
+p$score(msr("surv.chambless_auc"), task = task, train_set = part$train, learner = cox)
+#> surv.chambless_auc 
+#>           0.741112 
+
+# AUC at specific time point
+p$score(msr("surv.chambless_auc", times = 600), task = task, train_set = part$train, learner = cox)
+#> surv.chambless_auc 
+#>          0.7985259 
+
+# Integrated AUC at specific time points
+p$score(msr("surv.chambless_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox)
+#> surv.chambless_auc 
+#>          0.7276849 
+
+
+

diff --git a/reference/mlr_measures_surv.cindex.html b/reference/mlr_measures_surv.cindex.html
index 5bd61e39..9649f2d9 100644
--- a/reference/mlr_measures_surv.cindex.html
+++ b/reference/mlr_measures_surv.cindex.html
@@ -231,22 +231,23 @@

Examples
# Harrell's C-index
p$score(msr("surv.cindex")) # same as `p$score()`
 #> surv.cindex
-#>       0.775
+#>   0.7987073

# Uno's C-index
p$score(msr("surv.cindex", weight_meth = "G2"), task = task, train_set = part$train)
 #> surv.cindex
-#>   0.7766333
+#>   0.8142707

# Harrell's C-index evaluated up to a specific time horizon
p$score(msr("surv.cindex", t_max = 97))
 #> surv.cindex
-#>   0.7730956
+#>   0.7944026
+
# Harrell's C-index evaluated up to the time corresponding to 30% of censoring
p$score(msr("surv.cindex", p_max = 0.3))
 #> surv.cindex
-#>       0.762
+#>   0.7766423

diff --git a/reference/mlr_measures_surv.dcalib.html b/reference/mlr_measures_surv.dcalib.html
index b6aa58c6..f22eaad4 100644
--- a/reference/mlr_measures_surv.dcalib.html
+++ b/reference/mlr_measures_surv.dcalib.html
@@ -1,5 +1,6 @@
-D-Calibration Survival Measure — mlr_measures_surv.dcalib • mlr3proba
+D-Calibration Survival Measure — mlr_measures_surv.dcalib • mlr3proba

D-Calibration Survival Measure

-

This calibration method is defined by calculating the following statistic: +

[Experimental]

+

This calibration method is defined by calculating the following statistic: $$s = B/n \sum_i (P_i - n/B)^2$$ where \(B\) is number of 'buckets' (that equally divide \([0,1]\) into intervals), \(n\) is the number of predictions, and \(P_i\) is the observed proportion @@ -82,8 +85,8 @@

D-Calibration Survival Measure

\(i\)th bucket, if its predicted survival probability at the time of event falls within the corresponding interval. This statistic assumes that censoring time is independent of death time.

-

A model is well-calibrated if \(s \sim Unif(B)\), tested with chisq.test -(\(p > 0.05\) if well-calibrated). +

A model is well D-calibrated if \(s \sim Unif(B)\), tested with chisq.test +(\(p > 0.05\) if well-calibrated, i.e. higher p-values are preferred). Model \(i\) is better calibrated than model \(j\) if \(s(i) < s(j)\), meaning that lower values of this measure are preferred.

@@ -95,7 +98,7 @@

Detailschisq = FALSE and s is the predicted value then you can manually compute the p.value with pchisq(s, B - 1, lower.tail = FALSE).

-

NOTE: This measure is still experimental both theoretically and in implementation. Results +

NOTE: This measure is still experimental both theoretically and in implementation. Results should therefore only be taken as an indicator of performance and not for conclusive judgements about model calibration.
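As a minimal sketch of the manual p-value computation described above (assuming the lung task and Cox learner used in the other measure examples, and the measure's default of B = 10 buckets; exact values vary with the random partition):

library(mlr3)

task = tsk("lung")
part = partition(task)

cox = lrn("surv.coxph")
cox$train(task, row_ids = part$train)
p = cox$predict(task, row_ids = part$test)

# D-Calibration statistic (with chisq = FALSE the statistic s itself is returned)
s = p$score(msr("surv.dcalib", chisq = FALSE))

# manual p-value; p > 0.05 suggests the model is well D-calibrated
pchisq(s, 10 - 1, lower.tail = FALSE)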

@@ -136,11 +139,12 @@

Parameter details
Details
The observation-wise loss integrated across the time dimension up to the time cutoff \(\tau^*\), is:

-

$$L_{ISBS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau$$

+

$$L_{ISBS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(\tau)} \ d\tau$$

where \(G\) is the Kaplan-Meier estimate of the censoring distribution.

The re-weighted ISBS (RISBS) is:

-

$$L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau)}{G(t_i)} \ d\tau$$

+

$$L_{RISBS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i^2(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau))^2 \text{I}(t_i > \tau) \ d\tau}{G(t_i)}$$

which is always weighted by \(G(t_i)\) and is equal to zero for a censored subject.

To get a single score across all \(N\) observations of the test set, we return the average of the time-integrated observation-wise scores: @@ -88,7 +88,7 @@

Dictionary

Parameters

-
Id         | Type    | Default | Levels      | Range
integrated | logical | TRUE    | TRUE, FALSE | -
times      | untyped | -       |             | -
t_max      | numeric | -       |             | \([0, \infty)\)
p_max      | numeric | -       |             | \([0, 1]\)
method     | integer | 2       |             | \([1, 2]\)
se         | logical | FALSE   | TRUE, FALSE | -
proper     | logical | FALSE   | TRUE, FALSE | -
eps        | numeric | 0.001   |             | \([0, 1]\)
ERV        | logical | FALSE   | TRUE, FALSE | -
+
Id         | Type    | Default | Levels      | Range
integrated | logical | TRUE    | TRUE, FALSE | -
times      | untyped | -       |             | -
t_max      | numeric | -       |             | \([0, \infty)\)
p_max      | numeric | -       |             | \([0, 1]\)
method     | integer | 2       |             | \([1, 2]\)
se         | logical | FALSE   | TRUE, FALSE | -
proper     | logical | FALSE   | TRUE, FALSE | -
eps        | numeric | 0.001   |             | \([0, 1]\)
ERV        | logical | FALSE   | TRUE, FALSE | -
remove_obs | logical | FALSE   | TRUE, FALSE | -

Properness

- +

[Experimental]

RISBS is strictly proper when the censoring distribution is independent of the survival distribution and when \(G(t)\) is fit on a sufficiently large dataset. ISBS is never proper. Use proper = FALSE for ISBS and proper = TRUE for RISBS. Results may be very different if many observations are censored at the last observed time due to division by \(1/eps\) in proper = TRUE.

+

See Sonabend et al. (2024) for more details. +The use of proper = TRUE is considered experimental and should be used with caution.

Time points used for evaluation

@@ -198,23 +204,36 @@

Data used for Estimating Censoring Distribution -

If task and train_set are passed to $score then \(G(t)\) is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings.

+

If task and train_set are passed to $score then \(G(t)\) is fit using +all observations from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +It also usually means that more data is used for fitting the censoring +distribution \(G(t)\) via the Kaplan-Meier estimator. +The training data is automatically used in scoring resamplings.
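For example, a brief sketch of the resampling case (a hypothetical 3-fold cross-validation; scores vary by seed), where \(G(t)\) is fit on each fold's training data without any extra arguments:

library(mlr3)

task = tsk("lung")
cox = lrn("surv.coxph")

# during scoring, G(t) automatically uses each fold's training data
rr = resample(task, cox, rsmp("cv", folds = 3))
rr$aggregate(msr("surv.graf"))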

Time Cutoff Details

-

If t_max or p_max is given, then \(G(t)\) will be fitted using all observations from the -train set (or test set) and only then the cutoff time will be applied. -This is to ensure that more data is used for fitting the censoring distribution via the -Kaplan-Meier. -Setting the t_max can help alleviate inflation of the score when proper is TRUE, -in cases where an observation is censored at the last observed time point. -This results in \(G(t_{max}) = 0\) and the use of eps instead (when t_max is NULL).

+

If t_max or p_max is given, then the predicted survival function \(S(t)\) is +truncated at the time cutoff for all observations.

+

[Experimental]

+

Also, if remove_obs = TRUE, observations with observed times \(t > t_{max}\) are removed. +This data preprocessing step mitigates issues that arise when using IPCW +in cases of administrative censoring, see Kvamme et al. (2023). +Practically, this step, along with setting a time cutoff t_max, helps mitigate +the inflation of the score observed when an observation is censored at the +final time point. In such cases, \(G(t) = 0\), triggering the use of a +small constant eps instead. +This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +for more details. +Note that the t_max and remove_obs parameters do not affect the estimation +of the censoring distribution, i.e. all observations are always used for estimating \(G(t)\).

+

If remove_obs = FALSE, inflated scores may occur. While this aligns more closely +with the definitions presented in the original papers, it can lead to misleading +evaluation and poor optimization outcomes when using this score for model tuning.

References

@@ -223,6 +242,13 @@

Referencesdoi:10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5 .

+

Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +“Examining properness in the external validation of survival models with squared and logarithmic losses.” +https://arxiv.org/abs/2212.05260v2.

+

Kvamme, Havard, Borgan, Ornulf (2023). +“The Brier Score under Administrative Censoring: Problems and a Solution.” +Journal of Machine Learning Research, 24(2), 1–26. +ISSN 1533-7928, http://jmlr.org/papers/v24/19-1030.html.

See also

@@ -317,6 +343,70 @@

Arguments +

Examples

+
library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# ISBS, G(t) calculated using the test set
+p$score(msr("surv.graf"))
+#> surv.graf 
+#> 0.1660985 
+
+# ISBS, G(t) calculated using the train set (always recommended)
+p$score(msr("surv.graf"), task = task, train_set = part$train)
+#> surv.graf 
+#>   0.18103 
+
+# ISBS, ERV score (comparing with KM baseline)
+p$score(msr("surv.graf", ERV = TRUE), task = task, train_set = part$train)
+#>  surv.graf 
+#> 0.01022285 
+
+# ISBS at specific time point
+p$score(msr("surv.graf", times = 365), task = task, train_set = part$train)
+#> surv.graf 
+#> 0.2754057 
+
+# ISBS at multiple time points (integrated)
+p$score(msr("surv.graf", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train)
+#> surv.graf 
+#>  0.224104 
+
+# ISBS, use time cutoff
+p$score(msr("surv.graf", t_max = 700), task = task, train_set = part$train)
+#> surv.graf 
+#> 0.2009658 
+
+# ISBS, use time cutoff and also remove observations
+p$score(msr("surv.graf", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train)
+#> surv.graf 
+#> 0.1851955 
+
+# ISBS, use time cutoff corresponding to specific proportion of censoring on the test set
+p$score(msr("surv.graf", p_max = 0.8), task = task, train_set = part$train)
+#> surv.graf 
+#> 0.2022914 
+
+# RISBS, G(t) calculated using the train set
+p$score(msr("surv.graf", proper = TRUE), task = task, train_set = part$train)
+#> surv.graf 
+#> 0.1896551 
+
+
+

diff --git a/reference/mlr_measures_surv.hung_auc.html b/reference/mlr_measures_surv.hung_auc.html
index 1997b352..caac6a1c 100644
--- a/reference/mlr_measures_surv.hung_auc.html
+++ b/reference/mlr_measures_surv.hung_auc.html
@@ -199,6 +199,40 @@

Arguments +

Examples

+
library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# Integrated AUC score
+p$score(msr("surv.hung_auc"), task = task, train_set = part$train, learner = cox)
+#> surv.hung_auc 
+#>      1.172293 
+
+# AUC at specific time point
+p$score(msr("surv.hung_auc", times = 600), task = task, train_set = part$train, learner = cox)
+#> surv.hung_auc 
+#>      1.568774 
+
+# Integrated AUC at specific time points
+p$score(msr("surv.hung_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox)
+#> surv.hung_auc 
+#>     0.9826444 
+
+
+
diff --git a/reference/mlr_measures_surv.intlogloss.html b/reference/mlr_measures_surv.intlogloss.html
index 520c093a..2bd56cfa 100644
--- a/reference/mlr_measures_surv.intlogloss.html
+++ b/reference/mlr_measures_surv.intlogloss.html
@@ -66,10 +66,10 @@

Details
The observation-wise loss integrated across the time dimension up to the time cutoff \(\tau^*\), is:

-

$$L_{ISLL}(S_i, t_i, \delta_i) = -\text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau$$

+

$$L_{ISLL}(S_i, t_i, \delta_i) = - \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)] \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{\log[S_i(\tau)] \text{I}(t_i > \tau)}{G(\tau)} \ d\tau$$

where \(G\) is the Kaplan-Meier estimate of the censoring distribution.

The re-weighted ISLL (RISLL) is:

-

$$L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{\log[1-S_i(\tau)]) \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau)}{G(t_i)} \ d\tau$$

+

$$L_{RISLL}(S_i, t_i, \delta_i) = -\delta_i \frac{\int^{\tau^*}_0 \log[1-S_i(\tau)] \text{I}(t_i \leq \tau) + \log[S_i(\tau)] \text{I}(t_i > \tau) \ d\tau}{G(t_i)}$$

which is always weighted by \(G(t_i)\) and is equal to zero for a censored subject.

To get a single score across all \(N\) observations of the test set, we return the average of the time-integrated observation-wise scores: @@ -88,7 +88,7 @@

Dictionary

Parameters

-
Id         | Type    | Default | Levels      | Range
integrated | logical | TRUE    | TRUE, FALSE | -
times      | untyped | -       |             | -
t_max      | numeric | -       |             | \([0, \infty)\)
p_max      | numeric | -       |             | \([0, 1]\)
method     | integer | 2       |             | \([1, 2]\)
se         | logical | FALSE   | TRUE, FALSE | -
proper     | logical | FALSE   | TRUE, FALSE | -
eps        | numeric | 0.001   |             | \([0, 1]\)
ERV        | logical | FALSE   | TRUE, FALSE | -
+
Id         | Type    | Default | Levels      | Range
integrated | logical | TRUE    | TRUE, FALSE | -
times      | untyped | -       |             | -
t_max      | numeric | -       |             | \([0, \infty)\)
p_max      | numeric | -       |             | \([0, 1]\)
method     | integer | 2       |             | \([1, 2]\)
se         | logical | FALSE   | TRUE, FALSE | -
proper     | logical | FALSE   | TRUE, FALSE | -
eps        | numeric | 0.001   |             | \([0, 1]\)
ERV        | logical | FALSE   | TRUE, FALSE | -
remove_obs | logical | FALSE   | TRUE, FALSE | -

Properness

- +

[Experimental]

RISLL is strictly proper when the censoring distribution is independent of the survival distribution and when \(G(t)\) is fit on a sufficiently large dataset. ISLL is never proper. Use proper = FALSE for ISLL and proper = TRUE for RISLL. Results may be very different if many observations are censored at the last observed time due to division by \(1/eps\) in proper = TRUE.

+

See Sonabend et al. (2024) for more details. +The use of proper = TRUE is considered experimental and should be used with caution.

Time points used for evaluation

@@ -198,23 +204,36 @@

Data used for Estimating Censoring Distribution -

If task and train_set are passed to $score then \(G(t)\) is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings.

+

If task and train_set are passed to $score then \(G(t)\) is fit using +all observations from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +It also usually means that more data is used for fitting the censoring +distribution \(G(t)\) via the Kaplan-Meier estimator. +The training data is automatically used in scoring resamplings.

Time Cutoff Details

-

If t_max or p_max is given, then \(G(t)\) will be fitted using all observations from the -train set (or test set) and only then the cutoff time will be applied. -This is to ensure that more data is used for fitting the censoring distribution via the -Kaplan-Meier. -Setting the t_max can help alleviate inflation of the score when proper is TRUE, -in cases where an observation is censored at the last observed time point. -This results in \(G(t_{max}) = 0\) and the use of eps instead (when t_max is NULL).

+

If t_max or p_max is given, then the predicted survival function \(S(t)\) is +truncated at the time cutoff for all observations.

+

[Experimental]

+

Also, if remove_obs = TRUE, observations with observed times \(t > t_{max}\) are removed. +This data preprocessing step mitigates issues that arise when using IPCW +in cases of administrative censoring, see Kvamme et al. (2023). +Practically, this step, along with setting a time cutoff t_max, helps mitigate +the inflation of the score observed when an observation is censored at the +final time point. In such cases, \(G(t) = 0\), triggering the use of a +small constant eps instead. +This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +for more details. +Note that the t_max and remove_obs parameters do not affect the estimation +of the censoring distribution, i.e. all observations are always used for estimating \(G(t)\).

+

If remove_obs = FALSE, inflated scores may occur. While this aligns more closely +with the definitions presented in the original papers, it can lead to misleading +evaluation and poor optimization outcomes when using this score for model tuning.

References

@@ -223,6 +242,13 @@

Referencesdoi:10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5 .

+

Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +“Examining properness in the external validation of survival models with squared and logarithmic losses.” +https://arxiv.org/abs/2212.05260v2.

+

Kvamme, Havard, Borgan, Ornulf (2023). +“The Brier Score under Administrative Censoring: Problems and a Solution.” +Journal of Machine Learning Research, 24(2), 1–26. +ISSN 1533-7928, http://jmlr.org/papers/v24/19-1030.html.

See also

@@ -317,6 +343,70 @@

Arguments +

Examples

+
library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# ISLL, G(t) calculated using the test set
+p$score(msr("surv.intlogloss"))
+#> surv.intlogloss 
+#>       0.4976977 
+
+# ISLL, G(t) calculated using the train set (always recommended)
+p$score(msr("surv.intlogloss"), task = task, train_set = part$train)
+#> surv.intlogloss 
+#>       0.5302097 
+
+# ISLL, ERV score (comparing with KM baseline)
+p$score(msr("surv.intlogloss", ERV = TRUE), task = task, train_set = part$train)
+#> surv.intlogloss 
+#>      -0.1576643 
+
+# ISLL at specific time point
+p$score(msr("surv.intlogloss", times = 365), task = task, train_set = part$train)
+#> surv.intlogloss 
+#>       0.6735551 
+
+# ISLL at multiple time points (integrated)
+p$score(msr("surv.intlogloss", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train)
+#> surv.intlogloss 
+#>       0.6035742 
+
+# ISLL, use time cutoff
+p$score(msr("surv.intlogloss", t_max = 700), task = task, train_set = part$train)
+#> surv.intlogloss 
+#>       0.5791972 
+
+# ISLL, use time cutoff and also remove observations
+p$score(msr("surv.intlogloss", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train)
+#> surv.intlogloss 
+#>       0.5369217 
+
+# ISLL, use time cutoff corresponding to specific proportion of censoring on the test set
+p$score(msr("surv.intlogloss", p_max = 0.8), task = task, train_set = part$train)
+#> surv.intlogloss 
+#>       0.5808817 
+
+# RISLL, G(t) calculated using the train set
+p$score(msr("surv.intlogloss", proper = TRUE), task = task, train_set = part$train)
+#> surv.intlogloss 
+#>       0.3776268 
+
+
+

diff --git a/reference/mlr_measures_surv.logloss.html b/reference/mlr_measures_surv.logloss.html
index fb71d9f1..04b8ccf0 100644
--- a/reference/mlr_measures_surv.logloss.html
+++ b/reference/mlr_measures_surv.logloss.html
@@ -115,17 +115,26 @@

Parameter details

Data used for Estimating Censoring Distribution

-

If task and train_set are passed to $score then \(G(t)\) is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings.

+

If task and train_set are passed to $score then \(G(t)\) is fit using +all observations from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +It also usually means that more data is used for fitting the censoring +distribution \(G(t)\) via the Kaplan-Meier estimator. +The training data is automatically used in scoring resamplings.
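As a short sketch of the two estimation modes (same hypothetical lung/Cox setup as the other measure pages; values vary with the partition):

library(mlr3)

task = tsk("lung")
part = partition(task)

cox = lrn("surv.coxph")
cox$train(task, row_ids = part$train)
p = cox$predict(task, row_ids = part$test)

# G(t) fit on the test set (no task/train_set passed)
p$score(msr("surv.logloss"))

# G(t) fit on the train set (recommended)
p$score(msr("surv.logloss"), task = task, train_set = part$train)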

+ +
+

References

+

Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +“Examining properness in the external validation of survival models with squared and logarithmic losses.” +https://arxiv.org/abs/2212.05260v2.

See also

diff --git a/reference/mlr_measures_surv.schmid.html b/reference/mlr_measures_surv.schmid.html
index 3871b8cc..7101a3cb 100644
--- a/reference/mlr_measures_surv.schmid.html
+++ b/reference/mlr_measures_surv.schmid.html
@@ -63,18 +63,14 @@

Details
The observation-wise loss integrated across the time dimension up to the time cutoff \(\tau^*\), is:

-

$$L_{ISS}(S_i, t_i, \delta_i) = \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau$$

+

$$L_{ISS}(S_i, t_i, \delta_i) = \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau, \delta_i=1)}{G(t_i)} + \frac{(1-S_i(\tau)) \text{I}(t_i > \tau)}{G(\tau)} \ d\tau$$

where \(G\) is the Kaplan-Meier estimate of the censoring distribution.

The re-weighted ISS (RISS) is:

-

$$L_{RISS}(S_i, t_i, \delta_i) = \delta_i \text{I}(t_i \leq \tau^*) \int^{\tau^*}_0 \frac{S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau)}{G(t_i)} \ d\tau$$

+

$$L_{RISS}(S_i, t_i, \delta_i) = \delta_i \frac{\int^{\tau^*}_0 S_i(\tau) \text{I}(t_i \leq \tau) + (1-S_i(\tau)) \text{I}(t_i > \tau) \ d\tau}{G(t_i)}$$

which is always weighted by \(G(t_i)\) and is equal to zero for a censored subject.

To get a single score across all \(N\) observations of the test set, we return the average of the time-integrated observation-wise scores: $$\sum_{i=1}^N L(S_i, t_i, \delta_i) / N$$

-

$$L_{ISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t^*))]$$ -where \(G\) is the Kaplan-Meier estimate of the censoring distribution.

-

The re-weighted ISS, RISS is given by -$$L_{RISS}(S,t|t^*) = [(S(t^*))I(t \le t^*, \delta = 1)(1/G(t))] + [((1 - S(t^*)))I(t > t^*)(1/G(t))]$$

Dictionary

@@ -89,7 +85,7 @@

Dictionary

Parameters

-
Id         | Type    | Default | Levels      | Range
integrated | logical | TRUE    | TRUE, FALSE | -
times      | untyped | -       |             | -
t_max      | numeric | -       |             | \([0, \infty)\)
p_max      | numeric | -       |             | \([0, 1]\)
method     | integer | 2       |             | \([1, 2]\)
se         | logical | FALSE   | TRUE, FALSE | -
proper     | logical | FALSE   | TRUE, FALSE | -
eps        | numeric | 0.001   |             | \([0, 1]\)
ERV        | logical | FALSE   | TRUE, FALSE | -

+
Id         | Type    | Default | Levels      | Range
integrated | logical | TRUE    | TRUE, FALSE | -
times      | untyped | -       |             | -
t_max      | numeric | -       |             | \([0, \infty)\)
p_max      | numeric | -       |             | \([0, 1]\)
method     | integer | 2       |             | \([1, 2]\)
se         | logical | FALSE   | TRUE, FALSE | -
proper     | logical | FALSE   | TRUE, FALSE | -
eps        | numeric | 0.001   |             | \([0, 1]\)
ERV        | logical | FALSE   | TRUE, FALSE | -
remove_obs | logical | FALSE   | TRUE, FALSE | -

Properness

- +

[Experimental]

RISS is strictly proper when the censoring distribution is independent of the survival distribution and when \(G(t)\) is fit on a sufficiently large dataset. ISS is never proper. Use proper = FALSE for ISS and proper = TRUE for RISS. Results may be very different if many observations are censored at the last observed time due to division by \(1/eps\) in proper = TRUE.

+

See Sonabend et al. (2024) for more details. +The use of proper = TRUE is considered experimental and should be used with caution.

Time points used for evaluation

@@ -199,23 +201,36 @@

Data used for Estimating Censoring Distribution -

If task and train_set are passed to $score then \(G(t)\) is fit on training data, -otherwise testing data. The first is likely to reduce any bias caused by calculating -parts of the measure on the test data it is evaluating. The training data is automatically -used in scoring resamplings.

+

If task and train_set are passed to $score then \(G(t)\) is fit using +all observations from the train set, otherwise the test set is used. +Using the train set is likely to reduce any bias caused by calculating parts of the +measure on the test data it is evaluating. +It also usually means that more data is used for fitting the censoring +distribution \(G(t)\) via the Kaplan-Meier estimator. +The training data is automatically used in scoring resamplings.

Time Cutoff Details

-

If t_max or p_max is given, then \(G(t)\) will be fitted using all observations from the -train set (or test set) and only then the cutoff time will be applied. -This is to ensure that more data is used for fitting the censoring distribution via the -Kaplan-Meier. -Setting the t_max can help alleviate inflation of the score when proper is TRUE, -in cases where an observation is censored at the last observed time point. -This results in \(G(t_{max}) = 0\) and the use of eps instead (when t_max is NULL).

+

If t_max or p_max is given, then the predicted survival function \(S(t)\) is +truncated at the time cutoff for all observations.

+

[Experimental]

+

Also, if remove_obs = TRUE, observations with observed times \(t > t_{max}\) are removed. +This data preprocessing step mitigates issues that arise when using IPCW +in cases of administrative censoring, see Kvamme et al. (2023). +Practically, this step, along with setting a time cutoff t_max, helps mitigate +the inflation of the score observed when an observation is censored at the +final time point. In such cases, \(G(t) = 0\), triggering the use of a +small constant eps instead. +This inflation particularly impacts the proper version of the score, see Sonabend et al. (2024) +for more details. +Note that the t_max and remove_obs parameters do not affect the estimation +of the censoring distribution, i.e. all observations are always used for estimating \(G(t)\).

+

If remove_obs = FALSE, inflated scores may occur. While this aligns more closely +with the definitions presented in the original papers, it can lead to misleading +evaluation and poor optimization outcomes when using this score for model tuning.

References

@@ -229,6 +244,13 @@

Referencesdoi:10.1111/j.1541-0420.2010.01459.x .

+

Sonabend, Raphael, Zobolas, John, Kopper, Philipp, Burk, Lukas, Bender, Andreas (2024). +“Examining properness in the external validation of survival models with squared and logarithmic losses.” +https://arxiv.org/abs/2212.05260v2.

+

Kvamme, Havard, Borgan, Ornulf (2023). +“The Brier Score under Administrative Censoring: Problems and a Solution.” +Journal of Machine Learning Research, 24(2), 1–26. +ISSN 1533-7928, http://jmlr.org/papers/v24/19-1030.html.

See also

@@ -323,6 +345,70 @@

Arguments +

Examples

+
library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# ISS, G(t) calculated using the test set
+p$score(msr("surv.schmid"))
+#> surv.schmid 
+#>   0.2901703 
+
+# ISS, G(t) calculated using the train set (always recommended)
+p$score(msr("surv.schmid"), task = task, train_set = part$train)
+#> surv.schmid 
+#>   0.2568166 
+
+# ISS, ERV score (comparing with KM baseline)
+p$score(msr("surv.schmid", ERV = TRUE), task = task, train_set = part$train)
+#> surv.schmid 
+#>  0.01762828 
+
+# ISS at specific time point
+p$score(msr("surv.schmid", times = 365), task = task, train_set = part$train)
+#> surv.schmid 
+#>   0.4244478 
+
+# ISS at multiple time points (integrated)
+p$score(msr("surv.schmid", times = c(125, 365, 450), integrated = TRUE), task = task, train_set = part$train)
+#> surv.schmid 
+#>   0.3551183 
+
+# ISS, use time cutoff
+p$score(msr("surv.schmid", t_max = 700), task = task, train_set = part$train)
+#> surv.schmid 
+#>   0.3236368 
+
+# ISS, use time cutoff and also remove observations
+p$score(msr("surv.schmid", t_max = 700, remove_obs = TRUE), task = task, train_set = part$train)
+#> surv.schmid 
+#>   0.2893284 
+
+# ISS, use time cutoff corresponding to specific proportion of censoring on the test set
+p$score(msr("surv.schmid", p_max = 0.8), task = task, train_set = part$train)
+#> surv.schmid 
+#>   0.3300005 
+
+# RISS, G(t) calculated using the train set
+p$score(msr("surv.schmid", proper = TRUE), task = task, train_set = part$train)
+#> surv.schmid 
+#>   0.2003059 
+
+
+

diff --git a/reference/mlr_measures_surv.song_auc.html b/reference/mlr_measures_surv.song_auc.html
index adbab2ca..d704530e 100644
--- a/reference/mlr_measures_surv.song_auc.html
+++ b/reference/mlr_measures_surv.song_auc.html
@@ -202,6 +202,40 @@

Arguments +

Examples

+
library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# Integrated AUC score
+p$score(msr("surv.song_auc"), task = task, train_set = part$train, learner = cox)
+#> surv.song_auc 
+#>      0.633294 
+
+# AUC at specific time point
+p$score(msr("surv.song_auc", times = 600), task = task, train_set = part$train, learner = cox)
+#> surv.song_auc 
+#>     0.6156616 
+
+# Integrated AUC at specific time points
+p$score(msr("surv.song_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox)
+#> surv.song_auc 
+#>      0.631936 
+
+
+
diff --git a/reference/mlr_measures_surv.uno_auc.html b/reference/mlr_measures_surv.uno_auc.html
index 9b46cbbe..36596396 100644
--- a/reference/mlr_measures_surv.uno_auc.html
+++ b/reference/mlr_measures_surv.uno_auc.html
@@ -200,6 +200,40 @@

Arguments +

Examples

+
library(mlr3)
+
+# Define a survival Task
+task = tsk("lung")
+
+# Create train and test set
+part = partition(task)
+
+# Train Cox learner on the train set
+cox = lrn("surv.coxph")
+cox$train(task, row_ids = part$train)
+
+# Make predictions for the test set
+p = cox$predict(task, row_ids = part$test)
+
+# Integrated AUC score
+p$score(msr("surv.uno_auc"), task = task, train_set = part$train, learner = cox)
+#> surv.uno_auc 
+#>    0.7062886 
+
+# AUC at specific time point
+p$score(msr("surv.uno_auc", times = 600), task = task, train_set = part$train, learner = cox)
+#> surv.uno_auc 
+#>     0.645577 
+
+# Integrated AUC at specific time points
+p$score(msr("surv.uno_auc", times = c(100, 200, 300, 400, 500)), task = task, train_set = part$train, learner = cox)
+#> surv.uno_auc 
+#>    0.6787879 
+
+
+
diff --git a/reference/mlr_task_generators_coxed.html b/reference/mlr_task_generators_coxed.html
index 28480665..1ae6d5ea 100644
--- a/reference/mlr_task_generators_coxed.html
+++ b/reference/mlr_task_generators_coxed.html
@@ -174,7 +174,7 @@

Examples
# same as above, but with time-varying coefficients (counting process format)
gen$param_set$set_values(type = "tvc")
gen$generate(50)
-#> <TaskSurv:coxed> (5319 x 10)
+#> <TaskSurv:coxed> (3604 x 10)
 #> * Target: start, end, failed
 #> * Properties: -
 #> * Features (7):
diff --git a/reference/mlr_task_generators_simdens.html b/reference/mlr_task_generators_simdens.html
index 5ca6527d..1cefd969 100644
--- a/reference/mlr_task_generators_simdens.html
+++ b/reference/mlr_task_generators_simdens.html
@@ -150,19 +150,19 @@

Examples
head(task)
 #>             x
 #>         <num>
-#> 1:  1.9231091
-#> 2:  0.7656584
-#> 3:  0.1113752
-#> 4: -2.4873272
-#> 5: -0.6934592
-#> 6: -2.0700736
+#> 1: -1.6460799
+#> 2: -0.3076342
+#> 3:  0.5160393
+#> 4:  1.2509043
+#> 5:  0.2014329
+#> 6:  2.7219002

# generate 50 samples from a Bernoulli distribution with specific parameters
dens_gen = tgen("simdens", distribution = "Bernoulli", pars = list(prob = 0.8))
task = dens_gen$generate(50)
task$data()[["x"]]
-#>  [1] 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1
-#> [39] 1 1 1 1 0 1 1 1 0 1 1 1
+#>  [1] 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 1 1
+#> [39] 1 1 1 1 1 1 1 1 1 1 0 1