From 032c4b46360a19d71e7ef51ca25190e7de449c0d Mon Sep 17 00:00:00 2001 From: john Date: Wed, 4 Dec 2024 17:18:26 +0100 Subject: [PATCH 01/20] add assert_scorable() to NAMESPACE --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index eadee06bf..8946cdd5e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -211,6 +211,7 @@ export(assert_resample_result) export(assert_resampling) export(assert_resamplings) export(assert_row_ids) +export(assert_scorable) export(assert_task) export(assert_tasks) export(assert_validate) From b27150261af7c384ac99a4f71c0ce35e823f2cbc Mon Sep 17 00:00:00 2001 From: john Date: Wed, 4 Dec 2024 17:18:51 +0100 Subject: [PATCH 02/20] add offset to reflections --- R/mlr_reflections.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/mlr_reflections.R b/R/mlr_reflections.R index 283562658..4a7fc4886 100644 --- a/R/mlr_reflections.R +++ b/R/mlr_reflections.R @@ -94,14 +94,14 @@ local({ "use" ) - tmp = c("feature", "target", "name", "order", "stratum", "group", "weight") + tmp = c("feature", "target", "name", "order", "stratum", "group", "weight", "offset") mlr_reflections$task_col_roles = list( regr = tmp, classif = tmp, unsupervised = c("feature", "name", "order") ) - tmp = c("strata", "groups", "weights") + tmp = c("strata", "groups", "weights", "offset") mlr_reflections$task_properties = list( classif = c(tmp, "twoclass", "multiclass"), regr = tmp, @@ -114,7 +114,7 @@ local({ mlr_reflections$task_print_col_roles = list( before = character(), - after = c("Order by" = "order", "Strata" = "stratum", "Groups" = "group", "Weights" = "weight") + after = c("Order by" = "order", "Strata" = "stratum", "Groups" = "group", "Weights" = "weight", "Offset" = "offset") ) ### Learner From 76ae35a916f54498ba1019aea71a2b8f10c79938 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 4 Dec 2024 17:23:00 +0100 Subject: [PATCH 03/20] add offset col_role to Task --- R/Task.R | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/R/Task.R b/R/Task.R index a081f15ca..7ef0c8c42 100644 --- a/R/Task.R +++ b/R/Task.R @@ -900,6 +900,7 @@ Task = R6Class("Task", #' * `"strata"`: The task is resampled using one or more stratification variables (role `"stratum"`). #' * `"groups"`: The task comes with grouping/blocking information (role `"group"`). #' * `"weights"`: The task comes with observation weights (role `"weight"`). + #' * `"offset"`: The task includes an offset column specifying fixed adjustments for model training (role `"offset"`). #' #' Note that above listed properties are calculated from the `$col_roles` and may not be set explicitly. properties = function(rhs) { @@ -909,7 +910,8 @@ Task = R6Class("Task", private$.properties, if (length(col_roles$group)) "groups" else NULL, if (length(col_roles$stratum)) "strata" else NULL, - if (length(col_roles$weight)) "weights" else NULL + if (length(col_roles$weight)) "weights" else NULL, + if (length(col_roles$offset)) "offset" else NULL ) } else { private$.properties = assert_set(rhs, .var.name = "properties") @@ -953,6 +955,8 @@ Task = R6Class("Task", #' Not more than a single column can be associated with this role. #' * `"stratum"`: Stratification variables. Multiple discrete columns may have this role. #' * `"weight"`: Observation weights. Not more than one numeric column may have this role. + #' * `"offset"`: Offset values specifying fixed adjustments for model training. These values can be used to provide baseline predictions from an existing model for updating another model. + #' Not more than one numeric column may have this role. #' #' `col_roles` is a named list whose elements are named by column role and each element is a `character()` vector of column names. #' To alter the roles, just modify the list, e.g. with \R's set functions ([intersect()], [setdiff()], [union()], \ldots). @@ -1232,7 +1236,7 @@ task_check_col_roles = function(task, new_roles, ...) { #' @rdname task_check_col_roles #' @export task_check_col_roles.Task = function(task, new_roles, ...) { - for (role in c("group", "weight", "name")) { + for (role in c("group", "weight", "name", "offset")) { if (length(new_roles[[role]]) > 1L) { stopf("There may only be up to one column with role '%s'", role) } @@ -1252,6 +1256,12 @@ task_check_col_roles.Task = function(task, new_roles, ...) { } } + # check offset + if (length(new_roles[["offset"]])) { + offset = task$backend$data(task$backend$rownames, cols = new_roles[["offset"]]) + assert_numeric(offset[[1L]], any.missing = FALSE) + } + return(new_roles) } From 0f1b4654ef04b7d8911da02d65db3c7811216400 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 4 Dec 2024 17:23:18 +0100 Subject: [PATCH 04/20] update doc --- man/Task.Rd | 3 +++ man/mlr3-package.Rd | 1 + man/mlr_assertions.Rd | 9 +++++++++ 3 files changed, 13 insertions(+) diff --git a/man/Task.Rd b/man/Task.Rd index 4e925c1ad..b71e5266f 100644 --- a/man/Task.Rd +++ b/man/Task.Rd @@ -198,6 +198,7 @@ The following properties are currently standardized and understood by tasks in \ \item \code{"strata"}: The task is resampled using one or more stratification variables (role \code{"stratum"}). \item \code{"groups"}: The task comes with grouping/blocking information (role \code{"group"}). \item \code{"weights"}: The task comes with observation weights (role \code{"weight"}). +\item \code{"offset"}: The task includes an offset column specifying fixed adjustments for model training (role \code{"offset"}). } Note that above listed properties are calculated from the \verb{$col_roles} and may not be set explicitly.} @@ -225,6 +226,8 @@ For each resampling iteration, observations of the same group will be exclusivel Not more than a single column can be associated with this role. \item \code{"stratum"}: Stratification variables. Multiple discrete columns may have this role. \item \code{"weight"}: Observation weights. Not more than one numeric column may have this role. +\item \code{"offset"}: Offset values specifying fixed adjustments for model training. These values can be used to provide baseline predictions from an existing model for updating another model. +Not more than one numeric column may have this role. } \code{col_roles} is a named list whose elements are named by column role and each element is a \code{character()} vector of column names. diff --git a/man/mlr3-package.Rd b/man/mlr3-package.Rd index fb5185e56..a1d39bfcb 100644 --- a/man/mlr3-package.Rd +++ b/man/mlr3-package.Rd @@ -100,6 +100,7 @@ Authors: \item Florian Pfisterer \email{pfistererf@googlemail.com} (\href{https://orcid.org/0000-0001-8867-762X}{ORCID}) \item Raphael Sonabend \email{raphaelsonabend@gmail.com} (\href{https://orcid.org/0000-0001-9225-4654}{ORCID}) \item Sebastian Fischer \email{sebf.fischer@gmail.com} (\href{https://orcid.org/0000-0002-9609-3197}{ORCID}) + \item John Zobolas \email{bblodfon@gmail.com} (\href{https://orcid.org/0000-0002-3609-8674}{ORCID}) } Other contributors: diff --git a/man/mlr_assertions.Rd b/man/mlr_assertions.Rd index 5f7c380d1..21a497c43 100644 --- a/man/mlr_assertions.Rd +++ b/man/mlr_assertions.Rd @@ -10,6 +10,7 @@ \alias{assert_learnable} \alias{assert_predictable} \alias{assert_measure} +\alias{assert_scorable} \alias{assert_measures} \alias{assert_resampling} \alias{assert_resamplings} @@ -67,6 +68,14 @@ assert_measure( .var.name = vname(measure) ) +assert_scorable( + measure, + task, + learner, + prediction = NULL, + .var.name = vname(measure) +) + assert_measures( measures, task = NULL, From b43a1e481dcf4cc10763b01ad64490a617615812 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 4 Dec 2024 17:23:49 +0100 Subject: [PATCH 05/20] add test for offset --- tests/testthat/test_Task.R | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test_Task.R b/tests/testthat/test_Task.R index 07cb6b040..03ba50853 100644 --- a/tests/testthat/test_Task.R +++ b/tests/testthat/test_Task.R @@ -247,16 +247,19 @@ test_that("stratify works", { expect_list(tab$row_id, "integer") }) -test_that("groups/weights work", { - b = as_data_backend(data.table(x = runif(20), y = runif(20), w = runif(20), g = sample(letters[1:2], 20, replace = TRUE))) +test_that("groups/weights/offset work", { + b = as_data_backend(data.table(x = runif(20), y = runif(20), w = runif(20), + o = runif(20), g = sample(letters[1:2], 20, replace = TRUE))) task = TaskRegr$new("test", b, target = "y") task$set_row_roles(16:20, character()) expect_false("groups" %in% task$properties) expect_false("weights" %in% task$properties) + expect_false("offset" %in% task$properties) expect_null(task$groups) expect_null(task$weights) + # weight task$col_roles$weight = "w" expect_subset("weights", task$properties) expect_data_table(task$weights, ncols = 2, nrows = 15) @@ -265,6 +268,7 @@ test_that("groups/weights work", { task$col_roles$weight = character() expect_true("weights" %nin% task$properties) + # group task$col_roles$group = "g" expect_subset("groups", task$properties) expect_data_table(task$groups, ncols = 2, nrows = 15) @@ -276,6 +280,15 @@ test_that("groups/weights work", { expect_error({ task$col_roles$weight = c("w", "g") }, "up to one") + + # offset + task$col_roles$offset = "o" + expect_subset("offset", task$properties) + expect_error({ + task$col_roles$offset = c("o", "w") + }, "up to one") + task$col_roles$offset = character() + expect_true("offset" %nin% task$properties) }) test_that("col roles are valid", { From aaf3f38f2f2dcf5cbc7c65795d764d69c905ee54 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 4 Dec 2024 17:24:29 +0100 Subject: [PATCH 06/20] add John as ctb --- DESCRIPTION | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3b18bc718..8b61ae016 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,7 +28,9 @@ Authors@R: comment = c(ORCID = "0000-0002-8115-0400")), person("Sebastian", "Fischer", , "sebf.fischer@gmail.com", role = "aut", comment = c(ORCID = "0000-0002-9609-3197")), - person("Lona", "Koers", , "lona.koers@gmail.com", role = "ctb") + person("Lona", "Koers", , "lona.koers@gmail.com", role = "ctb"), + person("John", "Zobolas", , "bblodfon@gmail.com", role = "ctb", + comment = c(ORCID = "0000-0002-3609-8674")) ) Description: Efficient, object-oriented programming on the building blocks of machine learning. Provides 'R6' objects for tasks, From dbc591dd6f4b35d088314240b8c74e369ea72647 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 4 Dec 2024 17:25:34 +0100 Subject: [PATCH 07/20] update news --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 83d6dfbb0..9a41ae674 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # mlr3 (development version) +* feat: add new `col_role` offset in `Task`. + # mlr3 0.22.1 * fix: Extend `assert_measure()` with checks for trained models in `assert_scorable()`. From dbe05cc2259b209d3942dcf161fca666fabc549b Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 16 Jan 2025 11:23:38 +0100 Subject: [PATCH 08/20] ... --- R/Task.R | 17 +++++++++++++---- R/mlr_reflections.R | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/R/Task.R b/R/Task.R index 66b7cafc5..f68f80d33 100644 --- a/R/Task.R +++ b/R/Task.R @@ -1250,6 +1250,11 @@ task_check_col_roles.Task = function(task, new_roles, ...) { } } + # check offset + if (length(new_roles[["offset"]]) && any(fget(task$col_info, new_roles[["offset"]], "type", key = "id") %nin% c("numeric", "integer"))) { + stopf("Offset column(s) %s must be a numeric or integer column", paste0("'", new_roles[["offset"]], "'", collapse = ",")) + } + return(new_roles) } @@ -1266,16 +1271,20 @@ task_check_col_roles.TaskClassif = function(task, new_roles, ...) { stopf("Target column(s) %s must be a factor or ordered factor", paste0("'", new_roles[["target"]], "'", collapse = ",")) } + if (length(new_roles[["offset"]]) > 1L && length(task$class_names) == 2L) { + stop("There may only be up to one column with role 'offset' for binary classification tasks") + } + NextMethod() } #' @rdname task_check_col_roles #' @export task_check_col_roles.TaskRegr = function(task, new_roles, ...) { - - # check target - if (length(new_roles[["target"]]) > 1L) { - stopf("There may only be up to one column with role 'target'") + for (role in c("target", "offset")) { + if (length(new_roles[[role]]) > 1L) { + stopf("There may only be up to one column with role '%s'", role) + } } if (length(new_roles[["target"]]) && any(fget(task$col_info, new_roles[["target"]], "type", key = "id") %nin% c("numeric", "integer"))) { diff --git a/R/mlr_reflections.R b/R/mlr_reflections.R index 4ac532741..079181d48 100644 --- a/R/mlr_reflections.R +++ b/R/mlr_reflections.R @@ -94,7 +94,7 @@ local({ "use" ) - tmp = c("feature", "target", "name", "order", "stratum", "group", "weight") + tmp = c("feature", "target", "name", "order", "stratum", "group", "weight", "offset") mlr_reflections$task_col_roles = list( regr = tmp, classif = tmp, From d88e493e6b6a4ea4b07472d61aaf3bc0fd438668 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 16 Jan 2025 11:58:24 +0100 Subject: [PATCH 09/20] ... --- NEWS.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index c703908c9..156b8b111 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,15 +1,12 @@ # mlr3 (development version) -<<<<<<< HEAD * feat: add new `col_role` offset in `Task`. -======= * fix: the `$predict_newdata()` method of `Learner` now automatically conducts type conversions (#685) -* BREAKING_CHANGE: Predicting on a `task` with the wrong column information is now an error and not a warning. +* BREAKING_CHANGE: Predicting on a `task` with the wrong column information is now an error and not a warning. * Column names with UTF-8 characters are now allowed by default. The option `mlr3.allow_utf8_names` is removed. * BREAKING CHANGE: `Learner$predict_types` is read-only now. * docs: Clear up behavior of `Learner$predict_type` after training. ->>>>>>> main # mlr3 0.22.1 From afce34cd6e5124b41c4ca931097c81cdf09fbdb9 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 16 Jan 2025 11:59:25 +0100 Subject: [PATCH 10/20] ... --- R/Task.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Task.R b/R/Task.R index ef9e77391..70d874dca 100644 --- a/R/Task.R +++ b/R/Task.R @@ -1234,7 +1234,7 @@ task_check_col_roles = function(task, new_roles, ...) { #' @rdname task_check_col_roles #' @export task_check_col_roles.Task = function(task, new_roles, ...) { - for (role in c("group", "weight", "name", "offset")) { + for (role in c("group", "weight", "name")) { if (length(new_roles[[role]]) > 1L) { stopf("There may only be up to one column with role '%s'", role) } From a8e0469e078affad862f5523152da7094b8bd9e0 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 16 Jan 2025 15:34:14 +0100 Subject: [PATCH 11/20] ... --- R/Task.R | 5 +++++ tests/testthat/test_Task.R | 12 ++---------- tests/testthat/test_TaskClassif.R | 30 ++++++++++++++++++++++++++++++ tests/testthat/test_TaskRegr.R | 15 +++++++++++++++ 4 files changed, 52 insertions(+), 10 deletions(-) diff --git a/R/Task.R b/R/Task.R index 70d874dca..8dc93e4df 100644 --- a/R/Task.R +++ b/R/Task.R @@ -1279,6 +1279,11 @@ task_check_col_roles.TaskClassif = function(task, new_roles, ...) { stop("There may only be up to one column with role 'offset' for binary classification tasks") } + if (length(new_roles[["offset"]]) > 1L) { + expected_names = paste0("offset_", task$class_names) + expect_subset(new_roles[["offset"]], expected_names) + } + NextMethod() } diff --git a/tests/testthat/test_Task.R b/tests/testthat/test_Task.R index e82850dd6..b039a0495 100644 --- a/tests/testthat/test_Task.R +++ b/tests/testthat/test_Task.R @@ -247,7 +247,7 @@ test_that("stratify works", { expect_list(tab$row_id, "integer") }) -test_that("groups/weights/offset work", { +test_that("groups/weights work", { b = as_data_backend(data.table(x = runif(20), y = runif(20), w = runif(20), o = runif(20), g = sample(letters[1:2], 20, replace = TRUE))) task = TaskRegr$new("test", b, target = "y") @@ -280,15 +280,6 @@ test_that("groups/weights/offset work", { expect_error({ task$col_roles$weight = c("w", "g") }, "up to one") - - # offset - task$col_roles$offset = "o" - expect_subset("offset", task$properties) - expect_error({ - task$col_roles$offset = c("o", "w") - }, "up to one") - task$col_roles$offset = character() - expect_true("offset" %nin% task$properties) }) test_that("col roles are valid", { @@ -739,3 +730,4 @@ test_that("warn when internal valid task has 0 obs", { task = tsk("iris") expect_warning({task$internal_valid_task = 151}, "has 0 observations") }) + diff --git a/tests/testthat/test_TaskClassif.R b/tests/testthat/test_TaskClassif.R index cd00edb5b..4b1ccf8ec 100644 --- a/tests/testthat/test_TaskClassif.R +++ b/tests/testthat/test_TaskClassif.R @@ -112,3 +112,33 @@ test_that("target is encoded as factor (#629)", { dt$target = ordered(dt$target) TaskClassif$new(id = "XX", backend = dt, target = "target") }) + +test_that("offset column role works with binary tasks", { + task = tsk("pima") + task$set_col_roles("glucose", "offset") + + expect_subset("offset", task$properties) + + expect_error({ + task$col_roles$offset = c("glucose", "diabetes") + }, "There may only be up to one column with role") +}) + +test_that("offset column role works with multiclass tasks", { + task = tsk("penguins") + task$set_col_roles("body_mass", "offset") + expect_subset("offset", task$properties) + + expect_error({ + task$col_roles$offset = c("body_mass", "flipper_length") + }, "Must be a subset of") + + task = tsk("penguins") + data = task$data() + set(data, j = "offset_Adelie", value = runif(nrow(data))) + set(data, j = "offset_Chinstrap", value = runif(nrow(data))) + task = as_task_classif(data, target = "species") + task$set_col_roles(c("offset_Adelie", "offset_Chinstrap"), "offset") + + expect_subset("offset", task$properties) +}) diff --git a/tests/testthat/test_TaskRegr.R b/tests/testthat/test_TaskRegr.R index 29ade6eae..542d61e6e 100644 --- a/tests/testthat/test_TaskRegr.R +++ b/tests/testthat/test_TaskRegr.R @@ -49,3 +49,18 @@ test_that("$add_strata", { task$add_strata(task$target_names, bins = 2) expect_identical(task$strata$N, c(50L, 10L)) }) + +test_that("offset column role works", { + task = tsk("mtcars") + task$set_col_roles("am", "offset") + + expect_subset("offset", task$properties) + + expect_error({ + task$col_roles$offset = c("am", "gear") + }, "up to one") + + + task$col_roles$offset = character() + expect_true("offset" %nin% task$properties) +}) From 339c4ab134de414371e715657a11cd17e39b1e30 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 16 Jan 2025 15:39:10 +0100 Subject: [PATCH 12/20] ... --- R/Task.R | 6 ++++-- man/Task.Rd | 9 ++++----- man/mlr3-package.Rd | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/R/Task.R b/R/Task.R index 8dc93e4df..f98475a42 100644 --- a/R/Task.R +++ b/R/Task.R @@ -953,8 +953,10 @@ Task = R6Class("Task", #' Not more than a single column can be associated with this role. #' * `"stratum"`: Stratification variables. Multiple discrete columns may have this role. #' * `"weight"`: Observation weights. Not more than one numeric column may have this role. - #' * `"offset"`: Offset values specifying fixed adjustments for model training. These values can be used to provide baseline predictions from an existing model for updating another model. - #' Not more than one numeric column may have this role. + #' * `"offset"`: Offset values specifying fixed adjustments for model training. + #' These values can be used to provide baseline predictions from an existing model for updating another model. + #' Some learners require an offset for each target class in a multiclass setting. + #' In this case, the offset columns must be named `"offset_target_class"`. #' #' `col_roles` is a named list whose elements are named by column role and each element is a `character()` vector of column names. #' To alter the roles, just modify the list, e.g. with \R's set functions ([intersect()], [setdiff()], [union()], \ldots). diff --git a/man/Task.Rd b/man/Task.Rd index ea6821eba..71dd4c550 100644 --- a/man/Task.Rd +++ b/man/Task.Rd @@ -198,11 +198,8 @@ The following properties are currently standardized and understood by tasks in \ \item \code{"strata"}: The task is resampled using one or more stratification variables (role \code{"stratum"}). \item \code{"groups"}: The task comes with grouping/blocking information (role \code{"group"}). \item \code{"weights"}: The task comes with observation weights (role \code{"weight"}). -<<<<<<< HEAD \item \code{"offset"}: The task includes an offset column specifying fixed adjustments for model training (role \code{"offset"}). -======= \item \code{"ordered"}: The task has columns which define the row order (role \code{"order"}). ->>>>>>> main } Note that above listed properties are calculated from the \verb{$col_roles} and may not be set explicitly.} @@ -230,8 +227,10 @@ For each resampling iteration, observations of the same group will be exclusivel Not more than a single column can be associated with this role. \item \code{"stratum"}: Stratification variables. Multiple discrete columns may have this role. \item \code{"weight"}: Observation weights. Not more than one numeric column may have this role. -\item \code{"offset"}: Offset values specifying fixed adjustments for model training. These values can be used to provide baseline predictions from an existing model for updating another model. -Not more than one numeric column may have this role. +\item \code{"offset"}: Offset values specifying fixed adjustments for model training. +These values can be used to provide baseline predictions from an existing model for updating another model. +Some learners require an offset for each target class in a multiclass setting. +In this case, the offset columns must be named \code{"offset_target_class"}. } \code{col_roles} is a named list whose elements are named by column role and each element is a \code{character()} vector of column names. diff --git a/man/mlr3-package.Rd b/man/mlr3-package.Rd index 7440a56e5..71e978ee8 100644 --- a/man/mlr3-package.Rd +++ b/man/mlr3-package.Rd @@ -96,7 +96,6 @@ Authors: \item Florian Pfisterer \email{pfistererf@googlemail.com} (\href{https://orcid.org/0000-0001-8867-762X}{ORCID}) \item Raphael Sonabend \email{raphaelsonabend@gmail.com} (\href{https://orcid.org/0000-0001-9225-4654}{ORCID}) \item Sebastian Fischer \email{sebf.fischer@gmail.com} (\href{https://orcid.org/0000-0002-9609-3197}{ORCID}) - \item John Zobolas \email{bblodfon@gmail.com} (\href{https://orcid.org/0000-0002-3609-8674}{ORCID}) } Other contributors: @@ -106,6 +105,7 @@ Other contributors: \item Quay Au \email{quayau@gmail.com} (\href{https://orcid.org/0000-0002-5252-8902}{ORCID}) [contributor] \item Lennart Schneider \email{lennart.sch@web.de} (\href{https://orcid.org/0000-0003-4152-5308}{ORCID}) [contributor] \item Lona Koers \email{lona.koers@gmail.com} [contributor] + \item John Zobolas \email{bblodfon@gmail.com} (\href{https://orcid.org/0000-0002-3609-8674}{ORCID}) [contributor] } } From fd729db8cf4001edf7d8f963c6c11840cc79f8f9 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 16 Jan 2025 15:40:28 +0100 Subject: [PATCH 13/20] ... --- tests/testthat/test_Task.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test_Task.R b/tests/testthat/test_Task.R index b039a0495..ab084c430 100644 --- a/tests/testthat/test_Task.R +++ b/tests/testthat/test_Task.R @@ -253,9 +253,9 @@ test_that("groups/weights work", { task = TaskRegr$new("test", b, target = "y") task$set_row_roles(16:20, character()) - expect_false("groups" %cin% task$properties) - expect_false("weights" %cin% task$properties) - expect_false("offset" %cin% task$properties) + expect_false("groups" %chin% task$properties) + expect_false("weights" %chin% task$properties) + expect_false("offset" %chin% task$properties) expect_null(task$groups) expect_null(task$weights) From 3943c9ee1685004d85935f2e871a5bc237581546 Mon Sep 17 00:00:00 2001 From: be-marc Date: Mon, 20 Jan 2025 14:01:56 +0100 Subject: [PATCH 14/20] ... --- R/Task.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Task.R b/R/Task.R index f98475a42..9bdae0476 100644 --- a/R/Task.R +++ b/R/Task.R @@ -496,7 +496,7 @@ Task = R6Class("Task", } # columns with these roles must be present in data - mandatory_roles = c("target", "feature", "weight", "group", "stratum", "order") + mandatory_roles = c("target", "feature", "weight", "group", "stratum", "order", "offset") mandatory_cols = unlist(private$.col_roles[mandatory_roles], use.names = FALSE) missing_cols = setdiff(mandatory_cols, data$colnames) if (length(missing_cols)) { From caae259373bcca1636f1f54c2da396000f528fc2 Mon Sep 17 00:00:00 2001 From: be-marc Date: Mon, 20 Jan 2025 14:25:57 +0100 Subject: [PATCH 15/20] ... --- R/Task.R | 8 +++++++- tests/testthat/test_TaskClassif.R | 26 ++++++++++++++++++-------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/R/Task.R b/R/Task.R index 9bdae0476..1ffa968b8 100644 --- a/R/Task.R +++ b/R/Task.R @@ -1261,6 +1261,12 @@ task_check_col_roles.Task = function(task, new_roles, ...) { stopf("Offset column(s) %s must be a numeric or integer column", paste0("'", new_roles[["offset"]], "'", collapse = ",")) } + if (any(task$missings(cols = new_roles[["offset"]]) > 0)) { + missings = task$missings(cols = new_roles[["offset"]]) + missings = names(missings[missings > 0]) + stopf("Offset column(s) %s contain missing values", paste0("'", missings, "'", collapse = ",")) + } + return(new_roles) } @@ -1283,7 +1289,7 @@ task_check_col_roles.TaskClassif = function(task, new_roles, ...) { if (length(new_roles[["offset"]]) > 1L) { expected_names = paste0("offset_", task$class_names) - expect_subset(new_roles[["offset"]], expected_names) + expect_subset(new_roles[["offset"]], expected_names, label = "col_roles") } NextMethod() diff --git a/tests/testthat/test_TaskClassif.R b/tests/testthat/test_TaskClassif.R index 4b1ccf8ec..da6c09d30 100644 --- a/tests/testthat/test_TaskClassif.R +++ b/tests/testthat/test_TaskClassif.R @@ -115,30 +115,40 @@ test_that("target is encoded as factor (#629)", { test_that("offset column role works with binary tasks", { task = tsk("pima") - task$set_col_roles("glucose", "offset") + task$set_col_roles("age", "offset") expect_subset("offset", task$properties) expect_error({ task$col_roles$offset = c("glucose", "diabetes") }, "There may only be up to one column with role") + + expect_error({ + task$col_roles$offset = c("glucose") + }, "contain missing values") }) test_that("offset column role works with multiclass tasks", { task = tsk("penguins") - task$set_col_roles("body_mass", "offset") + task$set_col_roles("year", "offset") expect_subset("offset", task$properties) expect_error({ - task$col_roles$offset = c("body_mass", "flipper_length") + task$col_roles$offset = "bill_length" + }, "contain missing values") + + task = tsk("wine") + + expect_error({ + task$col_roles$offset = c("alcohol", "ash") }, "Must be a subset of") - task = tsk("penguins") + task = tsk("wine") data = task$data() - set(data, j = "offset_Adelie", value = runif(nrow(data))) - set(data, j = "offset_Chinstrap", value = runif(nrow(data))) - task = as_task_classif(data, target = "species") - task$set_col_roles(c("offset_Adelie", "offset_Chinstrap"), "offset") + set(data, j = "offset_1", value = runif(nrow(data))) + set(data, j = "offset_2", value = runif(nrow(data))) + task = as_task_classif(data, target = "type") + task$set_col_roles(c("offset_1", "offset_2"), "offset") expect_subset("offset", task$properties) }) From a24b6ac3ebdecc90103f3c31b09646b077c3ba2c Mon Sep 17 00:00:00 2001 From: john Date: Wed, 22 Jan 2025 15:35:54 +0100 Subject: [PATCH 16/20] add offset learner property --- R/mlr_reflections.R | 2 +- tests/testthat/test_TaskRegr.R | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/R/mlr_reflections.R b/R/mlr_reflections.R index 368de4add..7d6ae5fec 100644 --- a/R/mlr_reflections.R +++ b/R/mlr_reflections.R @@ -118,7 +118,7 @@ local({ ) ### Learner - tmp = c("featureless", "missings", "weights", "importance", "selected_features", "oob_error", "hotstart_forward", "hotstart_backward", "validation", "internal_tuning", "marshal") + tmp = c("featureless", "missings", "weights", "importance", "selected_features", "oob_error", "hotstart_forward", "hotstart_backward", "validation", "internal_tuning", "marshal", "offset") mlr_reflections$learner_properties = list( classif = c(tmp, "twoclass", "multiclass"), regr = tmp diff --git a/tests/testthat/test_TaskRegr.R b/tests/testthat/test_TaskRegr.R index 542d61e6e..ab2c36d7c 100644 --- a/tests/testthat/test_TaskRegr.R +++ b/tests/testthat/test_TaskRegr.R @@ -60,7 +60,6 @@ test_that("offset column role works", { task$col_roles$offset = c("am", "gear") }, "up to one") - task$col_roles$offset = character() expect_true("offset" %nin% task$properties) }) From 0806838b0bb5d647b5ede6b5e3b06bc7739e9c62 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 22 Jan 2025 16:16:29 +0100 Subject: [PATCH 17/20] add offset field + tests + doc --- R/Task.R | 21 +++++++++++++++++++-- man/Task.Rd | 12 ++++++++++-- man/as_resampling.Rd | 2 +- man/as_task.Rd | 2 +- tests/testthat/test_TaskClassif.R | 10 +++++++++- tests/testthat/test_TaskRegr.R | 4 ++++ 6 files changed, 44 insertions(+), 7 deletions(-) diff --git a/R/Task.R b/R/Task.R index 1ffa968b8..b11b196e8 100644 --- a/R/Task.R +++ b/R/Task.R @@ -896,7 +896,7 @@ Task = R6Class("Task", #' * `"strata"`: The task is resampled using one or more stratification variables (role `"stratum"`). #' * `"groups"`: The task comes with grouping/blocking information (role `"group"`). #' * `"weights"`: The task comes with observation weights (role `"weight"`). - #' * `"offset"`: The task includes an offset column specifying fixed adjustments for model training (role `"offset"`). + #' * `"offset"`: The task includes one or more offset columns specifying fixed adjustments for model training and possibly for prediction (role `"offset"`). #' * `"ordered"`: The task has columns which define the row order (role `"order"`). #' #' Note that above listed properties are calculated from the `$col_roles` and may not be set explicitly. @@ -956,7 +956,7 @@ Task = R6Class("Task", #' * `"offset"`: Offset values specifying fixed adjustments for model training. #' These values can be used to provide baseline predictions from an existing model for updating another model. #' Some learners require an offset for each target class in a multiclass setting. - #' In this case, the offset columns must be named `"offset_target_class"`. + #' In this case, the offset columns must be named `"offset_{target_class_name}"`. #' #' `col_roles` is a named list whose elements are named by column role and each element is a `character()` vector of column names. #' To alter the roles, just modify the list, e.g. with \R's set functions ([intersect()], [setdiff()], [union()], \ldots). @@ -1090,6 +1090,23 @@ Task = R6Class("Task", setnames(data, c("row_id", "weight"))[] }, + #' @field offset ([data.table::data.table()])\cr + #' Provides the offset column(s) if the task has a column designated with the role `"offset"`. + #' + #' For regression or binary classification tasks, this returns a single-column offset. + #' For multiclass tasks, it may return multiple offset columns, one for each target class. + #' + #' If there are no columns with the `"offset"` role, `NULL` is returned. + offset = function(rhs) { + assert_has_backend(self) + assert_ro_binding(rhs) + offset_cols = private$.col_roles$offset + if (length(offset_cols) == 0L) { + return(NULL) + } + + self$backend$data(private$.row_roles$use, offset_cols) + }, #' @field labels (named `character()`)\cr #' Retrieve `labels` (prettier formated names) from columns. diff --git a/man/Task.Rd b/man/Task.Rd index 71dd4c550..b8d8daf43 100644 --- a/man/Task.Rd +++ b/man/Task.Rd @@ -198,7 +198,7 @@ The following properties are currently standardized and understood by tasks in \ \item \code{"strata"}: The task is resampled using one or more stratification variables (role \code{"stratum"}). \item \code{"groups"}: The task comes with grouping/blocking information (role \code{"group"}). \item \code{"weights"}: The task comes with observation weights (role \code{"weight"}). -\item \code{"offset"}: The task includes an offset column specifying fixed adjustments for model training (role \code{"offset"}). +\item \code{"offset"}: The task includes one or more offset columns specifying fixed adjustments for model training and possibly for prediction (role \code{"offset"}). \item \code{"ordered"}: The task has columns which define the row order (role \code{"order"}). } @@ -230,7 +230,7 @@ Not more than a single column can be associated with this role. \item \code{"offset"}: Offset values specifying fixed adjustments for model training. These values can be used to provide baseline predictions from an existing model for updating another model. Some learners require an offset for each target class in a multiclass setting. -In this case, the offset columns must be named \code{"offset_target_class"}. +In this case, the offset columns must be named \code{"offset_{target_class_name}"}. } \code{col_roles} is a named list whose elements are named by column role and each element is a \code{character()} vector of column names. @@ -291,6 +291,14 @@ If the task has a column with designated role \code{"weight"}, a table with two Returns \code{NULL} if there are is no weight column.} +\item{\code{offset}}{(\code{\link[data.table:data.table]{data.table::data.table()}})\cr +Provides the offset column(s) if the task has a column designated with the role \code{"offset"}. + +For regression or binary classification tasks, this returns a single-column offset. +For multiclass tasks, it may return multiple offset columns, one for each target class. + +If there are no columns with the \code{"offset"} role, \code{NULL} is returned.} + \item{\code{labels}}{(named \code{character()})\cr Retrieve \code{labels} (prettier formated names) from columns. Internally queries the column \code{label} of the table in field \code{col_info}. diff --git a/man/as_resampling.Rd b/man/as_resampling.Rd index 02784a7ec..1ed91e75a 100644 --- a/man/as_resampling.Rd +++ b/man/as_resampling.Rd @@ -30,5 +30,5 @@ If \code{TRUE}, ensures that the returned object is not the same as the input \c } \description{ Convert object to a \link{Resampling} or a list of \link{Resampling}. -This method e.g. allows to convert an \code{\link[mlr3oml:oml_task]{mlr3oml::OMLTask}} to a \code{\link{Resampling}}. +This method e.g. allows to convert an \code{\link[mlr3oml:OMLTask]{mlr3oml::OMLTask}} to a \code{\link{Resampling}}. } diff --git a/man/as_task.Rd b/man/as_task.Rd index 8aff6071e..ca7b2060e 100644 --- a/man/as_task.Rd +++ b/man/as_task.Rd @@ -30,6 +30,6 @@ If \code{TRUE}, ensures that the returned object is not the same as the input \c } \description{ Convert object to a \link{Task} or a list of \link{Task}. -This method e.g. allows to convert an \code{\link[mlr3oml:oml_task]{mlr3oml::OMLTask}} to a \code{\link{Task}} and additionally supports cloning. +This method e.g. allows to convert an \code{\link[mlr3oml:OMLTask]{mlr3oml::OMLTask}} to a \code{\link{Task}} and additionally supports cloning. In order to construct a \link{Task} from a \code{data.frame}, use task-specific converters such as \code{\link[=as_task_classif]{as_task_classif()}} or \code{\link[=as_task_regr]{as_task_regr()}}. } diff --git a/tests/testthat/test_TaskClassif.R b/tests/testthat/test_TaskClassif.R index da6c09d30..487795093 100644 --- a/tests/testthat/test_TaskClassif.R +++ b/tests/testthat/test_TaskClassif.R @@ -115,9 +115,12 @@ test_that("target is encoded as factor (#629)", { test_that("offset column role works with binary tasks", { task = tsk("pima") - task$set_col_roles("age", "offset") + expect_null(task$offset) + task$set_col_roles("age", "offset") expect_subset("offset", task$properties) + expect_data_table(task$offset, nrows = task$nrow, ncols = 1) + expect_subset("age", names(task$offset)) expect_error({ task$col_roles$offset = c("glucose", "diabetes") @@ -130,8 +133,11 @@ test_that("offset column role works with binary tasks", { test_that("offset column role works with multiclass tasks", { task = tsk("penguins") + expect_null(task$offset) task$set_col_roles("year", "offset") expect_subset("offset", task$properties) + expect_data_table(task$offset, nrows = task$nrow, ncols = 1) + expect_subset("year", names(task$offset)) expect_error({ task$col_roles$offset = "bill_length" @@ -151,4 +157,6 @@ test_that("offset column role works with multiclass tasks", { task$set_col_roles(c("offset_1", "offset_2"), "offset") expect_subset("offset", task$properties) + expect_data_table(task$offset, nrows = task$nrow, ncols = 2) + expect_subset(c("offset_1", "offset_2"), names(task$offset)) }) diff --git a/tests/testthat/test_TaskRegr.R b/tests/testthat/test_TaskRegr.R index ab2c36d7c..38739beb9 100644 --- a/tests/testthat/test_TaskRegr.R +++ b/tests/testthat/test_TaskRegr.R @@ -52,9 +52,12 @@ test_that("$add_strata", { test_that("offset column role works", { task = tsk("mtcars") + expect_null(task$offset) task$set_col_roles("am", "offset") expect_subset("offset", task$properties) + expect_data_table(task$offset, nrows = task$nrow, ncols = 1) + expect_subset("am", names(task$offset)) expect_error({ task$col_roles$offset = c("am", "gear") @@ -62,4 +65,5 @@ test_that("offset column role works", { task$col_roles$offset = character() expect_true("offset" %nin% task$properties) + expect_null(task$offset) }) From e380fd4b5e28171c4975ce57719e019950afdfdc Mon Sep 17 00:00:00 2001 From: john Date: Wed, 22 Jan 2025 17:43:13 +0100 Subject: [PATCH 18/20] add warning during training when task has offset but learner doesn't support this --- R/assertions.R | 5 +++++ tests/testthat/test_TaskClassif.R | 2 ++ 2 files changed, 7 insertions(+) diff --git a/R/assertions.R b/R/assertions.R index bd1a529ba..d706de9ee 100644 --- a/R/assertions.R +++ b/R/assertions.R @@ -145,6 +145,11 @@ assert_task_learner = function(task, learner, cols = NULL) { } } + if ("offset" %in% task$properties && "offset" %nin% learner$properties) { + warningf("Task '%s' has offset, but learner '%s' does not support this, so it will be ignored", + task$id, learner$id) + } + tmp = mlr_reflections$task_mandatory_properties[[task$task_type]] if (length(tmp)) { tmp = setdiff(intersect(task$properties, tmp), learner$properties) diff --git a/tests/testthat/test_TaskClassif.R b/tests/testthat/test_TaskClassif.R index 487795093..27156699c 100644 --- a/tests/testthat/test_TaskClassif.R +++ b/tests/testthat/test_TaskClassif.R @@ -129,6 +129,8 @@ test_that("offset column role works with binary tasks", { expect_error({ task$col_roles$offset = c("glucose") }, "contain missing values") + + expect_warning(lrn("classif.rpart")$train(task), "has offset") }) test_that("offset column role works with multiclass tasks", { From 8bd0ab76f9c3861e1905453015094631d97e9039 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 31 Jan 2025 15:27:45 +0100 Subject: [PATCH 19/20] add documentation for offset learner property --- man-roxygen/param_learner_properties.R | 2 ++ man/Learner.Rd | 1 + man/LearnerClassif.Rd | 1 + man/LearnerRegr.Rd | 1 + man/as_resampling.Rd | 2 +- man/as_task.Rd | 2 +- 6 files changed, 7 insertions(+), 2 deletions(-) diff --git a/man-roxygen/param_learner_properties.R b/man-roxygen/param_learner_properties.R index b53e6d420..426cab517 100644 --- a/man-roxygen/param_learner_properties.R +++ b/man-roxygen/param_learner_properties.R @@ -4,6 +4,7 @@ #' The following properties are currently standardized and understood by learners in \CRANpkg{mlr3}: #' * `"missings"`: The learner can handle missing values in the data. #' * `"weights"`: The learner supports observation weights. +#' * `"offset"`: The learner can incorporate offset values to adjust predictions. #' * `"importance"`: The learner supports extraction of importance scores, i.e. comes with an `$importance()` extractor function (see section on optional extractors in [Learner]). #' * `"selected_features"`: The learner supports extraction of the set of selected features, i.e. comes with a `$selected_features()` extractor function (see section on optional extractors in [Learner]). #' * `"oob_error"`: The learner supports extraction of estimated out of bag error, i.e. comes with a `oob_error()` extractor function (see section on optional extractors in [Learner]). @@ -11,3 +12,4 @@ #' * `"internal_tuning"`: The learner is able to internally optimize hyperparameters (those are also tagged with `"internal_tuning"`). #' * `"marshal"`: To save learners with this property, you need to call `$marshal()` first. #' If a learner is in a marshaled state, you call first need to call `$unmarshal()` to use its model, e.g. for prediction. +#' diff --git a/man/Learner.Rd b/man/Learner.Rd index ca54c1a11..574acb1a9 100644 --- a/man/Learner.Rd +++ b/man/Learner.Rd @@ -373,6 +373,7 @@ The following properties are currently standardized and understood by learners i \itemize{ \item \code{"missings"}: The learner can handle missing values in the data. \item \code{"weights"}: The learner supports observation weights. +\item \code{"offset"}: The learner can incorporate offset values to adjust predictions. \item \code{"importance"}: The learner supports extraction of importance scores, i.e. comes with an \verb{$importance()} extractor function (see section on optional extractors in \link{Learner}). \item \code{"selected_features"}: The learner supports extraction of the set of selected features, i.e. comes with a \verb{$selected_features()} extractor function (see section on optional extractors in \link{Learner}). \item \code{"oob_error"}: The learner supports extraction of estimated out of bag error, i.e. comes with a \code{oob_error()} extractor function (see section on optional extractors in \link{Learner}). diff --git a/man/LearnerClassif.Rd b/man/LearnerClassif.Rd index 64a29cf74..06fd5ca36 100644 --- a/man/LearnerClassif.Rd +++ b/man/LearnerClassif.Rd @@ -138,6 +138,7 @@ The following properties are currently standardized and understood by learners i \itemize{ \item \code{"missings"}: The learner can handle missing values in the data. \item \code{"weights"}: The learner supports observation weights. +\item \code{"offset"}: The learner can incorporate offset values to adjust predictions. \item \code{"importance"}: The learner supports extraction of importance scores, i.e. comes with an \verb{$importance()} extractor function (see section on optional extractors in \link{Learner}). \item \code{"selected_features"}: The learner supports extraction of the set of selected features, i.e. comes with a \verb{$selected_features()} extractor function (see section on optional extractors in \link{Learner}). \item \code{"oob_error"}: The learner supports extraction of estimated out of bag error, i.e. comes with a \code{oob_error()} extractor function (see section on optional extractors in \link{Learner}). diff --git a/man/LearnerRegr.Rd b/man/LearnerRegr.Rd index dca2085c7..5d8825bb7 100644 --- a/man/LearnerRegr.Rd +++ b/man/LearnerRegr.Rd @@ -142,6 +142,7 @@ The following properties are currently standardized and understood by learners i \itemize{ \item \code{"missings"}: The learner can handle missing values in the data. \item \code{"weights"}: The learner supports observation weights. +\item \code{"offset"}: The learner can incorporate offset values to adjust predictions. \item \code{"importance"}: The learner supports extraction of importance scores, i.e. comes with an \verb{$importance()} extractor function (see section on optional extractors in \link{Learner}). \item \code{"selected_features"}: The learner supports extraction of the set of selected features, i.e. comes with a \verb{$selected_features()} extractor function (see section on optional extractors in \link{Learner}). \item \code{"oob_error"}: The learner supports extraction of estimated out of bag error, i.e. comes with a \code{oob_error()} extractor function (see section on optional extractors in \link{Learner}). diff --git a/man/as_resampling.Rd b/man/as_resampling.Rd index 1ed91e75a..02784a7ec 100644 --- a/man/as_resampling.Rd +++ b/man/as_resampling.Rd @@ -30,5 +30,5 @@ If \code{TRUE}, ensures that the returned object is not the same as the input \c } \description{ Convert object to a \link{Resampling} or a list of \link{Resampling}. -This method e.g. allows to convert an \code{\link[mlr3oml:OMLTask]{mlr3oml::OMLTask}} to a \code{\link{Resampling}}. +This method e.g. allows to convert an \code{\link[mlr3oml:oml_task]{mlr3oml::OMLTask}} to a \code{\link{Resampling}}. } diff --git a/man/as_task.Rd b/man/as_task.Rd index ca7b2060e..8aff6071e 100644 --- a/man/as_task.Rd +++ b/man/as_task.Rd @@ -30,6 +30,6 @@ If \code{TRUE}, ensures that the returned object is not the same as the input \c } \description{ Convert object to a \link{Task} or a list of \link{Task}. -This method e.g. allows to convert an \code{\link[mlr3oml:OMLTask]{mlr3oml::OMLTask}} to a \code{\link{Task}} and additionally supports cloning. +This method e.g. allows to convert an \code{\link[mlr3oml:oml_task]{mlr3oml::OMLTask}} to a \code{\link{Task}} and additionally supports cloning. In order to construct a \link{Task} from a \code{data.frame}, use task-specific converters such as \code{\link[=as_task_classif]{as_task_classif()}} or \code{\link[=as_task_regr]{as_task_regr()}}. } From e9d651e6b2c02736cb80c3ab413e13a2c941c573 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 31 Jan 2025 17:43:14 +0100 Subject: [PATCH 20/20] update news --- NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 156b8b111..85b06bdac 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # mlr3 (development version) -* feat: add new `col_role` offset in `Task`. +* feat: add new `col_role` offset in `Task` and offset `Learner` property. +A warning is produced if a learner that doesn't support offsets is trained with a task that has an offset column. * fix: the `$predict_newdata()` method of `Learner` now automatically conducts type conversions (#685) * BREAKING_CHANGE: Predicting on a `task` with the wrong column information is now an error and not a warning. * Column names with UTF-8 characters are now allowed by default.