diff --git a/DESCRIPTION b/DESCRIPTION index 16060dae1..4109f29a7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,7 +28,9 @@ Authors@R: comment = c(ORCID = "0000-0002-8115-0400")), person("Sebastian", "Fischer", , "sebf.fischer@gmail.com", role = "aut", comment = c(ORCID = "0000-0002-9609-3197")), - person("Lona", "Koers", , "lona.koers@gmail.com", role = "ctb") + person("Lona", "Koers", , "lona.koers@gmail.com", role = "ctb"), + person("John", "Zobolas", , "bblodfon@gmail.com", role = "ctb", + comment = c(ORCID = "0000-0002-3609-8674")) ) Description: Efficient, object-oriented programming on the building blocks of machine learning. Provides 'R6' objects for tasks, diff --git a/NEWS.md b/NEWS.md index 89892f152..85b06bdac 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,9 @@ # mlr3 (development version) +* feat: add new `col_role` offset in `Task` and offset `Learner` property. +A warning is produced if a learner that doesn't support offsets is trained with a task that has an offset column. * fix: the `$predict_newdata()` method of `Learner` now automatically conducts type conversions (#685) -* BREAKING_CHANGE: Predicting on a `task` with the wrong column information is now an error and not a warning. +* BREAKING_CHANGE: Predicting on a `task` with the wrong column information is now an error and not a warning. * Column names with UTF-8 characters are now allowed by default. The option `mlr3.allow_utf8_names` is removed. * BREAKING CHANGE: `Learner$predict_types` is read-only now. 
diff --git a/R/Task.R b/R/Task.R index 66b7cafc5..593f359b1 100644 --- a/R/Task.R +++ b/R/Task.R @@ -496,7 +496,7 @@ Task = R6Class("Task", } # columns with these roles must be present in data - mandatory_roles = c("target", "feature", "weight", "group", "stratum", "order") + mandatory_roles = c("target", "feature", "weight", "group", "stratum", "order", "offset") mandatory_cols = unlist(private$.col_roles[mandatory_roles], use.names = FALSE) missing_cols = setdiff(mandatory_cols, data$colnames) if (length(missing_cols)) { @@ -896,6 +896,7 @@ Task = R6Class("Task", #' * `"strata"`: The task is resampled using one or more stratification variables (role `"stratum"`). #' * `"groups"`: The task comes with grouping/blocking information (role `"group"`). #' * `"weights"`: The task comes with observation weights (role `"weight"`). + #' * `"offset"`: The task includes one or more offset columns specifying fixed adjustments for model training and possibly for prediction (role `"offset"`). #' * `"ordered"`: The task has columns which define the row order (role `"order"`). #' #' Note that above listed properties are calculated from the `$col_roles` and may not be set explicitly. @@ -907,6 +908,7 @@ Task = R6Class("Task", if (length(col_roles$group)) "groups" else NULL, if (length(col_roles$stratum)) "strata" else NULL, if (length(col_roles$weight)) "weights" else NULL, + if (length(col_roles$offset)) "offset" else NULL, if (length(col_roles$order)) "ordered" else NULL ) } else { @@ -951,6 +953,11 @@ Task = R6Class("Task", #' Not more than a single column can be associated with this role. #' * `"stratum"`: Stratification variables. Multiple discrete columns may have this role. #' * `"weight"`: Observation weights. Not more than one numeric column may have this role. + #' * `"offset"`: Numeric columns used to specify fixed adjustments for model training. 
+ #' Some models use offsets to simply shift predictions, while others incorporate them to boost predictions from a baseline model. + #' For learners supporting offsets in multiclass settings, an offset column must be provided for each target class. + #' These columns must follow the naming convention `"offset_{target_class_name}"`. + #' For an example of a learner that supports offsets, see [LearnerClassifXgboost][mlr3learners::LearnerClassifXgboost]. #' #' `col_roles` is a named list whose elements are named by column role and each element is a `character()` vector of column names. #' To alter the roles, just modify the list, e.g. with \R's set functions ([intersect()], [setdiff()], [union()], \ldots). @@ -1084,6 +1091,27 @@ Task = R6Class("Task", setnames(data, c("row_id", "weight"))[] }, + #' @field offset ([data.table::data.table()])\cr + #' If the task has a column with designated role `"offset"`, a table with two or more columns: + #' + #' * `row_id` (`integer()`), and + #' * offset variable(s) (`numeric()`). + #' + #' For regression or binary classification tasks, there will be only a single-column offset. + #' For multiclass tasks, it may return multiple offset columns, one for each target class. + #' + #' If there are no columns with the `"offset"` role, `NULL` is returned. + offset = function(rhs) { + assert_has_backend(self) + assert_ro_binding(rhs) + offset_cols = private$.col_roles$offset + if (length(offset_cols) == 0L) { + return(NULL) + } + + data = self$backend$data(private$.row_roles$use, c(self$backend$primary_key, offset_cols)) + setnames(data, c("row_id", offset_cols))[] + }, #' @field labels (named `character()`)\cr #' Retrieve `labels` (prettier formated names) from columns. @@ -1250,6 +1278,17 @@ task_check_col_roles.Task = function(task, new_roles, ...) 
{ } } + # check offset: columns must be numeric/integer and free of missing values; both checks are skipped when no offset column is set + if (length(new_roles[["offset"]]) && any(fget(task$col_info, new_roles[["offset"]], "type", key = "id") %nin% c("numeric", "integer"))) { + stopf("Offset column(s) %s must be a numeric or integer column", paste0("'", new_roles[["offset"]], "'", collapse = ",")) + } + + if (length(new_roles[["offset"]]) && any(task$missings(cols = new_roles[["offset"]]) > 0)) { + missings = task$missings(cols = new_roles[["offset"]]) + missings = names(missings[missings > 0]) + stopf("Offset column(s) %s contain missing values", paste0("'", missings, "'", collapse = ",")) + } + return(new_roles) } @@ -1266,16 +1305,25 @@ task_check_col_roles.TaskClassif = function(task, new_roles, ...) { stopf("Target column(s) %s must be a factor or ordered factor", paste0("'", new_roles[["target"]], "'", collapse = ",")) } + if (length(new_roles[["offset"]]) > 1L && length(task$class_names) == 2L) { + stopf("There may only be up to one column with role 'offset' for binary classification tasks") + } + + if (length(new_roles[["offset"]]) > 1L) { + expected_names = paste0("offset_", task$class_names) + assert_subset(new_roles[["offset"]], expected_names, .var.name = "col_roles") + } + NextMethod() } #' @rdname task_check_col_roles #' @export task_check_col_roles.TaskRegr = function(task, new_roles, ...) 
{ - - # check target - if (length(new_roles[["target"]]) > 1L) { - stopf("There may only be up to one column with role 'target'") + for (role in c("target", "offset")) { + if (length(new_roles[[role]]) > 1L) { + stopf("There may only be up to one column with role '%s'", role) + } } if (length(new_roles[["target"]]) && any(fget(task$col_info, new_roles[["target"]], "type", key = "id") %nin% c("numeric", "integer"))) { diff --git a/R/assertions.R b/R/assertions.R index bd1a529ba..d706de9ee 100644 --- a/R/assertions.R +++ b/R/assertions.R @@ -145,6 +145,11 @@ assert_task_learner = function(task, learner, cols = NULL) { } } + if ("offset" %in% task$properties && "offset" %nin% learner$properties) { + warningf("Task '%s' has offset, but learner '%s' does not support this, so it will be ignored", + task$id, learner$id) + } + tmp = mlr_reflections$task_mandatory_properties[[task$task_type]] if (length(tmp)) { tmp = setdiff(intersect(task$properties, tmp), learner$properties) diff --git a/R/mlr_reflections.R b/R/mlr_reflections.R index 4ac532741..7d6ae5fec 100644 --- a/R/mlr_reflections.R +++ b/R/mlr_reflections.R @@ -94,14 +94,14 @@ local({ "use" ) - tmp = c("feature", "target", "name", "order", "stratum", "group", "weight") + tmp = c("feature", "target", "name", "order", "stratum", "group", "weight", "offset") mlr_reflections$task_col_roles = list( regr = tmp, classif = tmp, unsupervised = c("feature", "name", "order") ) - tmp = c("strata", "groups", "weights") + tmp = c("strata", "groups", "weights", "offset") mlr_reflections$task_properties = list( classif = c(tmp, "twoclass", "multiclass"), regr = tmp, @@ -114,11 +114,11 @@ local({ mlr_reflections$task_print_col_roles = list( before = character(), - after = c("Order by" = "order", "Strata" = "stratum", "Groups" = "group", "Weights" = "weight") + after = c("Order by" = "order", "Strata" = "stratum", "Groups" = "group", "Weights" = "weight", "Offset" = "offset") ) ### Learner - tmp = c("featureless", "missings", 
"weights", "importance", "selected_features", "oob_error", "hotstart_forward", "hotstart_backward", "validation", "internal_tuning", "marshal") + tmp = c("featureless", "missings", "weights", "importance", "selected_features", "oob_error", "hotstart_forward", "hotstart_backward", "validation", "internal_tuning", "marshal", "offset") mlr_reflections$learner_properties = list( classif = c(tmp, "twoclass", "multiclass"), regr = tmp diff --git a/inst/testthat/helper_autotest.R b/inst/testthat/helper_autotest.R index fc3fcc41b..16ddc4569 100644 --- a/inst/testthat/helper_autotest.R +++ b/inst/testthat/helper_autotest.R @@ -74,11 +74,17 @@ generate_generic_tasks = function(learner, proto) { # task with weights if ("weights" %in% learner$properties) { tmp = proto$clone(deep = TRUE)$cbind(data.frame(weights = runif(n))) - tmp$col_roles$weight = "weights" - tmp$col_roles$features = setdiff(tmp$col_roles$features, "weights") + tmp$set_col_roles(cols = "weights", roles = "weight") tasks$weights = tmp } + # task with offset + if ("offset" %in% learner$properties) { + tmp = proto$clone(deep = TRUE)$cbind(data.frame(offset = runif(n))) + tmp$set_col_roles(cols = "offset", roles = "offset") + tasks$offset = tmp + } + # task with non-ascii feature names if (p > 0L) { sel = proto$feature_types[list(learner$feature_types), "id", on = "type", with = FALSE, nomatch = NULL][[1L]] diff --git a/man-roxygen/param_learner_properties.R b/man-roxygen/param_learner_properties.R index b53e6d420..426cab517 100644 --- a/man-roxygen/param_learner_properties.R +++ b/man-roxygen/param_learner_properties.R @@ -4,6 +4,7 @@ #' The following properties are currently standardized and understood by learners in \CRANpkg{mlr3}: #' * `"missings"`: The learner can handle missing values in the data. #' * `"weights"`: The learner supports observation weights. +#' * `"offset"`: The learner can incorporate offset values to adjust predictions. 
#' * `"importance"`: The learner supports extraction of importance scores, i.e. comes with an `$importance()` extractor function (see section on optional extractors in [Learner]). #' * `"selected_features"`: The learner supports extraction of the set of selected features, i.e. comes with a `$selected_features()` extractor function (see section on optional extractors in [Learner]). #' * `"oob_error"`: The learner supports extraction of estimated out of bag error, i.e. comes with a `oob_error()` extractor function (see section on optional extractors in [Learner]). @@ -11,3 +12,4 @@ #' * `"internal_tuning"`: The learner is able to internally optimize hyperparameters (those are also tagged with `"internal_tuning"`). #' * `"marshal"`: To save learners with this property, you need to call `$marshal()` first. #' If a learner is in a marshaled state, you call first need to call `$unmarshal()` to use its model, e.g. for prediction. +#' diff --git a/man/Learner.Rd b/man/Learner.Rd index ca54c1a11..574acb1a9 100644 --- a/man/Learner.Rd +++ b/man/Learner.Rd @@ -373,6 +373,7 @@ The following properties are currently standardized and understood by learners i \itemize{ \item \code{"missings"}: The learner can handle missing values in the data. \item \code{"weights"}: The learner supports observation weights. +\item \code{"offset"}: The learner can incorporate offset values to adjust predictions. \item \code{"importance"}: The learner supports extraction of importance scores, i.e. comes with an \verb{$importance()} extractor function (see section on optional extractors in \link{Learner}). \item \code{"selected_features"}: The learner supports extraction of the set of selected features, i.e. comes with a \verb{$selected_features()} extractor function (see section on optional extractors in \link{Learner}). \item \code{"oob_error"}: The learner supports extraction of estimated out of bag error, i.e. 
comes with a \code{oob_error()} extractor function (see section on optional extractors in \link{Learner}). diff --git a/man/LearnerClassif.Rd b/man/LearnerClassif.Rd index 64a29cf74..06fd5ca36 100644 --- a/man/LearnerClassif.Rd +++ b/man/LearnerClassif.Rd @@ -138,6 +138,7 @@ The following properties are currently standardized and understood by learners i \itemize{ \item \code{"missings"}: The learner can handle missing values in the data. \item \code{"weights"}: The learner supports observation weights. +\item \code{"offset"}: The learner can incorporate offset values to adjust predictions. \item \code{"importance"}: The learner supports extraction of importance scores, i.e. comes with an \verb{$importance()} extractor function (see section on optional extractors in \link{Learner}). \item \code{"selected_features"}: The learner supports extraction of the set of selected features, i.e. comes with a \verb{$selected_features()} extractor function (see section on optional extractors in \link{Learner}). \item \code{"oob_error"}: The learner supports extraction of estimated out of bag error, i.e. comes with a \code{oob_error()} extractor function (see section on optional extractors in \link{Learner}). diff --git a/man/LearnerRegr.Rd b/man/LearnerRegr.Rd index dca2085c7..5d8825bb7 100644 --- a/man/LearnerRegr.Rd +++ b/man/LearnerRegr.Rd @@ -142,6 +142,7 @@ The following properties are currently standardized and understood by learners i \itemize{ \item \code{"missings"}: The learner can handle missing values in the data. \item \code{"weights"}: The learner supports observation weights. +\item \code{"offset"}: The learner can incorporate offset values to adjust predictions. \item \code{"importance"}: The learner supports extraction of importance scores, i.e. comes with an \verb{$importance()} extractor function (see section on optional extractors in \link{Learner}). \item \code{"selected_features"}: The learner supports extraction of the set of selected features, i.e. 
comes with a \verb{$selected_features()} extractor function (see section on optional extractors in \link{Learner}). \item \code{"oob_error"}: The learner supports extraction of estimated out of bag error, i.e. comes with a \code{oob_error()} extractor function (see section on optional extractors in \link{Learner}). diff --git a/man/Task.Rd b/man/Task.Rd index 53a7afe97..9272afcc3 100644 --- a/man/Task.Rd +++ b/man/Task.Rd @@ -198,6 +198,7 @@ The following properties are currently standardized and understood by tasks in \ \item \code{"strata"}: The task is resampled using one or more stratification variables (role \code{"stratum"}). \item \code{"groups"}: The task comes with grouping/blocking information (role \code{"group"}). \item \code{"weights"}: The task comes with observation weights (role \code{"weight"}). +\item \code{"offset"}: The task includes one or more offset columns specifying fixed adjustments for model training and possibly for prediction (role \code{"offset"}). \item \code{"ordered"}: The task has columns which define the row order (role \code{"order"}). } @@ -226,6 +227,11 @@ For each resampling iteration, observations of the same group will be exclusivel Not more than a single column can be associated with this role. \item \code{"stratum"}: Stratification variables. Multiple discrete columns may have this role. \item \code{"weight"}: Observation weights. Not more than one numeric column may have this role. +\item \code{"offset"}: Numeric columns used to specify fixed adjustments for model training. +Some models use offsets to simply shift predictions, while others incorporate them to boost predictions from a baseline model. +For learners supporting offsets in multiclass settings, an offset column must be provided for each target class. +These columns must follow the naming convention \code{"offset_{target_class_name}"}. +For an example of a learner that supports offsets, see \link[mlr3learners:mlr_learners_classif.xgboost]{LearnerClassifXgboost}. 
} \code{col_roles} is a named list whose elements are named by column role and each element is a \code{character()} vector of column names. @@ -286,6 +292,18 @@ If the task has a column with designated role \code{"weight"}, a table with two Returns \code{NULL} if there are is no weight column.} +\item{\code{offset}}{(\code{\link[data.table:data.table]{data.table::data.table()}})\cr +If the task has a column with designated role \code{"offset"}, a table with two or more columns: +\itemize{ +\item \code{row_id} (\code{integer()}), and +\item offset variable(s) (\code{numeric()}). +} + +For regression or binary classification tasks, there will be only a single-column offset. +For multiclass tasks, it may return multiple offset columns, one for each target class. + +If there are no columns with the \code{"offset"} role, \code{NULL} is returned.} + \item{\code{labels}}{(named \code{character()})\cr Retrieve \code{labels} (prettier formated names) from columns. Internally queries the column \code{label} of the table in field \code{col_info}. 
diff --git a/man/mlr3-package.Rd b/man/mlr3-package.Rd index 9c4f5f689..71e978ee8 100644 --- a/man/mlr3-package.Rd +++ b/man/mlr3-package.Rd @@ -105,6 +105,7 @@ Other contributors: \item Quay Au \email{quayau@gmail.com} (\href{https://orcid.org/0000-0002-5252-8902}{ORCID}) [contributor] \item Lennart Schneider \email{lennart.sch@web.de} (\href{https://orcid.org/0000-0003-4152-5308}{ORCID}) [contributor] \item Lona Koers \email{lona.koers@gmail.com} [contributor] + \item John Zobolas \email{bblodfon@gmail.com} (\href{https://orcid.org/0000-0002-3609-8674}{ORCID}) [contributor] } } diff --git a/tests/testthat/test_Task.R b/tests/testthat/test_Task.R index fbfba18eb..ab084c430 100644 --- a/tests/testthat/test_Task.R +++ b/tests/testthat/test_Task.R @@ -248,15 +248,18 @@ test_that("stratify works", { }) test_that("groups/weights work", { - b = as_data_backend(data.table(x = runif(20), y = runif(20), w = runif(20), g = sample(letters[1:2], 20, replace = TRUE))) + b = as_data_backend(data.table(x = runif(20), y = runif(20), w = runif(20), + o = runif(20), g = sample(letters[1:2], 20, replace = TRUE))) task = TaskRegr$new("test", b, target = "y") task$set_row_roles(16:20, character()) expect_false("groups" %chin% task$properties) expect_false("weights" %chin% task$properties) + expect_false("offset" %chin% task$properties) expect_null(task$groups) expect_null(task$weights) + # weight task$col_roles$weight = "w" expect_subset("weights", task$properties) expect_data_table(task$weights, ncols = 2, nrows = 15) @@ -265,6 +268,7 @@ test_that("groups/weights work", { task$col_roles$weight = character() expect_true("weights" %nin% task$properties) + # group task$col_roles$group = "g" expect_subset("groups", task$properties) expect_data_table(task$groups, ncols = 2, nrows = 15) @@ -726,3 +730,4 @@ test_that("warn when internal valid task has 0 obs", { task = tsk("iris") expect_warning({task$internal_valid_task = 151}, "has 0 observations") }) + diff --git 
a/tests/testthat/test_TaskClassif.R b/tests/testthat/test_TaskClassif.R index cd00edb5b..807f1ab98 100644 --- a/tests/testthat/test_TaskClassif.R +++ b/tests/testthat/test_TaskClassif.R @@ -112,3 +112,53 @@ test_that("target is encoded as factor (#629)", { dt$target = ordered(dt$target) TaskClassif$new(id = "XX", backend = dt, target = "target") }) + +test_that("offset column role works with binary tasks", { + task = tsk("pima") + expect_null(task$offset) + + task$set_col_roles("age", "offset") + expect_subset("offset", task$properties) + expect_data_table(task$offset, nrows = task$nrow, ncols = 2) + expect_subset("age", names(task$offset)) + + expect_error({ + task$col_roles$offset = c("glucose", "diabetes") + }, "There may only be up to one column with role") + + expect_error({ + task$col_roles$offset = c("glucose") + }, "contain missing values") + + expect_warning(lrn("classif.rpart")$train(task), "has offset") +}) + +test_that("offset column role works with multiclass tasks", { + task = tsk("penguins") + expect_null(task$offset) + task$set_col_roles("year", "offset") + expect_subset("offset", task$properties) + expect_data_table(task$offset, nrows = task$nrow, ncols = 2) + expect_subset(c("row_id", "year"), names(task$offset)) + + expect_error({ + task$col_roles$offset = "bill_length" + }, "contain missing values") + + task = tsk("wine") + + expect_error({ + task$col_roles$offset = c("alcohol", "ash") + }, "Must be a subset of") + + task = tsk("wine") + data = task$data() + set(data, j = "offset_1", value = runif(nrow(data))) + set(data, j = "offset_2", value = runif(nrow(data))) + task = as_task_classif(data, target = "type") + task$set_col_roles(c("offset_1", "offset_2"), "offset") + + expect_subset("offset", task$properties) + expect_data_table(task$offset, nrows = task$nrow, ncols = 3) + expect_subset(c("row_id", "offset_1", "offset_2"), names(task$offset)) +}) diff --git a/tests/testthat/test_TaskRegr.R b/tests/testthat/test_TaskRegr.R index 
29ade6eae..b5e9c939c 100644 --- a/tests/testthat/test_TaskRegr.R +++ b/tests/testthat/test_TaskRegr.R @@ -49,3 +49,21 @@ test_that("$add_strata", { task$add_strata(task$target_names, bins = 2) expect_identical(task$strata$N, c(50L, 10L)) }) + +test_that("offset column role works", { + task = tsk("mtcars") + expect_null(task$offset) + task$set_col_roles("am", "offset") + + expect_subset("offset", task$properties) + expect_data_table(task$offset, nrows = task$nrow, ncols = 2) + expect_subset(c("row_id", "am"), names(task$offset)) + + expect_error({ + task$col_roles$offset = c("am", "gear") + }, "up to one") + + task$col_roles$offset = character() + expect_true("offset" %nin% task$properties) + expect_null(task$offset) +})