diff --git a/.Rbuildignore b/.Rbuildignore index b8486320..fc58474a 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -19,3 +19,4 @@ ^CODE_OF_CONDUCT\.md$ ^inst/manuscript/output$ ^CRAN-SUBMISSION$ +^.vscode diff --git a/NAMESPACE b/NAMESPACE index f694e1e2..068c220a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -82,7 +82,6 @@ export(quantile_score) export(score) export(se_mean_sample) export(select_metrics) -export(set_forecast_unit) export(summarise_scores) export(summarize_scores) export(theme_scoringutils) diff --git a/NEWS.md b/NEWS.md index 2ca5f461..a8fff9c1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -82,6 +82,7 @@ of our [original](https://doi.org/10.48550/arXiv.2205.07090) `scoringutils` pape - Removed the function `plot_score_table()`. You can find the code in the Deprecated-visualisations Vignette. - Removed the function `merge_pred_and_obs()` that was used to merge two separate data frames with forecasts and observations. We moved its contents to a new "Deprecated functions"-vignette. - Removed `interval_coverage_sample()` as users are now expected to convert to a quantile format first before scoring. +- The function `set_forecast_unit()` is no longer exported and is now an internal function. Instead, use the `forecast_unit` argument of the `as_forecast_<type>()` functions (e.g. `as_forecast_point()`) or of `get_duplicate_forecasts()`. ### Function changes - `bias_quantile()` changed the way it handles forecasts where the median is missing: The median is now imputed by linear interpolation between the innermost quantiles. Previously, we imputed the median by simply taking the mean of the innermost quantiles. diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 4bbb93da..3d8196f7 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -266,12 +266,11 @@ log_shift <- function(x, offset = 0, base = exp(1)) { #' scoring or denote the unit of a single forecast as specified by the user. 
#' @importFrom data.table ':=' is.data.table copy #' @importFrom checkmate assert_character assert_subset -#' @export #' @keywords as_forecast #' @examples #' library(magrittr) # pipe operator #' example_quantile %>% -#' set_forecast_unit( +#' scoringutils:::set_forecast_unit( #' c("location", "target_end_date", "target_type", "horizon", "model") #' ) set_forecast_unit <- function(data, forecast_unit) { diff --git a/R/forecast.R b/R/forecast.R index ac17d1f4..4165d2d7 100644 --- a/R/forecast.R +++ b/R/forecast.R @@ -16,7 +16,7 @@ #' existing columns of their input data to match the required columns for a #' forecast object. Using the argument `forecast_unit`, users can specify the #' the columns that uniquely identify a single forecast (and remove the others, -#' see [set_forecast_unit()] for details). +#' see docs for the internal [set_forecast_unit()] for details). #' #' The following functions are available: #' - [as_forecast_point()] diff --git a/R/get_-functions.R b/R/get_-functions.R index b0038cd4..3836f75a 100644 --- a/R/get_-functions.R +++ b/R/get_-functions.R @@ -279,11 +279,11 @@ get_protected_columns <- function(data = NULL) { #' @title Find duplicate forecasts #' #' @description -#' Helper function to identify duplicate forecasts, i.e. +#' Internal helper function to identify duplicate forecasts, i.e. #' instances where there is more than one forecast for the same prediction #' target. #' -#' @param data A data.frame as used for [score()] +#' @inheritParams as_forecast #' @param counts Should the output show the number of duplicates per forecast #' unit instead of the individual duplicated rows? Default is `FALSE`. 
#' @return A data.frame with all rows for which a duplicate forecast was found @@ -297,10 +297,15 @@ get_protected_columns <- function(data = NULL) { get_duplicate_forecasts <- function( data, + forecast_unit = NULL, counts = FALSE ) { assert_data_frame(data) data <- ensure_data.table(data) + + if (!is.null(forecast_unit)) { + data <- set_forecast_unit(data, forecast_unit) + } forecast_unit <- get_forecast_unit(data) available_type <- c("sample_id", "quantile_level", "predicted_label") %in% colnames(data) type <- c("sample_id", "quantile_level", "predicted_label")[available_type] diff --git a/man/as_forecast.Rd b/man/as_forecast.Rd index 629ee10d..f57886dc 100644 --- a/man/as_forecast.Rd +++ b/man/as_forecast.Rd @@ -48,7 +48,7 @@ Using the arguments \code{observed}, \code{predicted}, \code{model}, etc. users existing columns of their input data to match the required columns for a forecast object. Using the argument \code{forecast_unit}, users can specify the the columns that uniquely identify a single forecast (and remove the others, -see \code{\link[=set_forecast_unit]{set_forecast_unit()}} for details). +see docs for the internal \code{\link[=set_forecast_unit]{set_forecast_unit()}} for details). The following functions are available: \itemize{ diff --git a/man/check_duplicates.Rd b/man/check_duplicates.Rd index d39df769..7473f1e1 100644 --- a/man/check_duplicates.Rd +++ b/man/check_duplicates.Rd @@ -7,7 +7,9 @@ check_duplicates(data) } \arguments{ -\item{data}{A data.frame as used for \code{\link[=score]{score()}}} +\item{data}{A data.frame (or similar) with predicted and observed values. 
+See the details section of \code{\link[=as_forecast]{as_forecast()}} for additional information +on required input formats.} } \value{ Returns TRUE if the check was successful and a string with an diff --git a/man/get_duplicate_forecasts.Rd b/man/get_duplicate_forecasts.Rd index db6f5a89..298d0bd5 100644 --- a/man/get_duplicate_forecasts.Rd +++ b/man/get_duplicate_forecasts.Rd @@ -4,10 +4,19 @@ \alias{get_duplicate_forecasts} \title{Find duplicate forecasts} \usage{ -get_duplicate_forecasts(data, counts = FALSE) +get_duplicate_forecasts(data, forecast_unit = NULL, counts = FALSE) } \arguments{ -\item{data}{A data.frame as used for \code{\link[=score]{score()}}} +\item{data}{A data.frame (or similar) with predicted and observed values. +See the details section of \code{\link[=as_forecast]{as_forecast()}} for additional information +on required input formats.} + +\item{forecast_unit}{(optional) Name of the columns in \code{data} (after +any renaming of columns) that denote the unit of a +single forecast. See \code{\link[=get_forecast_unit]{get_forecast_unit()}} for details. +If \code{NULL} (the default), all columns that are not required columns are +assumed to form the unit of a single forecast. If specified, all columns +that are not part of the forecast unit (or required columns) will be removed.} \item{counts}{Should the output show the number of duplicates per forecast unit instead of the individual duplicated rows? Default is \code{FALSE}.} @@ -16,7 +25,7 @@ unit instead of the individual duplicated rows? Default is \code{FALSE}.} A data.frame with all rows for which a duplicate forecast was found } \description{ -Helper function to identify duplicate forecasts, i.e. +Internal helper function to identify duplicate forecasts, i.e. instances where there is more than one forecast for the same prediction target. 
} diff --git a/man/set_forecast_unit.Rd b/man/set_forecast_unit.Rd index 53ca80d8..d609442e 100644 --- a/man/set_forecast_unit.Rd +++ b/man/set_forecast_unit.Rd @@ -36,7 +36,7 @@ easier to debug and easier to read. \examples{ library(magrittr) # pipe operator example_quantile \%>\% - set_forecast_unit( + scoringutils:::set_forecast_unit( c("location", "target_end_date", "target_type", "horizon", "model") ) } diff --git a/tests/testthat/test-get_-functions.R b/tests/testthat/test-get_-functions.R index 6a785778..4e765e29 100644 --- a/tests/testthat/test-get_-functions.R +++ b/tests/testthat/test-get_-functions.R @@ -165,6 +165,14 @@ test_that("get_type() handles `NA` values", { # get_duplicate_forecasts() # ============================================================================== test_that("get_duplicate_forecasts() works as expected for quantile", { + expect_no_condition(get_duplicate_forecasts( + example_quantile, + forecast_unit = + c("location", "target_end_date", "target_type", "location_name", + "forecast_date", "model") + ) + ) + expect_equal(nrow(get_duplicate_forecasts(example_quantile)), 0) expect_equal( nrow(