diff --git a/NAMESPACE b/NAMESPACE index 67415dbe9..d3b6ffe22 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,8 +28,12 @@ export(bias_sample) export(brier_score) export(correlation) export(crps_sample) +export(dispersion) export(dss_sample) export(get_duplicate_forecasts) +export(interval_coverage_deviation_quantile) +export(interval_coverage_quantile) +export(interval_coverage_sample) export(interval_score) export(log_shift) export(logs_binary) @@ -39,6 +43,7 @@ export(make_NA) export(make_na) export(merge_pred_and_obs) export(new_scoringutils) +export(overprediction) export(pairwise_comparison) export(pit) export(pit_sample) @@ -54,6 +59,7 @@ export(plot_ranges) export(plot_score_table) export(plot_wis) export(quantile_score) +export(run_safely) export(sample_to_quantile) export(score) export(se_mean_sample) @@ -63,8 +69,10 @@ export(summarise_scores) export(summarize_scores) export(theme_scoringutils) export(transform_forecasts) +export(underprediction) export(validate) export(validate_general) +export(wis) importFrom(Metrics,ae) importFrom(Metrics,ape) importFrom(Metrics,se) @@ -74,12 +82,15 @@ importFrom(checkmate,assert_data_frame) importFrom(checkmate,assert_data_table) importFrom(checkmate,assert_factor) importFrom(checkmate,assert_list) +importFrom(checkmate,assert_logical) +importFrom(checkmate,assert_number) importFrom(checkmate,assert_numeric) importFrom(checkmate,check_atomic_vector) importFrom(checkmate,check_data_frame) importFrom(checkmate,check_function) importFrom(checkmate,check_matrix) importFrom(checkmate,check_numeric) +importFrom(checkmate,check_vector) importFrom(checkmate,test_factor) importFrom(checkmate,test_list) importFrom(checkmate,test_numeric) @@ -98,6 +109,7 @@ importFrom(data.table,nafill) importFrom(data.table,rbindlist) importFrom(data.table,setDT) importFrom(data.table,setattr) +importFrom(data.table,setcolorder) importFrom(data.table,setnames) importFrom(ggdist,geom_lineribbon) importFrom(ggplot2,.data) @@ -157,5 +169,6 @@ importFrom(stats,rbinom) importFrom(stats,reorder) importFrom(stats,runif) importFrom(stats,sd) +importFrom(stats,weighted.mean) importFrom(stats,wilcox.test) importFrom(utils,combn) diff --git a/NEWS.md b/NEWS.md index 718a0db9c..0faf9eaf2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,6 +22,7 @@ The update introduces breaking changes. If you want to keep using the older vers - `quantile`: numeric, a vector with quantile-levels. Can alternatively be a matrix of the same shape as `predicted`. - `check_forecasts()` was replaced by a new function `validate()`. `validate()` validates the input and in that sense fulfills the purpose of `check_forecasts()`. It has different methods: `validate.default()` assigns the input a class based on their forecast type. Other methods validate the input specifically for the various forecast types. - The functionality for computing pairwise comparisons was now split from `summarise_scores()`. Instead of doing pairwise comparisons as part of summarising scores, a new function, `add_pairwise_comparison()`, was introduced that takes summarised scores as an input and adds pairwise comparisons to it. +- `add_coverage()` was reworked completely. 
Its new purpose is to add coverage information to the raw forecast data (essentially fulfilling some of the functionality that was previously covered by `score_quantile()`).
- The function `find_duplicates()` was renamed to `get_duplicate_forecasts()`
- Changes to `avail_forecasts()` and `plot_avail_forecasts()`:
  - The function `avail_forecasts()` was renamed to `available_forecasts()` for consistency with `available_metrics()`. The old function, `avail_forecasts()`, is still available as an alias, but will be removed in the future.
diff --git a/R/add_coverage.R b/R/add_coverage.R
new file mode 100644
index 000000000..684556026
--- /dev/null
+++ b/R/add_coverage.R
@@ -0,0 +1,83 @@
+#' @title Add Coverage Values to Quantile-Based Forecasts
+#'
+#' @description Adds interval coverage of central prediction intervals,
+#' quantile coverage for predictive quantiles, as well as the deviation between
+#' desired and actual coverage to a data.table. Forecasts should be in a
+#' quantile format (following the input requirements of `score()`).
+#'
+#' **Interval coverage**
+#'
+#' Coverage for a given interval range is defined as the proportion of
+#' observations that fall within the corresponding central prediction intervals.
+#' Central prediction intervals are symmetric around the median and are formed
+#' by two quantiles that denote the lower and upper bound. For example, the 50%
+#' central prediction interval is the interval between the 0.25 and 0.75
+#' quantiles of the predictive distribution.
+#'
+#' The function `add_coverage()` computes the coverage per central prediction
+#' interval, so the coverage will always be either `TRUE` (observed value falls
+#' within the interval) or `FALSE` (observed value falls outside the interval).
+#' You can summarise the coverage values to get the proportion of observations
+#' that fall within the central prediction intervals.
+#'
+#' **Quantile coverage**
+#'
+#' Quantile coverage for a given quantile is defined as the proportion of
+#' observed values that are smaller than the corresponding predictive quantile.
+#' For example, the 0.5 quantile coverage is the proportion of observed values
+#' that are smaller than the 0.5 quantile of the predictive distribution.
+#'
+#' **Coverage deviation**
+#'
+#' The coverage deviation is the difference between the desired coverage and the
+#' actual coverage. For example, if the desired coverage is 90% and the actual
+#' coverage is 80%, the coverage deviation is -0.1.
+#'
+#' @inheritParams score
+#' @return a data.table with the input and columns "interval_coverage",
+#' "interval_coverage_deviation", "quantile_coverage",
+#' "quantile_coverage_deviation" added.
+#' @importFrom data.table setcolorder
+#' @examples
+#' library(magrittr) # pipe operator
+#' example_quantile %>%
+#'   add_coverage()
+#' @keywords scoring
+#' @export
+add_coverage <- function(data) {
+  stored_attributes <- get_scoringutils_attributes(data)
+  data <- validate(data)
+  forecast_unit <- get_forecast_unit(data)
+  data_cols <- colnames(data) # store so we can reset column order later
+
+  # what happens if quantiles are not symmetric around the median?
+  # should things error? Also write tests for that.
+ interval_data <- quantile_to_interval(data, format = "wide") + interval_data[, interval_coverage := ifelse( + observed <= upper & observed >= lower, + TRUE, + FALSE) + ][, c("lower", "upper", "observed") := NULL] + + data[, range := get_range_from_quantile(quantile)] + + data <- merge(interval_data, data, by = unique(c(forecast_unit, "range"))) + data[, interval_coverage_deviation := interval_coverage - range / 100] + data[, quantile_coverage := observed <= predicted] + data[, quantile_coverage_deviation := quantile_coverage - quantile] + + # reset column order + new_metrics <- c("interval_coverage", "interval_coverage_deviation", + "quantile_coverage", "quantile_coverage_deviation") + setcolorder(data, unique(c(data_cols, "range", new_metrics))) + + # add coverage "metrics" to list of stored metrics + # this makes it possible to use `summarise_scores()` later on + stored_attributes[["metric_names"]] <- c( + stored_attributes[["metric_names"]], + new_metrics + ) + data <- assign_attributes(data, stored_attributes) + return(data[]) +} diff --git a/R/check-input-helpers.R b/R/check-input-helpers.R index cfaa24b2c..95869f02b 100644 --- a/R/check-input-helpers.R +++ b/R/check-input-helpers.R @@ -297,12 +297,22 @@ check_columns_present <- function(data, columns) { } assert_character(columns, min.len = 1) colnames <- colnames(data) + missing <- list() for (x in columns){ if (!(x %in% colnames)) { - msg <- paste0("Column '", x, "' not found in data") - return(msg) + missing[[x]] <- x } } + missing <- unlist(missing) + if (length(missing) > 1) { + msg <- paste0( + "Columns '", paste(missing, collapse = "', '"), "' not found in data" + ) + return(msg) + } else if (length(missing) == 1) { + msg <- paste0("Column '", missing, "' not found in data") + return(msg) + } return(TRUE) } diff --git a/R/check-inputs-scoring-functions.R b/R/check-inputs-scoring-functions.R index 350a3af4f..0e4467e3c 100644 --- a/R/check-inputs-scoring-functions.R +++ b/R/check-inputs-scoring-functions.R @@ -50,14 +50,20 @@ check_input_sample <- function(observed, predicted) { #' @param quantile Input to be checked. Should be a vector of size N that #' denotes the quantile levels corresponding to the columns of the prediction #' matrix. -#' @importFrom checkmate assert assert_numeric check_matrix +#' @param unique_quantiles Input to be checked. Should be TRUE (default) or +#' FALSE. Whether the quantile levels are required to be unique or not. 
+#' @importFrom checkmate assert assert_numeric check_matrix check_vector
 #' @inherit document_assert_functions return
 #' @keywords internal
-assert_input_quantile <- function(observed, predicted, quantile) {
+assert_input_quantile <- function(observed, predicted, quantile,
+                                  unique_quantiles = TRUE) {
   assert_numeric(observed, min.len = 1)
   n_obs <- length(observed)
-  assert_numeric(quantile, min.len = 1, lower = 0, upper = 1)
+  assert_numeric(
+    quantile, min.len = 1, lower = 0, upper = 1,
+    unique = unique_quantiles
+  )
   n_quantiles <- length(quantile)
   if (n_obs == 1) {
     assert(
@@ -66,6 +72,7 @@ assert_input_quantile <- function(observed, predicted, quantile) {
       check_matrix(predicted, mode = "numeric",
                    nrows = n_obs, ncols = n_quantiles)
     )
+    assert(check_vector(quantile, len = length(predicted)))
   } else {
     assert(
       check_matrix(predicted, mode = "numeric",
@@ -85,6 +92,66 @@ check_input_quantile <- function(observed, predicted, quantile) {
 }
 
 
+#' @title Assert that inputs are correct for interval-based forecast
+#' @description Function assesses whether the inputs correspond to the
+#' requirements for scoring interval-based forecasts.
+#' @param observed Input to be checked. Should be a numeric vector with the
+#' observed values of size n
+#' @param lower Input to be checked. Should be a numeric vector of size n that
+#' holds the predicted value for the lower bounds of the prediction intervals.
+#' @param upper Input to be checked. Should be a numeric vector of size n that
+#' holds the predicted value for the upper bounds of the prediction intervals.
+#' @param range Input to be checked. Should be a vector of size n that
+#' denotes the interval range in percent. E.g. a value of 50 denotes a
+#' (25%, 75%) prediction interval.
+#' @importFrom rlang warn
+#' @inherit document_assert_functions return
+#' @keywords internal
+assert_input_interval <- function(observed, lower, upper, range) {
+
+  assert(check_numeric_vector(observed, min.len = 1))
+  n <- length(observed)
+  assert(check_numeric_vector(lower, len = n))
+  assert(check_numeric_vector(upper, len = n))
+  assert(
+    check_numeric_vector(range, len = 1, lower = 0, upper = 100),
+    check_numeric_vector(range, len = n, lower = 0, upper = 100)
+  )
+
+  diff <- upper - lower
+  diff <- diff[!is.na(diff)]
+  if (any(diff < 0)) {
+    stop(
+      "All values in `upper` need to be greater than or equal to ",
+      "the corresponding values in `lower`"
+    )
+  }
+  if (any(range > 0 & range < 1, na.rm = TRUE)) {
+    msg <- paste(
+      "Found interval ranges between 0 and 1. Are you sure that's right? An",
+      "interval range of 0.5 e.g. implies a (49.75%, 50.25%) prediction",
+      "interval. If you want to score a (25%, 75%) prediction interval, set",
+      "`interval_range = 50`."
+    )
+    rlang::warn(
+      message = msg, .frequency = "once",
+      .frequency_id = "small_interval_range"
+    )
+  }
+  return(invisible(NULL))
+}
+
+
+#' @title Check that inputs are correct for interval-based forecast
+#' @inherit assert_input_interval params description
+#' @inherit check_input_sample return description
+#' @keywords check-inputs
+check_input_interval <- function(observed, lower, upper, range) {
+  result <- check_try(assert_input_interval(observed, lower, upper, range))
+  return(result)
+}
+
+
+#' @title Assert that inputs are correct for binary forecast
+#' @description Function assesses whether the inputs correspond to the
+#' requirements for scoring binary forecasts.
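A minimal usage sketch for the new interval input checks, with made-up illustrative values (not part of the patch itself): `assert_input_interval()` errors on invalid input, while `check_input_interval()` is expected to return `TRUE` or the failure message instead.

observed <- c(1, -15, 22)
lower <- c(0, -20, 20)  # e.g. the 0.25 quantiles of each predictive distribution
upper <- c(2, -10, 24)  # e.g. the 0.75 quantiles

assert_input_interval(observed, lower, upper, range = 50)  # invisible(NULL) on success
check_input_interval(observed, lower, upper, range = 50)   # TRUE, or a message string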
diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 870e4ac3d..31f3ab5ab 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -235,21 +235,13 @@ log_shift <- function(x, offset = 0, base = exp(1)) { #' example_quantile, #' c("location", "target_end_date", "target_type", "horizon", "model") #' ) - set_forecast_unit <- function(data, forecast_unit) { - - datacols <- colnames(data) - missing <- forecast_unit[!(forecast_unit %in% datacols)] - - if (length(missing) > 0) { - warning( - "Column(s) '", - missing, - "' are not columns of the data and will be ignored." - ) - forecast_unit <- intersect(forecast_unit, datacols) + data <- ensure_data.table(data) + missing <- check_columns_present(data, forecast_unit) + if (!is.logical(missing)) { + warning(missing) + forecast_unit <- intersect(forecast_unit, colnames(data)) } - keep_cols <- c(get_protected_columns(data), forecast_unit) out <- unique(data[, .SD, .SDcols = keep_cols])[] return(out) diff --git a/R/data.R b/R/data.R index 520a44cac..c57e044d8 100644 --- a/R/data.R +++ b/R/data.R @@ -19,7 +19,7 @@ #' \item{model}{name of the model that generated the forecasts} #' \item{horizon}{forecast horizon in weeks} #' } -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} "example_quantile" @@ -44,7 +44,7 @@ #' \item{model}{name of the model that generated the forecasts} #' \item{horizon}{forecast horizon in weeks} #' } -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} "example_point" @@ -69,7 +69,7 @@ #' \item{predicted}{predicted value} #' \item{sample_id}{id for the corresponding sample} #' } -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} "example_continuous" @@ -124,7 +124,7 @@ #' \item{horizon}{forecast horizon in weeks} #' \item{predicted}{predicted value} #' } -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} "example_binary" @@ -147,7 +147,7 @@ #' \item{model}{name of the model that generated the forecasts} #' \item{horizon}{forecast horizon in weeks} #' } -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} "example_quantile_forecasts_only" @@ -167,7 +167,7 @@ #' \item{observed}{observed values} #' \item{location_name}{name of the country for which a prediction was made} #' } -#' @source 
\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
 "example_truth_only"
 
 #' Summary information for selected metrics
@@ -211,3 +211,18 @@
 #' - "se_mean" = [se_mean_sample()]
 #' @keywords info
 "metrics_sample"
+
+#' Default metrics for quantile-based forecasts.
+#'
+#' A named list with functions:
+#' - "wis" = [wis()]
+#' - "overprediction" = [overprediction()]
+#' - "underprediction" = [underprediction()]
+#' - "dispersion" = [dispersion()]
+#' - "bias" = [bias_quantile()]
+#' - "coverage_50" = \(...) {run_safely(..., range = 50, fun = [interval_coverage_quantile][interval_coverage_quantile()])}
+#' - "coverage_90" = \(...) {run_safely(..., range = 90, fun = [interval_coverage_quantile][interval_coverage_quantile()])}
+#' - "coverage_deviation" = [interval_coverage_deviation_quantile()]
+#' - "ae_median" = [ae_median_quantile()]
+#' @keywords info
+"metrics_quantile"
diff --git a/R/get_-functions.R b/R/get_-functions.R
index 22aaa47a9..b55e1ef4f 100644
--- a/R/get_-functions.R
+++ b/R/get_-functions.R
@@ -193,6 +193,8 @@ get_protected_columns <- function(data = NULL) {
   protected_columns <- c(
     "predicted", "observed", "sample_id", "quantile", "upper", "lower",
     "pit_value", "range", "boundary", "relative_skill", "scaled_rel_skill",
+    "interval_coverage", "interval_coverage_deviation",
+    "quantile_coverage", "quantile_coverage_deviation",
    available_metrics(),
    grep("coverage_", names(data), fixed = TRUE, value = TRUE)
  )
diff --git a/R/metrics-quantile.R b/R/metrics-quantile.R
index c9e3afb42..50f729f15 100644
--- a/R/metrics-quantile.R
+++ b/R/metrics-quantile.R
@@ -1,124 +1,325 @@
-#' @title Quantile Score
-#'
+################################################################################
+# Metrics with a many-to-one relationship between input and score
+################################################################################
+
+#' Weighted Interval Score (WIS)
 #' @description
-#' Proper Scoring Rule to score quantile predictions. Smaller values are better.
-#' The quantile score is
-#' closely related to the Interval score (see [interval_score()]) and is
-#' the quantile equivalent that works with single quantiles instead of
-#' central prediction intervals.
 #'
-#' @param quantile vector of size n with the quantile levels of the
-#' corresponding predictions.
-#' @param weigh if TRUE, weigh the score by alpha / 2, so it can be averaged
-#' into an interval score that, in the limit, corresponds to CRPS. Alpha is the
-#' value that corresponds to the (alpha/2) or (1 - alpha/2) quantiles provided
-#' and will be computed from the quantile. Alpha is the decimal value that
-#' represents how much is outside a central prediction interval (E.g. for a
-#' 90 percent central prediction interval, alpha is 0.1). Default: `TRUE`.
-#' @return vector with the scoring values
-#' @inheritParams interval_score
-#' @inheritParams ae_median_sample
-#' @examples
-#' observed <- rnorm(10, mean = 1:10)
-#' alpha <- 0.5
+#' The WIS is a proper scoring rule used to evaluate forecasts in an interval- /
+#' quantile-based format. See Bracher et al. (2021). Smaller values are better.
 #'
-#' lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10))
-#' upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 1:10))
+#' As the name suggests, the score assumes that a forecast comes in the form of
+#' one or multiple central prediction intervals.
+#' A prediction interval is
+#' characterised by a lower and an upper bound formed by a pair of predictive
+#' quantiles. For example, a 50% central prediction interval is formed by the
+#' 0.25 and 0.75 quantiles of the predictive distribution.
 #'
+#' **Interval score**
 #'
-#' qs_lower <- quantile_score(observed,
-#'   predicted = lower,
-#'   quantile = alpha / 2
-#' )
-#' qs_upper <- quantile_score(observed,
-#'   predicted = upper,
-#'   quantile = 1 - alpha / 2
-#' )
-#' interval_score <- (qs_lower + qs_upper) / 2
-#' @export
-#' @keywords metric
-#' @references Strictly Proper Scoring Rules, Prediction,and Estimation,
-#' Tilmann Gneiting and Adrian E. Raftery, 2007, Journal of the American
-#' Statistical Association, Volume 102, 2007 - Issue 477
+#' The interval score (IS) is the sum of three components:
+#' overprediction, underprediction and dispersion. For a single prediction
+#' interval only one of the components is non-zero. If for a single prediction
+#' interval the observed value is below the lower bound, then the interval
+#' score is equal to the absolute difference between the lower bound and the
+#' observed value ("underprediction"). "Overprediction" is defined analogously.
+#' If the observed value falls within the bounds of the prediction interval,
+#' then the interval score is equal to the width of the prediction interval,
+#' i.e. the difference between the upper and lower bound. For a single interval,
+#' we therefore have:
 #'
-#' Evaluating epidemic forecasts in an interval format,
-#' Johannes Bracher, Evan L. Ray, Tilmann Gneiting and Nicholas G. Reich,
-#'
-#
+#' \deqn{
+#' \textrm{IS} = (\textrm{upper} - \textrm{lower}) + \frac{2}{\alpha}(\textrm{lower}
+#' - \textrm{observed}) *
+#' \mathbf{1}(\textrm{observed} < \textrm{lower}) +
+#' \frac{2}{\alpha}(\textrm{observed} - \textrm{upper}) *
+#' \mathbf{1}(\textrm{observed} > \textrm{upper})
+#' }{
+#' score = (upper - lower) + 2/alpha * (lower - observed) *
+#' 1(observed < lower) + 2/alpha * (observed - upper) *
+#' 1(observed > upper)
+#' }
+#' where \eqn{\mathbf{1}()}{1()} is the indicator function.
+#' \eqn{\alpha}{alpha} is the decimal value that indicates how much is outside
+#' the prediction interval. For a 90% prediction interval, for example,
+#' \eqn{\alpha}{alpha} is equal to 0.1. No specific distribution is assumed,
+#' but the range has to be symmetric (i.e. you can't use the 0.1 quantile
+#' as the lower bound and the 0.7 quantile as the upper).
+#' Non-symmetric quantiles can be scored using the function [quantile_score()].
+#'
+#' Usually the interval score is weighted by a factor that makes sure that the
+#' average score across an increasing number of equally spaced
+#' quantiles converges to the continuous ranked probability score (CRPS). This
+#' weighted score is called the weighted interval score (WIS).
+#' The weight commonly used is \eqn{\alpha / 2}{alpha / 2}.
+#'
+#' **Quantile score**
+#'
+#' In addition to the interval score, there also exists a quantile score (QS)
+#' (see [quantile_score()]), which is equal to the so-called pinball loss.
+#' The quantile score can be computed for a single quantile (whereas the
+#' interval score requires two quantiles that form an interval). However,
+#' the intuitive decomposition into overprediction, underprediction and
+#' dispersion does not exist for the quantile score.
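To make the decomposition concrete, here is a hand-worked sketch of the interval score formula above (illustrative numbers only, not package output):

# a 50% central prediction interval (alpha = 0.5); the observed value
# falls below the lower bound, so only dispersion and underprediction
# contribute to the score
observed <- 1
lower <- 2
upper <- 6
alpha <- 0.5

dispersion <- upper - lower                                             # 4
underprediction <- 2 / alpha * (lower - observed) * (observed < lower)  # 4
overprediction <- 2 / alpha * (observed - upper) * (observed > upper)   # 0
interval_score_value <- dispersion + underprediction + overprediction   # 8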
+#'
+#' **Two versions of the weighted interval score**
+#'
+#' There are two ways to conceptualise the weighted interval score across
+#' several quantiles / prediction intervals and the median.
+#'
+#' In one view, you would treat the WIS as the average of quantile scores (and
+#' the median as the 0.5-quantile) (this is the default for `wis()`). In another
+#' view, you would treat the WIS as the average of several interval scores +
+#' the difference between observed value and median forecast. The effect of
+#' that is that in contrast to the first view, the median has twice as much
+#' weight (because it is weighted like a prediction interval, rather than like
+#' a single quantile). Both are valid ways to conceptualise the WIS and you
+#' can control the behaviour with the `count_median_twice` argument.
+#'
+#' **WIS components**:
+#' WIS components can be computed individually using the functions
+#' `overprediction`, `underprediction`, and `dispersion`.
+#'
+#' @inheritParams interval_score
+#' @param observed numeric vector of size n with the observed values
+#' @param predicted numeric nxN matrix of predictive
+#' quantiles, n (number of rows) being the number of forecasts (corresponding
+#' to the number of observed values) and N
+#' (number of columns) the number of quantiles per forecast.
+#' If `observed` is just a single number, then predicted can just be a
+#' vector of size N.
+#' @param quantile vector with quantile levels of size N
+#' @param count_median_twice if TRUE, count the median twice in the score
+#' @param na.rm if TRUE, ignore NA values when computing the score
+#' @importFrom stats weighted.mean
+#' @importFrom checkmate assert_logical
+#' @return
+#' `wis()`: a numeric vector with WIS values of size n (one per observation),
+#' or a list with separate entries if `separate_results` is `TRUE`.
+#' @export
+wis <- function(observed,
+                predicted,
+                quantile,
+                separate_results = FALSE,
+                weigh = TRUE,
+                count_median_twice = FALSE,
+                na.rm = TRUE) {
+  assert_input_quantile(observed, predicted, quantile)
+  reformatted <- quantile_to_interval(observed, predicted, quantile)
-quantile_score <- function(observed,
-                           predicted,
-                           quantile,
-                           weigh = TRUE) {
+  assert_logical(separate_results, len = 1)
+  assert_logical(weigh, len = 1)
+  assert_logical(count_median_twice, len = 1)
+  assert_logical(na.rm, len = 1)
 
-  # get central prediction interval which corresponds to given quantiles
-  central_interval <- abs(0.5 - quantile) * 2
-  alpha <- 1 - central_interval
+  if (separate_results) {
+    cols <- c("wis", "dispersion", "underprediction", "overprediction")
+  } else {
+    cols <- "wis"
+  }
 
-  # compute score - this is the version explained in the SI of Bracher et. al.
- error <- abs(predicted - observed) - score <- 2 * ifelse( - observed <= predicted, 1 - quantile, quantile - ) * error + reformatted[, eval(cols) := do.call( + interval_score, + list(observed = observed, + lower = lower, + upper = upper, + interval_range = range, + weigh = weigh, + separate_results = separate_results + ) + )] - # adapt score such that mean of unweighted quantile scores corresponds to - # unweighted interval score of the corresponding prediction interval - score <- 2 * score / alpha + if (!count_median_twice) { + reformatted[, weight := ifelse(range == 0, 0.5, 1)] + } else { + reformatted[, weight := 1] + } - if (weigh) { - score <- score * alpha / 2 + # summarise results by forecast_id + reformatted <- reformatted[ + , lapply(.SD, weighted.mean, na.rm = na.rm, w = weight), + by = c("forecast_id"), + .SDcols = colnames(reformatted) %like% paste(cols, collapse = "|") + ] + + if (separate_results) { + return(list( + wis = reformatted$wis, + dispersion = reformatted$dispersion, + underprediction = reformatted$underprediction, + overprediction = reformatted$overprediction + )) + } else { + return(reformatted$wis) } +} - return(score) + +#' @return +#' `dispersion()`: a numeric vector with dispersion values (one per observation) +#' @param ... Additional arguments passed on to `wis()` from functions +#' `overprediction()`, `underprediction()` and `dispersion()` +#' @export +#' @rdname wis +dispersion <- function(observed, predicted, quantile, ...) { + args <- list(...) + args$separate_results <- TRUE + assert_input_quantile(observed, predicted, quantile) + do.call(wis, c(list(observed), list(predicted), list(quantile), args))$dispersion } -#' @title Absolute Error of the Median (Quantile-based Version) +#' @return +#' `overprediction()`: a numeric vector with overprediction values (one per +#' observation) +#' @export +#' @rdname wis +overprediction <- function(observed, predicted, quantile, ...) { + args <- list(...) + args$separate_results <- TRUE + assert_input_quantile(observed, predicted, quantile) + do.call(wis, c(list(observed), list(predicted), list(quantile), args))$overprediction +} + + +#' @return +#' `underprediction()`: a numeric vector with underprediction values (one per +#' observation) +#' @export +#' @rdname wis +underprediction <- function(observed, predicted, quantile, ...) { + args <- list(...) + args$separate_results <- TRUE + assert_input_quantile(observed, predicted, quantile) + do.call(wis, c(list(observed), list(predicted), list(quantile), args))$underprediction +} + + +#' @title Interval Coverage (For Quantile-Based Forecasts) +#' @description Check whether the observed value is within a given central +#' prediction interval. The prediction interval is defined by a lower and an +#' upper bound formed by a pair of predictive quantiles. For example, a 50% +#' prediction interval is formed by the 0.25 and 0.75 quantiles of the +#' predictive distribution. +#' @inheritParams wis +#' @param range A single number with the range of the prediction interval in +#' percent (e.g. 50 for a 50% prediction interval) for which you want to compute +#' coverage. +#' @importFrom checkmate assert_number +#' @return A vector of length n with TRUE if the observed value is within the +#' corresponding prediction interval and FALSE otherwise. 
+#' @name interval_coverage
+#' @export
+#' @examples
+#' observed <- c(1, -15, 22)
+#' predicted <- rbind(
+#'   c(-1, 0, 1, 2, 3),
+#'   c(-2, 1, 2, 2, 4),
+#'   c(-2, 0, 3, 3, 4)
+#' )
+#' quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9)
+#' interval_coverage_quantile(observed, predicted, quantile)
+interval_coverage_quantile <- function(observed, predicted, quantile, range = 50) {
+  assert_input_quantile(observed, predicted, quantile)
+  assert_number(range)
+  necessary_quantiles <- c((100 - range) / 2, 100 - (100 - range) / 2) / 100
+  if (!all(necessary_quantiles %in% quantile)) {
+    warning(
+      "To compute the coverage for a range of ", range, "%, the quantiles ",
+      toString(necessary_quantiles), " are required. Returning `NA`.")
+    return(NA)
+  }
+  r <- range
+  reformatted <- quantile_to_interval(observed, predicted, quantile)
+  reformatted <- reformatted[range %in% r]
+  reformatted[, coverage := ifelse(
+    observed >= lower & observed <= upper, TRUE, FALSE
+  )]
+  return(reformatted$coverage)
+}
+
+
+#' @title Interval Coverage Deviation (For Quantile-Based Forecasts)
+#' @description Check the agreement between desired and actual interval coverage
+#' of a forecast.
 #'
-#' @description
-#' Absolute error of the median calculated as
+#' The function is similar to [interval_coverage_quantile()],
+#' but looks at all provided prediction intervals instead of only one. It
+#' compares nominal coverage (i.e. the desired coverage) with the actual
+#' observed coverage.
+#'
+#' A central symmetric prediction interval is defined by a lower and an
+#' upper bound formed by a pair of predictive quantiles. For example, a 50%
+#' prediction interval is formed by the 0.25 and 0.75 quantiles of the
+#' predictive distribution. Ideally, a forecaster should aim to cover about
+#' 50% of all observed values with their 50% prediction intervals, 90% of all
+#' observed values with their 90% prediction intervals, and so on.
+#'
+#' For every prediction interval, the deviation is computed as the difference
+#' between the observed coverage and the nominal coverage.
+#' For a single observed value and a single prediction interval,
+#' coverage is always either 0 or 1. This is not the case for a single observed
+#' value and multiple prediction intervals, but it still doesn't make that much
+#' sense to compare nominal (desired) coverage and actual coverage for a single
+#' observation. In that sense coverage deviation only really starts to make
+#' sense as a metric when averaged across multiple observations.
+#'
+#' Positive values of coverage deviation are an indication for underconfidence,
+#' i.e. the forecaster could likely have issued a narrower forecast. Negative
+#' values are an indication for overconfidence, i.e. the forecasts were too
+#' narrow.
 #'
 #' \deqn{
-#'   \textrm{abs}(\textrm{observed} - \textrm{prediction})
+#'   \textrm{coverage deviation} =
+#'   \mathbf{1}(\textrm{observed value falls within interval}) -
+#'   \textrm{nominal coverage}
 #' }{
-#'   abs(observed - median_prediction)
+#'   coverage deviation =
+#'   1(observed value falls within interval) - nominal coverage
 #' }
-#'
-#' The function was created for internal use within [score()], but can also
-#' used as a standalone function.
-#'
-#' @param predicted numeric vector with predictions, corresponding to the
-#' quantiles in a second vector, `quantiles`.
-#' @param quantiles numeric vector that denotes the quantile for the values
-#' in `predicted`. Only those predictions where `quantiles == 0.5` will
-#' be kept.
If `quantiles` is `NULL`, then all `predicted` and -#' `observed` will be used (this is then the same as [abs_error()]) -#' @return vector with the scoring values -#' @seealso [ae_median_sample()], [abs_error()] -#' @importFrom stats median -#' @inheritParams ae_median_sample -#' @examples -#' observed <- rnorm(30, mean = 1:30) -#' predicted_values <- rnorm(30, mean = 1:30) -#' ae_median_quantile(observed, predicted_values, quantiles = 0.5) +#' The coverage deviation is then averaged across all prediction intervals. +#' The median is ignored when computing coverage deviation. +#' @inheritParams wis +#' @return A numeric vector of length n with the coverage deviation for each +#' forecast (comprising one or multiple prediction intervals). #' @export -#' @keywords metric +#' @examples +#' observed <- c(1, -15, 22) +#' predicted <- rbind( +#' c(-1, 0, 1, 2, 3), +#' c(-2, 1, 2, 2, 4), +#' c(-2, 0, 3, 3, 4) +#' ) +#' quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9) +#' interval_coverage_deviation_quantile(observed, predicted, quantile) +interval_coverage_deviation_quantile <- function(observed, predicted, quantile) { + assert_input_quantile(observed, predicted, quantile) -ae_median_quantile <- function(observed, predicted, quantiles = NULL) { - if (!is.null(quantiles)) { - if (!any(quantiles == 0.5) && !anyNA(quantiles)) { - return(NA_real_) - warning( - "in order to compute the absolute error of the median, `0.5` must be ", - "among the quantiles given. Maybe you want to use `abs_error()`?" - ) - } - observed <- observed[quantiles == 0.5] - predicted <- predicted[quantiles == 0.5] + # transform available quantiles into central interval ranges + available_ranges <- unique(get_range_from_quantile(quantile)) + + # check if all necessary quantiles are available + necessary_quantiles <- unique(c( + (100 - available_ranges) / 2, + 100 - (100 - available_ranges) / 2) / 100 + ) + if (!all(necessary_quantiles %in% quantile)) { + missing <- necessary_quantiles[!necessary_quantiles %in% quantile] + warning( + "To compute coverage deviation, all quantiles must form central ", + "symmetric prediction intervals. Missing quantiles: ", + toString(missing), ". Returning `NA`.") + return(NA) } - abs_error_median <- abs(observed - predicted) - return(abs_error_median) -} + reformatted <- quantile_to_interval(observed, predicted, quantile)[range != 0] + reformatted[, coverage := ifelse( + observed >= lower & observed <= upper, TRUE, FALSE + )] + reformatted[, coverage_deviation := coverage - range / 100] + out <- reformatted[, .(coverage_deviation = mean(coverage_deviation)), + by = c("forecast_id")] + return(out$coverage_deviation) +} #' @title Determines Bias of Quantile Forecasts @@ -163,6 +364,7 @@ ae_median_quantile <- function(observed, predicted, quantiles = NULL) { #' which predictions were made. If this does not contain the median (0.5) then #' the median is imputed as being the mean of the two innermost quantiles. #' @inheritParams bias_range +#' @param na.rm logical. Should missing values be removed? 
#' @return scalar with the quantile bias for a single quantile prediction
 #' @author Nikos Bosse \email{nikosbosse@@gmail.com}
 #' @examples
@@ -181,40 +383,62 @@
 #' bias_quantile(observed, predicted, quantile)
 #' @export
 #' @keywords metric
-
-bias_quantile <- function(observed, predicted, quantile) {
-  # check that predictions and quantile have the same length
-  if (!length(predicted) == length(quantile)) {
-    stop("`predicted` and `quantile` must have the same length")
+bias_quantile <- function(observed, predicted, quantile, na.rm = TRUE) {
+  assert_input_quantile(observed, predicted, quantile)
+  n <- length(observed)
+  N <- length(quantile)
+  if (is.null(dim(predicted))) {
+    dim(predicted) <- c(n, N)
   }
+  bias <- sapply(1:n, function(i) {
+    bias_quantile_single_vector(observed[i], predicted[i, ], quantile, na.rm)
+  })
+  return(bias)
+}
 
-  if (anyNA(predicted)) {
-    quantile <- quantile[!is.na(predicted)]
-    predicted <- predicted[!is.na(predicted)]
-  }
-  if (anyNA(quantile)) {
-    quantile <- quantile[!is.na(quantile)]
-    predicted <- predicted[!is.na(quantile)]
-  }
+#' Compute Bias for a Single Vector of Quantile Predictions
+#' @description Internal function to compute bias for a single observed value,
+#' a vector of predicted values and a vector of quantiles.
+#' @param observed scalar with the observed value
+#' @param predicted vector of length N corresponding to the number of quantiles
+#' that holds predictions
+#' @param quantile vector of corresponding size N with the quantile levels for
+#' which predictions were made. If this does not contain the median (0.5) then
+#' the median is imputed as being the mean of the two innermost quantiles.
+#' @inheritParams bias_quantile
+#' @return scalar with the quantile bias for a single quantile prediction
+bias_quantile_single_vector <- function(observed, predicted, quantile, na.rm) {
 
-  # if there is no input, return NA
-  if (length(quantile) == 0 || length(predicted) == 0) {
-    return(NA_real_)
-  }
+  assert_number(observed)
+  # other checks should have happened before
 
-  check_quantiles(quantile)
+  predicted_has_NAs <- anyNA(predicted)
+  quantile_has_NAs <- anyNA(quantile)
+
+  if (any(predicted_has_NAs, quantile_has_NAs)) {
+    if (!na.rm) {
+      return(NA_real_)
+    } else {
+      quantile <- quantile[!is.na(predicted)]
+      predicted <- predicted[!is.na(predicted)]
+      predicted <- predicted[!is.na(quantile)]
+      quantile <- quantile[!is.na(quantile)]
+    }
+  }
 
+  order <- order(quantile)
+  predicted <- predicted[order]
+  quantile <- quantile[order]
   if (!all(diff(predicted) >= 0)) {
-    stop("predictions must be increasing with quantiles")
+    stop("Predictions must not be decreasing with increasing quantile level")
   }
 
   if (0.5 %in% quantile) {
    median_prediction <- predicted[quantile == 0.5]
  } else {
    # if median is not available, compute as mean of two innermost quantiles
    message(
      "Median not available, computing as mean of two innermost quantiles",
      " in order to compute bias."
    )
    median_prediction <-
@@ -242,3 +466,209 @@ bias_quantile <- function(observed, predicted, quantile) {
   }
   return(bias)
 }
+
+
+#' @title Absolute Error of the Median (Quantile-based Version)
+#' @description
+#' Compute the absolute error of the median calculated as
+#' \deqn{
+#'   \textrm{abs}(\textrm{observed} - \textrm{median prediction})
+#' }{
+#'   abs(observed - median_prediction)
+#' }
+#' The median prediction is the predicted value for which quantile == 0.5;
+#' the function therefore requires 0.5 to be among the quantile levels in
+#' `quantile`.
+#' @inheritParams wis
+#' @return numeric vector of length n with the absolute error of the median
+#' @seealso [ae_median_sample()], [abs_error()]
+#' @importFrom stats median
+#' @examples
+#' observed <- rnorm(30, mean = 1:30)
+#' predicted_values <- matrix(rnorm(30, mean = 1:30))
+#' ae_median_quantile(observed, predicted_values, quantile = 0.5)
+#' @export
+#' @keywords metric
+ae_median_quantile <- function(observed, predicted, quantile) {
+  assert_input_quantile(observed, predicted, quantile)
+  if (!any(quantile == 0.5)) {
+    warning(
+      "in order to compute the absolute error of the median, `0.5` must be ",
+      "among the quantiles given. Returning `NA`."
+    )
+    return(NA_real_)
+  }
+  if (is.null(dim(predicted))) {
+    predicted <- matrix(predicted, nrow = 1)
+  }
+  predicted <- predicted[, quantile == 0.5]
+  abs_error_median <- abs(observed - predicted)
+  return(abs_error_median)
+}
+
+
+################################################################################
+# Metrics with a one-to-one relationship between input and score
+################################################################################
+
+
+#' @title Quantile Score
+#'
+#' @description
+#' Proper Scoring Rule to score quantile predictions. Smaller values are better.
+#' The quantile score is
+#' closely related to the Interval score (see [interval_score()]) and is
+#' the quantile equivalent that works with single quantiles instead of
+#' central prediction intervals.
+#'
+#' @param quantile vector of size n with the quantile levels of the
+#' corresponding predictions.
+#' @param weigh if TRUE, weigh the score by alpha / 2, so it can be averaged
+#' into an interval score that, in the limit, corresponds to CRPS. Alpha is the
+#' value that corresponds to the (alpha/2) or (1 - alpha/2) quantiles provided
+#' and will be computed from the quantile. Alpha is the decimal value that
+#' represents how much is outside a central prediction interval (e.g. for a
+#' 90 percent central prediction interval, alpha is 0.1). Default: `TRUE`.
+#' @return vector with the scoring values
+#' @inheritParams interval_score
+#' @inheritParams ae_median_sample
+#' @examples
+#' observed <- rnorm(10, mean = 1:10)
+#' alpha <- 0.5
+#'
+#' lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10))
+#' upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 1:10))
+#'
+#' qs_lower <- quantile_score(observed,
+#'   predicted = lower,
+#'   quantile = alpha / 2
+#' )
+#' qs_upper <- quantile_score(observed,
+#'   predicted = upper,
+#'   quantile = 1 - alpha / 2
+#' )
+#' interval_score <- (qs_lower + qs_upper) / 2
+#' @export
+#' @keywords metric
+#' @references Strictly Proper Scoring Rules, Prediction, and Estimation,
+#' Tilmann Gneiting and Adrian E. Raftery, 2007, Journal of the American
+#' Statistical Association, Volume 102, 2007 - Issue 477
+#'
+#' Evaluating epidemic forecasts in an interval format,
+#' Johannes Bracher, Evan L. Ray, Tilmann Gneiting and Nicholas G. Reich,
+#'
+quantile_score <- function(observed,
+                           predicted,
+                           quantile,
+                           weigh = TRUE) {
+
+  # compute score - this is the version explained in the SI of Bracher et al.
+  error <- abs(predicted - observed)
+  score <- 2 * ifelse(
+    observed <= predicted, 1 - quantile, quantile
+  ) * error
+
+  # adapt score such that mean of unweighted quantile scores corresponds to
+  # unweighted interval score of the corresponding prediction interval
+  # --> needs central prediction interval which corresponds to given quantiles
+  central_interval <- abs(0.5 - quantile) * 2
+  alpha <- 1 - central_interval
+  score <- 2 * score / alpha
+
+  # if weigh, then reverse last operation
+  if (weigh) {
+    score <- score * alpha / 2
+  }
+
+  return(score)
+}
+
+
+# Weighted Interval Score, But With One-to-One Relationship
+wis_one_to_one <- function(observed,
+                           predicted,
+                           quantile,
+                           separate_results = FALSE,
+                           output = c("matrix", "data.frame", "vector"),
+                           weigh = TRUE) {
+
+  # input checks
+  output <- match.arg(output)
+  assert_input_quantile(observed, predicted, quantile)
+
+  # store original data
+  n <- length(observed)
+  N <- length(quantile)
+  original_data <- data.table(
+    forecast_id = rep(1:n, each = N),
+    observed = rep(observed, each = N),
+    predicted = as.vector(t(predicted)),
+    quantile = quantile
+  )
+
+  # define output columns
+  if (separate_results) {
+    cols <- c("wis", "dispersion", "underprediction", "overprediction")
+  } else {
+    cols <- "wis"
+  }
+
+  # reformat input to interval format and calculate interval score
+  reformatted <- quantile_to_interval(observed, predicted, quantile)
+  reformatted[, eval(cols) := do.call(
+    interval_score,
+    list(observed = observed,
+         lower = lower,
+         upper = upper,
+         interval_range = range,
+         weigh = weigh,
+         separate_results = separate_results
+    )
+  )]
+
+  # melt data to long format, calculate quantiles, and merge back to original
+  long <- melt(reformatted,
+               measure.vars = c("lower", "upper"),
+               variable.name = "boundary",
+               value.name = "predicted",
+               id.vars = c("forecast_id", "observed", "range", cols))
+  # calculate quantiles
+  long[, quantile := (100 - range) / 200] # lower quantiles
+  long[boundary == "upper", quantile := 1 - quantile] # upper quantiles
+  # remove boundary, range, take unique value to get rid of duplicated median
+  long[, c("boundary", "range") := NULL]
+  long <- unique(long) # should maybe check for count_median_twice?
+ out <- merge( + original_data, long, all.x = TRUE, + by = c("forecast_id", "observed", "predicted", "quantile") + )[, forecast_id := NULL] + + # handle returns depending on the output format + if (output == "data.frame") { + return(out) + } + + wis <- out$wis + if (separate_results) { + components <- list( + underprediction = out$underprediction, + overprediction = out$overprediction, + dispersion = out$dispersion + ) + } + + if (output == "vector" && separate_results) { + return(c(wis = wis, components)) + } else if (output == "vector") { + return(wis) + } + + if (output == "matrix") { + wis <- matrix(wis, nrow = n, ncol = N) + if (separate_results) { + components <- lapply(components, function(x) matrix(x, nrow = n, ncol = N)) + return(c(wis, components)) + } else { + return(wis) + } + } +} diff --git a/R/metrics-range.R b/R/metrics-range.R index fe8f54cab..5263d8962 100644 --- a/R/metrics-range.R +++ b/R/metrics-range.R @@ -1,3 +1,7 @@ +################################################################################ +# Metrics with a one-to-one relationship between input and score +################################################################################ + #' @title Interval Score #' #' @description @@ -59,7 +63,7 @@ #' interval_range <- rep(90, 30) #' alpha <- (100 - interval_range) / 100 #' lower <- qnorm(alpha / 2, rnorm(30, mean = 1:30)) -#' upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 1:30)) +#' upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 11:40)) #' #' interval_score( #' observed = observed, @@ -69,7 +73,7 @@ #' ) #' #' # gives a warning, as the interval_range should likely be 50 instead of 0.5 -#' interval_score(observed = 4, upper = 2, lower = 8, interval_range = 0.5) +#' interval_score(observed = 4, upper = 8, lower = 2, interval_range = 0.5) #' #' # example with missing values and separate results #' interval_score( @@ -97,31 +101,7 @@ interval_score <- function(observed, weigh = TRUE, separate_results = FALSE) { - # error handling - not sure how I can make this better - present <- c( - methods::hasArg("observed"), methods::hasArg("lower"), - methods::hasArg("upper"), methods::hasArg("interval_range") - ) - if (!all(present)) { - stop( - "need all arguments 'observed', 'lower', 'upper' and 'interval_range' in function 'interval_score()'" # nolint - ) - } - assert_not_null( - observed = observed, lower = lower, upper = upper, - interval_range = interval_range - ) - assert_equal_length(observed, lower, interval_range, upper) - - if (any(interval_range < 0, na.rm = TRUE)) { - stop("interval ranges must be positive") - } - if (any(interval_range > 0 & interval_range < 1, na.rm = TRUE)) { - msg <- paste("Found interval ranges between 0 and 1. Are you sure that's right?", - "An interval range of 0.5 e.g. 
implies a (49.75%, 50.25%) prediction interval.",
-                 "If you want to score a (25%, 75%) prediction interval, set interval_range = 50.")
-    rlang::warn(message = msg, .frequency = "once", .frequency_id = "small_interval_range")
-  }
+  assert_input_interval(observed, lower, upper, interval_range)
 
   # calculate alpha from the interval range
   alpha <- (100 - interval_range) / 100
@@ -154,6 +134,9 @@ interval_score <- function(observed,
 }
 
+################################################################################
+# Metrics with a many-to-one relationship between input and score
+################################################################################
 
 #' @title Determines Bias of Quantile Forecasts based on the range of the
 #' prediction intervals
diff --git a/R/metrics-sample.R b/R/metrics-sample.R
index 67291bffa..803fb4c4e 100644
--- a/R/metrics-sample.R
+++ b/R/metrics-sample.R
@@ -96,18 +96,17 @@ bias_sample <- function(observed, predicted) {
 #' @importFrom stats median
 #' @examples
 #' observed <- rnorm(30, mean = 1:30)
-#' predicted_values <- rnorm(30, mean = 1:30)
+#' predicted_values <- matrix(rnorm(30, mean = 1:30))
 #' ae_median_sample(observed, predicted_values)
 #' @export
 #' @keywords metric
 ae_median_sample <- function(observed, predicted) {
+  assert_input_sample(observed, predicted)
   median_predictions <- apply(
-    as.matrix(predicted), MARGIN = 1, FUN = median # this is rowwise
+    as.matrix(predicted), MARGIN = 1, FUN = median # this is row-wise
   )
-
   ae_median <- abs(observed - median_predictions)
-
   return(ae_median)
 }
 
@@ -118,11 +117,11 @@ ae_median_sample <- function(observed, predicted) {
 #' Squared error of the mean calculated as
 #'
 #' \deqn{
-#'   \textrm{mean}(\textrm{observed} - \textrm{prediction})^2
+#'   (\textrm{observed} - \textrm{mean prediction})^2
 #' }{
-#'   mean(observed - mean_prediction)^2
+#'   (observed - mean prediction)^2
 #' }
-#'
+#' The mean prediction is calculated as the mean of the predictive samples.
 #' @param observed A vector with observed values of size n
 #' @param predicted nxN matrix of predictive samples, n (number of rows) being
 #' the number of data points and N (number of columns) the number of Monte
 #' Carlo samples.
 #' @seealso [squared_error()]
 #' @examples
 #' observed <- rnorm(30, mean = 1:30)
-#' predicted_values <- rnorm(30, mean = 1:30)
+#' predicted_values <- matrix(rnorm(30, mean = 1:30))
 #' se_mean_sample(observed, predicted_values)
 #' @export
 #' @keywords metric
 se_mean_sample <- function(observed, predicted) {
+  assert_input_sample(observed, predicted)
   mean_predictions <- rowMeans(as.matrix(predicted))
   se_mean <- (observed - mean_predictions)^2
 
@@ -283,3 +283,39 @@ mad_sample <- function(observed = NULL, predicted, ...) {
   sharpness <- apply(predicted, MARGIN = 1, mad, ...)
   return(sharpness)
 }
+
+
+#' @title Interval Coverage
+#' @description To compute coverage for sample-based forecasts,
+#' predictive samples are converted first into predictive quantiles using the
+#' sample quantiles.
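The conversion idea can be sketched directly with sample quantiles (an illustration only, with made-up values; the actual implementation below goes through `sample_to_quantile()` and `quantile_to_interval()`):

# coverage of a 50% interval, approximated from predictive samples
observed <- c(1, -15, 22)
predicted <- matrix(rnorm(300, mean = observed), nrow = 3)  # 3 forecasts, 100 samples each
lower <- apply(predicted, 1, quantile, probs = 0.25)
upper <- apply(predicted, 1, quantile, probs = 0.75)
observed >= lower & observed <= upper  # one TRUE/FALSE per forecast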
+#' @importFrom checkmate assert_number
+#' @rdname interval_coverage
+#' @export
+#' @examples
+#' observed <- c(1, -15, 22)
+#' predicted <- matrix(rnorm(300, mean = observed), nrow = 3)
+#' interval_coverage_sample(observed, predicted)
+interval_coverage_sample <- function(observed, predicted, range = 50) {
+  assert_input_sample(observed, predicted)
+  assert_number(range)
+  necessary_quantiles <- c((100 - range) / 2, 100 - (100 - range) / 2) / 100
+
+  # this could be its own function, sample_to_quantile.numeric
+  # ==========================================================
+  n <- length(observed)
+  N <- length(predicted) / n
+  dt <- data.table(
+    observed = rep(observed, each = N),
+    predicted = as.vector(t(predicted))
+  )
+  quantile_dt <- sample_to_quantile(dt, necessary_quantiles)
+  # ==========================================================
+
+  # this could call interval_coverage_quantile instead
+  # ==========================================================
+  interval_dt <- quantile_to_interval(quantile_dt, format = "wide")
+  interval_dt[, coverage := ifelse(
+    observed >= lower & observed <= upper, TRUE, FALSE
+  )]
+  # ==========================================================
+  return(interval_dt$coverage)
+}
diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R
index f23316254..eab561adb 100644
--- a/R/pairwise-comparisons.R
+++ b/R/pairwise-comparisons.R
@@ -28,7 +28,7 @@
 #'
 #' @param scores A data.table of scores as produced by [score()].
 #' @param metric A character vector of length one with the metric to do the
-#' comparison on. The default is "auto", meaning that either "interval_score",
+#' comparison on. The default is "auto", meaning that either "wis",
 #' "crps", or "brier_score" will be selected where available.
 #' @param by character vector with names of columns present in the input
 #' data.frame. `by` determines how pairwise comparisons will be computed.
@@ -366,8 +366,8 @@ compare_two_models <- function(scores,
 #' @keywords internal
 infer_rel_skill_metric <- function(scores) {
-  if ("interval_score" %in% colnames(scores)) {
-    rel_skill_metric <- "interval_score"
+  if ("wis" %in% colnames(scores)) {
+    rel_skill_metric <- "wis"
   } else if ("crps" %in% colnames(scores)) {
     rel_skill_metric <- "crps"
   } else if ("brier_score" %in% colnames(scores)) {
diff --git a/R/pit.R b/R/pit.R
index 2e00e4b90..ee5e09c7c 100644
--- a/R/pit.R
+++ b/R/pit.R
@@ -189,18 +189,10 @@ pit <- function(data,
   data <- remove_na_observed_predicted(data)
   forecast_type <- get_forecast_type(data)
 
-  # if prediction type is quantile, simply extract coverage values from
-  # score and returned a list with named vectors
   if (forecast_type == "quantile") {
-    coverage <-
-      score(data, metrics = "quantile_coverage")
-
-    coverage <- summarise_scores(coverage,
-      by = unique(c(by, "quantile"))
-    )
-    # remove all existing attributes and class
-    coverage <- remove_scoringutils_class(coverage)
-
+    data[, quantile_coverage := (observed <= predicted)]
+    coverage <- data[, .(quantile_coverage = mean(quantile_coverage)),
+                     by = c(unique(c(by, "quantile")))]
     coverage <- coverage[order(quantile), .(
       quantile = c(quantile, 1),
@@ -208,7 +200,6 @@
     ),
     by = c(get_forecast_unit(coverage))
     ]
-
     return(coverage[])
   }
diff --git a/R/plot.R b/R/plot.R
index b25b81095..0ec4bb8c1 100644
--- a/R/plot.R
+++ b/R/plot.R
@@ -221,7 +221,7 @@ plot_wis <- function(scores,
 #' produced by [score()] or [summarise_scores()]. Note that "range" must be included
 #' in the `by` argument when running [summarise_scores()]
 #' @param y The variable from the scores you want to show on the y-Axis.
-#' This could be something like "interval_score" (the default) or "dispersion" +#' This could be something like "wis" (the default) or "dispersion" #' @param x The variable from the scores you want to show on the x-Axis. #' Usually this will be "model" #' @param colour Character vector of length one used to determine a variable @@ -233,18 +233,18 @@ plot_wis <- function(scores, #' @export #' @examples #' library(ggplot2) -#' scores <- score(example_quantile) -#' scores <- summarise_scores(scores, by = c("model", "target_type", "range")) +#' # scores <- score(example_quantile) +#' # scores <- summarise_scores(scores, by = c("model", "target_type", "range")) #' -#' plot_ranges(scores, x = "model") + -#' facet_wrap(~target_type, scales = "free") +#' # plot_ranges(scores, x = "model") + +#' # facet_wrap(~target_type, scales = "free") #' #' # visualise dispersion instead of interval score -#' plot_ranges(scores, y = "dispersion", x = "model") + -#' facet_wrap(~target_type) +#' # plot_ranges(scores, y = "dispersion", x = "model") + +#' # facet_wrap(~target_type) plot_ranges <- function(scores, - y = "interval_score", + y = "wis", x = "model", colour = "range") { plot <- ggplot( @@ -296,7 +296,7 @@ plot_ranges <- function(scores, #' @export #' @examples #' scores <- score(example_quantile) -#' scores <- summarise_scores(scores, by = c("model", "target_type", "range")) +#' scores <- summarise_scores(scores, by = c("model", "target_type")) #' #' plot_heatmap(scores, x = "target_type", metric = "bias") @@ -582,10 +582,10 @@ make_na <- make_NA #' @importFrom data.table dcast #' @export #' @examples -#' data.table::setDTthreads(1) # only needed to avoid issues on CRAN -#' scores <- score(example_quantile) -#' scores <- summarise_scores(scores, by = c("model", "range")) -#' plot_interval_coverage(scores) +#' # data.table::setDTthreads(1) # only needed to avoid issues on CRAN +#' # scores <- score(example_quantile) +#' # scores <- summarise_scores(scores, by = c("model", "range")) +#' # plot_interval_coverage(scores) plot_interval_coverage <- function(scores, colour = "model") { @@ -613,7 +613,7 @@ plot_interval_coverage <- function(scores, colour = "grey", linetype = "dashed" ) + - geom_line(aes(y = coverage * 100)) + + geom_line(aes(y = interval_coverage * 100)) + theme_scoringutils() + ylab("% Obs inside interval") + xlab("Nominal interval coverage") + @@ -638,9 +638,9 @@ plot_interval_coverage <- function(scores, #' @importFrom data.table dcast #' @export #' @examples -#' scores <- score(example_quantile) -#' scores <- summarise_scores(scores, by = c("model", "quantile")) -#' plot_quantile_coverage(scores) +#' # scores <- score(example_quantile) +#' # scores <- summarise_scores(scores, by = c("model", "quantile")) +#' # plot_quantile_coverage(scores) plot_quantile_coverage <- function(scores, colour = "model") { diff --git a/R/score.R b/R/score.R index 4e235ade3..29230e6e0 100644 --- a/R/score.R +++ b/R/score.R @@ -152,18 +152,10 @@ score.scoringutils_binary <- function(data, metrics = metrics_binary, ...) { data <- remove_na_observed_predicted(data) metrics <- validate_metrics(metrics) - # Extract the arguments passed in ... - args <- list(...) - lapply(seq_along(metrics), function(i, ...) { - metric_name <- names(metrics[i]) - fun <- metrics[[i]] - matching_args <- filter_function_args(fun, args) - - data[, (metric_name) := do.call( - fun, c(list(observed, predicted), matching_args) - )] - return() - }, ...) + data <- apply_metrics( + data, metrics, + data$observed, data$predicted, ... 
+ ) setattr(data, "metric_names", names(metrics)) @@ -180,18 +172,10 @@ score.scoringutils_point <- function(data, metrics = metrics_point, ...) { data <- remove_na_observed_predicted(data) metrics <- validate_metrics(metrics) - # Extract the arguments passed in ... - args <- list(...) - lapply(seq_along(metrics), function(i, ...) { - metric_name <- names(metrics[i]) - fun <- metrics[[i]] - matching_args <- filter_function_args(fun, args) - - data[, (metric_name) := do.call( - fun, c(list(observed, predicted), matching_args) - )] - return() - }, ...) + data <- apply_metrics( + data, metrics, + data$observed, data$predicted, ... + ) setattr(data, "metric_names", names(metrics)) @@ -206,54 +190,87 @@ score.scoringutils_sample <- function(data, metrics = metrics_sample, ...) { forecast_unit <- attr(data, "forecast_unit") metrics <- validate_metrics(metrics) - # Extract the arguments passed in ... - args <- list(...) - lapply(seq_along(metrics), function(i, ...) { - metric_name <- names(metrics[i]) - fun <- metrics[[i]] - matching_args <- filter_function_args(fun, args) + # transpose the forecasts that belong to the same forecast unit + d_transposed <- data[, .(predicted = list(predicted), + observed = unique(observed), + scoringutils_N = length(sample_id)), + by = forecast_unit] - data[, (metric_name) := do.call( - fun, c(list(unique(observed), t(predicted)), matching_args) - ), by = forecast_unit] - return() - }, - ...) + # split according to number of samples and do calculations for different + # sample lengths separately + d_split <- split(d_transposed, d_transposed$scoringutils_N) - data <- data[ - , lapply(.SD, unique), - .SDcols = colnames(data) %like% paste(names(metrics), collapse = "|"), - by = forecast_unit - ] + split_result <- lapply(d_split, function(data) { + # create a matrix + observed <- data$observed + predicted <- do.call(rbind, data$predicted) + data[, c("observed", "predicted", "scoringutils_N") := NULL] + data <- apply_metrics( + data, metrics, + observed, predicted, ... + ) + return(data) + }) + data <- rbindlist(split_result) setattr(data, "metric_names", names(metrics)) return(data[]) } +#' @importFrom data.table `:=` as.data.table rbindlist %like% #' @rdname score #' @export -score.scoringutils_quantile <- function(data, metrics = NULL, ...) { +score.scoringutils_quantile <- function(data, metrics = metrics_quantile, ...) { data <- validate(data) data <- remove_na_observed_predicted(data) forecast_unit <- attr(data, "forecast_unit") + metrics <- validate_metrics(metrics) - if (is.null(metrics)) { - metrics <- available_metrics() - } - metrics <- metrics[metrics %in% available_metrics()] - scores <- score_quantile( - data = data, - forecast_unit = forecast_unit, - metrics = metrics, - ... - ) + # transpose the forecasts that belong to the same forecast unit + # make sure the quantiles and predictions are ordered in the same way + d_transposed <- data[, .(predicted = list(predicted[order(quantile)]), + observed = unique(observed), + quantile = list(quantile[order(quantile)]), + scoringutils_quantile = toString(quantile[order(quantile)])), + by = forecast_unit] + + # split according to quantile lengths and do calculations for different + # quantile lengths separately. The function `wis()` assumes that all + # forecasts have the same quantiles + d_split <- split(d_transposed, d_transposed$scoringutils_quantile) - setattr(scores, "metric_names", metrics[metrics %in% colnames(scores)]) - # manual hack to make sure that the correct attributes are there. 
- setattr(scores, "forecast_unit", forecast_unit) - setattr(scores, "forecast_type", "quantile") - scores <- new_scoringutils(scores, "scoringutils_quantile") + split_result <- lapply(d_split, function(data) { + # create a matrix out of the list of predicted values and quantiles + observed <- data$observed + predicted <- do.call(rbind, data$predicted) + quantile <- unlist(unique(data$quantile)) + data[, c("observed", "predicted", "quantile", "scoringutils_quantile") := NULL] - return(scores[]) + data <- apply_metrics( + data, metrics, + observed, predicted, quantile, ... + ) + return(data) + }) + + data <- rbindlist(split_result) + setattr(data, "metric_names", names(metrics)) + + return(data[]) +} + +apply_metrics <- function(data, metrics, ...) { + expr <- expression( + data[, (metric_name) := do.call(run_safely, list(..., fun = fun))] + ) + lapply(seq_along(metrics), function(i, data, ...) { + metric_name <- names(metrics[i]) + fun <- metrics[[i]] + eval(expr) + }, data, ...) + return(data) } + + + diff --git a/R/score_quantile.R b/R/score_quantile.R deleted file mode 100644 index f2da97f04..000000000 --- a/R/score_quantile.R +++ /dev/null @@ -1,167 +0,0 @@ -#' @title Evaluate forecasts in a Quantile-Based Format -#' -#' @inheritParams score -#' @inheritParams interval_score -#' @param count_median_twice logical that controls whether or not to count the -#' median twice when summarising (default is \code{FALSE}). Counting the -#' median twice would conceptually treat it as a 0\% prediction interval, where -#' the median is the lower as well as the upper bound. The alternative is to -#' treat the median as a single quantile forecast instead of an interval. The -#' interval score would then be better understood as an average of quantile -#' scores. -#' @param forecast_unit A character vector with the column names that define -#' the unit of a single forecast, i.e. a forecast was made for a combination -#' of the values in `forecast_unit` -#' -#' @return A data.table with appropriate scores. For more information see -#' [score()] -#' -#' @importFrom data.table ':=' as.data.table rbindlist %like% -#' -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} -#' @inherit score references -#' @keywords internal - -score_quantile <- function(data, - forecast_unit, - metrics, - weigh = TRUE, - count_median_twice = FALSE, - separate_results = TRUE) { - - data <- remove_na_observed_predicted(data) - - # make sure to have both quantile as well as range format -------------------- - range_data <- quantile_to_interval( - data, - keep_quantile_col = FALSE - ) - # adds the range column to the quantile data set - quantile_data <- range_long_to_quantile( - range_data, - keep_range_col = TRUE - ) - - # to deal with point forecasts in a quantile format. This in effect adds - # a third column next to lower and upper after pivoting - range_data[is.na(range), boundary := "point"] - - range_data <- data.table::dcast(range_data, ... ~ boundary, - value.var = "predicted" - ) - - # if we only score point forecasts, it may be true that there are no columns - # upper and lower in the data.frame. 
If so, these need to be added - if (!all(c("upper", "lower") %in% colnames(range_data))) { - range_data[, c("upper", "lower") := NA] - } - - # set up results data.table that will then be modified throughout ------------ - res <- data.table::copy(range_data) - - # calculate scores on range format ------------------------------------------- - if ("interval_score" %in% metrics) { - # compute separate results if desired - if (separate_results) { - outcols <- c( - "interval_score", "dispersion", - "underprediction", "overprediction" - ) - } else { - outcols <- "interval_score" - } - res <- res[, eval(outcols) := do.call( - scoringutils::interval_score, - list(observed, lower, - upper, range, - weigh, - separate_results = separate_results - ) - )] - } - - # compute coverage for every single observation - if ("coverage" %in% metrics) { - res[, coverage := ifelse(observed <= upper & observed >= lower, 1, 0)] # nolint - res[, coverage_deviation := coverage - range / 100] - } - - # compute bias - if ("bias" %in% metrics) { - res[, bias := bias_range( - range = range, lower = lower, upper = upper, - observed = unique(observed) - ), - by = forecast_unit - ] - } - - # compute absolute and squared error for point forecasts - # these are marked by an NA in range, and a numeric value for point - compute_point <- any( - c("se_point, se_mean, ae_point", "ae_median", "absolute_error") %in% metrics - ) - if (compute_point && "point" %in% colnames(res)) { - res[ - is.na(range) & is.numeric(point), - `:=`(ae_point = abs_error(predicted = point, observed), - se_point = squared_error(predicted = point, observed)) - ] - } - - # calculate scores on quantile format ---------------------------------------- - # compute absolute error of the median - if ("ae_median" %in% metrics) { - quantile_data[, ae_median := ae_median_quantile( - observed, - predicted, - quantile - ), - by = forecast_unit - ] - } - - # compute quantile coverage based on quantile version - if ("quantile_coverage" %in% metrics) { - quantile_data[, quantile_coverage := (observed <= predicted)] - } - - # merge metrics computed on quantile data (i.e. ae_median, quantile_coverage) back - # into metrics computed on range data. One important side effect of this is - # that it controls whether we count the median twice for the interval score - # (row is then duplicated) or only once. 
However, merge only needs to happen - # if we computed either the interval score or the ae_median or quantile coverage - if (any(c("ae_median", "interval_score", "quantile_coverage") %in% metrics)) { - # delete unnecessary columns before merging back - keep_cols <- unique(c( - forecast_unit, "quantile", "ae_median", "quantile_coverage", - "boundary", "range" - )) - delete_cols <- names(quantile_data)[!(names(quantile_data) %in% keep_cols)] - quantile_data[, eval(delete_cols) := NULL] - - # duplicate median column before merging if median is to be counted twice - # if this is false, then the res will have one entry for every quantile, - # which translates to two rows for every interval, but only one for the median - if (count_median_twice) { - median <- quantile_data[quantile == 0.5, ][, boundary := "upper"] - quantile_data <- data.table::rbindlist(list(quantile_data, median)) - } - - # merge back with other metrics - merge_cols <- setdiff(keep_cols, c( - "ae_median", "quantile_coverage", "quantile", - "boundary" - )) - # specify all.x = TRUE as the point forecasts got deleted when - # going from range to quantile above - res <- merge(res, quantile_data, by = merge_cols, all.x = TRUE) - } - - # delete internal columns before returning result - res <- delete_columns( - res, c("upper", "lower", "boundary", "point", "observed") - ) - - return(res[]) -} diff --git a/R/summarise_scores.R b/R/summarise_scores.R index 03a8883fb..f71a0f7cc 100644 --- a/R/summarise_scores.R +++ b/R/summarise_scores.R @@ -309,78 +309,3 @@ check_summary_params <- function(scores, } return(relative_skill) } - - - -#' @title Add coverage of central prediction intervals -#' -#' @description Adds a column with the coverage of central prediction intervals -#' to unsummarised scores as produced by [score()] -#' -#' The coverage values that are added are computed according to the values -#' specified in `by`. If, for example, `by = "model"`, then there will be one -#' coverage value for every model and [add_coverage()] will compute the coverage -#' for every model across the values present in all other columns which define -#' the unit of a single forecast. -#' -#' @inheritParams summarise_scores -#' @param by character vector with column names to add the coverage for. -#' @param ranges numeric vector of the ranges of the central prediction intervals -#' for which coverage values shall be added. -#' @return a data.table with unsummarised scores with columns added for the -#' coverage of the central prediction intervals. While the overall data.table -#' is still unsummarised, note that for the coverage columns some level of -#' summary is present according to the value specified in `by`. -#' @examples -#' library(magrittr) # pipe operator -#' score(example_quantile) %>% -#' add_coverage(by = c("model", "target_type")) %>% -#' summarise_scores(by = c("model", "target_type")) %>% -#' summarise_scores(fun = signif, digits = 2) -#' @export -#' @keywords scoring - -add_coverage <- function(scores, - by = NULL, - ranges = c(50, 90)) { - - stored_attributes <- get_scoringutils_attributes(scores) - if (!is.null(attr(scores, "unsummarised_scores"))) { - scores <- attr(scores, "unsummarised_scores") - } - - if (is.null(by) && !is.null(stored_attributes[["scoringutils_by"]])) { - by <- stored_attributes[["scoringutils_by"]] - } else if (is.null(by)) { - # Need to check this again. 
- # (mentioned in https://github.com/epiforecasts/scoringutils/issues/346) - by <- get_forecast_unit(scores) - } - - summarised_scores <- summarise_scores( - scores, - by = c(by, "range") - )[range %in% ranges] - - - # create cast formula - cast_formula <- - paste( - paste(by, collapse = "+"), - "~", - "paste0('coverage_', range)" - ) - - coverages <- dcast( - summarised_scores, - value.var = "coverage", - formula = cast_formula - ) - - scores_with_coverage <- merge(scores, coverages, by = by) - scores_with_coverage <- assign_attributes( - scores_with_coverage, stored_attributes - ) - - return(scores_with_coverage[]) -} diff --git a/R/utils.R b/R/utils.R index c0c0a34e1..53a2d800e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -5,7 +5,8 @@ #' @keywords info available_metrics <- function() { - return(unique(scoringutils::metrics$Name)) + return(unique(c(scoringutils::metrics$Name, + "wis", "coverage_50", "coverage_90"))) } @@ -200,3 +201,72 @@ remove_scoringutils_class <- function(object) { } return(object) } + +#' @title Run a function safely +#' @description This is a wrapper function designed to run a function safely +#' when it is not completely clear what arguments could be passed to the +#' function. +#' +#' All named arguments in `...` that are not accepted by `fun` are removed. +#' All unnamed arguments are passed on to the function. In case `fun` errors, +#' the error will be converted to a warning and `run_safely` returns `NULL`. +#' +#' `run_safely` can be useful when constructing functions to be used as +#' metrics in [score()]. +#' +#' @param ... Arguments to pass to `fun` +#' @param fun A function to execute +#' @return The result of `fun` or `NULL` if `fun` errors +#' @export +#' @examples +#' f <- function(x) {x} +#' run_safely(2, fun = f) +#' run_safely(2, y = 3, fun = f) +#' run_safely(fun = f) +#' run_safely(y = 3, fun = f) +run_safely <- function(..., fun) { + args <- list(...) + # Check if the function accepts ... as an argument + if ("..." %in% names(formals(fun))) { + valid_args <- args + } else if (is.null(names(args))) { + # if no arguments are named, just pass all arguments on + valid_args <- args + } else { + # Identify the arguments that fun() accepts + possible_args <- names(formals(fun)) + # keep valid arguments as well as unnamed arguments + valid_args <- args[names(args) == "" | names(args) %in% possible_args] + } + + result <- try(do.call(fun, valid_args), silent = TRUE) + + if (inherits(result, "try-error")) { + msg <- conditionMessage(attr(result, "condition")) + warning( + "Function execution failed, returning NULL. Error: ", + msg + ) + return(NULL) + } + return(result) } + + +#' Ensure That an Object is a Data Table +#' @description This function ensures that an object is a data table. +#' If the object is not a data table, it is converted to one. If the object +#' is a data table, a copy of the object is returned. 
+#' @param data An object to ensure is a data table +#' @return A data table +#' @keywords internal +#' @importFrom data.table copy is.data.table as.data.table +ensure_data.table <- function(data) { + if (!is.data.table(data)) { + data <- as.data.table(data) + } else { + data <- copy(data) + } + return(data) +} + diff --git a/R/utils_data_handling.R b/R/utils_data_handling.R index d0677ca34..1b1302dbf 100644 --- a/R/utils_data_handling.R +++ b/R/utils_data_handling.R @@ -101,8 +101,7 @@ merge_pred_and_obs <- function(forecasts, observations, sample_to_quantile <- function(data, quantiles = c(0.05, 0.25, 0.5, 0.75, 0.95), type = 7) { - data <- data.table::as.data.table(data) - + data <- ensure_data.table(data) reserved_columns <- c("predicted", "sample_id") by <- setdiff(colnames(data), reserved_columns) @@ -204,19 +203,10 @@ quantile_to_interval.data.frame <- function(dt, format = "long", keep_quantile_col = FALSE, ...) { - if (!is.data.table(dt)) { - dt <- data.table::as.data.table(dt) - } else { - # use copy to avoid - dt <- copy(dt) - } + dt <- ensure_data.table(dt) dt[, boundary := ifelse(quantile <= 0.5, "lower", "upper")] - dt[, range := ifelse( - boundary == "lower", - round((1 - 2 * quantile) * 100, 10), - round((2 * quantile - 1) * 100, 10) - )] + dt[, range := get_range_from_quantile(quantile)] # add median quantile median <- dt[quantile == 0.5, ] @@ -230,6 +220,10 @@ quantile_to_interval.data.frame <- function(dt, if (format == "wide") { delete_columns(dt, "quantile") dt <- dcast(dt, ... ~ boundary, value.var = "predicted") + # if there are NA values in `predicted`, this introduces a column "NA" + if ("NA" %in% colnames(dt) && all(is.na(dt[["NA"]]))) { + dt[, "NA" := NULL] + } } return(dt[]) } @@ -307,3 +301,25 @@ sample_to_range_long <- function(data, return(data[]) } + +#' Get Range Belonging to a Quantile +#' @description Every quantile can be thought of either as the lower or the +#' upper bound of a symmetric central prediction interval. This helper function +#' returns the range of the central prediction interval to which the quantile +#' belongs. +#' +#' Due to numeric instability that sometimes occurred in the past, ranges are +#' rounded to 10 decimal places. This is not a problem for the vast majority of +#' use cases, but it is something to be aware of. 
+#' @param quantile a numeric vector of quantile levels of size N +#' @return a numeric vector of interval ranges of size N +#' @keywords internal +get_range_from_quantile <- function(quantile) { + boundary <- ifelse(quantile <= 0.5, "lower", "upper") + range <- ifelse( + boundary == "lower", + round((1 - 2 * quantile) * 100, digits = 10), + round((2 * quantile - 1) * 100, digits = 10) + ) + return(range) +} diff --git a/R/z_globalVariables.R b/R/z_globalVariables.R index 3f5591611..89cfc2eab 100644 --- a/R/z_globalVariables.R +++ b/R/z_globalVariables.R @@ -30,9 +30,12 @@ globalVariables(c( "identifCol", "Interval_Score", "interval_range", + "interval_coverage", + "interval_coverage_deviation", "overprediction", "underprediction", "quantile_coverage", + "quantile_coverage_deviation", "LogS", "log_score", "lower", @@ -59,6 +62,7 @@ globalVariables(c( "rel_to_baseline", "relative_skill", "rn", + "sample_id", "scoringutils_InternalDuplicateCheck", "scoringutils_InternalNumCheck", "se_mean", @@ -71,6 +75,8 @@ globalVariables(c( "value_scaled", "var_of_interest", "variable", + "weight", + "wis", "wis_component_name", "x", "y", diff --git a/README.Rmd b/README.Rmd index 77584d145..0c4c41223 100644 --- a/README.Rmd +++ b/README.Rmd @@ -91,8 +91,8 @@ Forecasts can be easily and quickly scored using the `score()` function. `score( example_quantile %>% set_forecast_unit(c("location", "target_end_date", "target_type", "horizon", "model")) %>% validate() %>% + add_coverage() %>% score() %>% - add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>% summarise_scores( by = c("model", "target_type") ) %>% diff --git a/README.md b/README.md index 2b0e1d1f9..0a5f93881 100644 --- a/README.md +++ b/README.md @@ -129,8 +129,8 @@ details. Finally we summarise these scores by model and target type. 
example_quantile %>% set_forecast_unit(c("location", "target_end_date", "target_type", "horizon", "model")) %>% validate() %>% + add_coverage() %>% score() %>% - add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>% summarise_scores( by = c("model", "target_type") ) %>% @@ -144,15 +144,15 @@ example_quantile %>% kable() ``` -| model | target_type | interval_score | dispersion | underprediction | overprediction | coverage_deviation | bias | ae_median | coverage_50 | coverage_90 | relative_skill | scaled_rel_skill | -|:----------------------|:------------|---------------:|-----------:|----------------:|---------------:|-------------------:|--------:|----------:|------------:|------------:|---------------:|-----------------:| -| EuroCOVIDhub-baseline | Cases | 28000 | 4100 | 10000.0 | 14000.0 | -0.110 | 0.0980 | 38000 | 0.33 | 0.82 | 1.30 | 1.6 | -| EuroCOVIDhub-baseline | Deaths | 160 | 91 | 2.1 | 66.0 | 0.120 | 0.3400 | 230 | 0.66 | 1.00 | 2.30 | 3.8 | -| EuroCOVIDhub-ensemble | Cases | 18000 | 3700 | 4200.0 | 10000.0 | -0.098 | -0.0560 | 24000 | 0.39 | 0.80 | 0.82 | 1.0 | -| EuroCOVIDhub-ensemble | Deaths | 41 | 30 | 4.1 | 7.1 | 0.200 | 0.0730 | 53 | 0.88 | 1.00 | 0.60 | 1.0 | -| UMass-MechBayes | Deaths | 53 | 27 | 17.0 | 9.0 | -0.023 | -0.0220 | 78 | 0.46 | 0.88 | 0.75 | 1.3 | -| epiforecasts-EpiNow2 | Cases | 21000 | 5700 | 3300.0 | 12000.0 | -0.067 | -0.0790 | 28000 | 0.47 | 0.79 | 0.95 | 1.2 | -| epiforecasts-EpiNow2 | Deaths | 67 | 32 | 16.0 | 19.0 | -0.043 | -0.0051 | 100 | 0.42 | 0.91 | 0.98 | 1.6 | +| model | target_type | wis | overprediction | underprediction | dispersion | bias | coverage_50 | coverage_90 | coverage_deviation | ae_median | relative_skill | scaled_rel_skill | +|:----------------------|:------------|------:|---------------:|----------------:|-----------:|--------:|------------:|------------:|-------------------:|----------:|---------------:|-----------------:| +| EuroCOVIDhub-baseline | Cases | 28000 | 14000.0 | 10000.0 | 4100 | 0.0980 | 0.33 | 0.82 | -0.120 | 38000 | 1.30 | 1.6 | +| EuroCOVIDhub-baseline | Deaths | 160 | 66.0 | 2.1 | 91 | 0.3400 | 0.66 | 1.00 | 0.120 | 230 | 2.30 | 3.8 | +| EuroCOVIDhub-ensemble | Cases | 18000 | 10000.0 | 4200.0 | 3700 | -0.0560 | 0.39 | 0.80 | -0.100 | 24000 | 0.82 | 1.0 | +| EuroCOVIDhub-ensemble | Deaths | 41 | 7.1 | 4.1 | 30 | 0.0730 | 0.88 | 1.00 | 0.200 | 53 | 0.60 | 1.0 | +| UMass-MechBayes | Deaths | 53 | 9.0 | 17.0 | 27 | -0.0220 | 0.46 | 0.88 | -0.025 | 78 | 0.75 | 1.3 | +| epiforecasts-EpiNow2 | Cases | 21000 | 12000.0 | 3300.0 | 5700 | -0.0790 | 0.47 | 0.79 | -0.070 | 28000 | 0.95 | 1.2 | +| epiforecasts-EpiNow2 | Deaths | 67 | 19.0 | 16.0 | 32 | -0.0051 | 0.42 | 0.91 | -0.045 | 100 | 0.98 | 1.6 | `scoringutils` contains additional functionality to transform forecasts, to summarise scores at different levels, to visualise them, and to @@ -174,20 +174,27 @@ example_quantile %>% score %>% summarise_scores(by = c("model", "target_type", "scale")) %>% head() -#> model target_type scale interval_score dispersion -#> 1: EuroCOVIDhub-baseline Cases log 1.169972e+00 0.4373146 -#> 2: EuroCOVIDhub-baseline Cases natural 2.209046e+04 4102.5009443 -#> 3: EuroCOVIDhub-ensemble Cases log 5.500974e-01 0.1011850 -#> 4: EuroCOVIDhub-ensemble Cases natural 1.155071e+04 3663.5245788 -#> 5: epiforecasts-EpiNow2 Cases log 6.005778e-01 0.1066329 -#> 6: epiforecasts-EpiNow2 Cases natural 1.443844e+04 5664.3779484 -#> underprediction overprediction coverage_deviation bias ae_median -#> 1: 3.521964e-01 0.3804607 -0.10940217 
0.09726562 1.185905e+00 -#> 2: 1.028497e+04 7702.9836957 -0.10940217 0.09726562 3.208048e+04 -#> 3: 1.356563e-01 0.3132561 -0.09785326 -0.05640625 7.410484e-01 -#> 4: 4.237177e+03 3650.0047554 -0.09785326 -0.05640625 1.770795e+04 -#> 5: 1.858699e-01 0.3080750 -0.06660326 -0.07890625 7.656591e-01 -#> 6: 3.260356e+03 5513.7058424 -0.06660326 -0.07890625 2.153070e+04 +#> model target_type scale wis overprediction +#> 1: EuroCOVIDhub-ensemble Cases natural 11550.70664 3650.004755 +#> 2: EuroCOVIDhub-baseline Cases natural 22090.45747 7702.983696 +#> 3: epiforecasts-EpiNow2 Cases natural 14438.43943 5513.705842 +#> 4: EuroCOVIDhub-ensemble Deaths natural 41.42249 7.138247 +#> 5: EuroCOVIDhub-baseline Deaths natural 159.40387 65.899117 +#> 6: UMass-MechBayes Deaths natural 52.65195 8.978601 +#> underprediction dispersion bias coverage_50 coverage_90 +#> 1: 4237.177310 3663.52458 -0.05640625 0.3906250 0.8046875 +#> 2: 10284.972826 4102.50094 0.09726562 0.3281250 0.8203125 +#> 3: 3260.355639 5664.37795 -0.07890625 0.4687500 0.7890625 +#> 4: 4.103261 30.18099 0.07265625 0.8750000 1.0000000 +#> 5: 2.098505 91.40625 0.33906250 0.6640625 1.0000000 +#> 6: 16.800951 26.87239 -0.02234375 0.4609375 0.8750000 +#> coverage_deviation ae_median +#> 1: -0.10230114 17707.95312 +#> 2: -0.11437500 32080.48438 +#> 3: -0.06963068 21530.69531 +#> 4: 0.20380682 53.13281 +#> 5: 0.12142045 233.25781 +#> 6: -0.02488636 78.47656 ``` ## Citation diff --git a/data/metrics_quantile.rda b/data/metrics_quantile.rda new file mode 100644 index 000000000..b5598113d Binary files /dev/null and b/data/metrics_quantile.rda differ diff --git a/inst/create-list-available-forecasts.R b/inst/create-list-available-forecasts.R index 0ada6de32..07105a7f4 100644 --- a/inst/create-list-available-forecasts.R +++ b/inst/create-list-available-forecasts.R @@ -22,4 +22,15 @@ metrics_sample <- list( ) usethis::use_data(metrics_sample, overwrite = TRUE) -metrics_quantile <- list() +metrics_quantile <- list( + "wis" = wis, + "overprediction" = overprediction, + "underprediction" = underprediction, + "dispersion" = dispersion, + "bias" = bias_quantile, + "coverage_50" = interval_coverage_quantile, + "coverage_90" = \(...) {run_safely(..., range = 90, fun = interval_coverage_quantile)}, + "coverage_deviation" = interval_coverage_deviation_quantile, + "ae_median" = ae_median_quantile +) +usethis::use_data(metrics_quantile, overwrite = TRUE) diff --git a/man/abs_error.Rd b/man/abs_error.Rd index 099937bfd..197703193 100644 --- a/man/abs_error.Rd +++ b/man/abs_error.Rd @@ -7,10 +7,14 @@ abs_error(observed, predicted) } \arguments{ -\item{observed}{A vector with observed values of size n} +\item{observed}{numeric vector of size n with the observed values} -\item{predicted}{numeric vector with predictions, corresponding to the -quantiles in a second vector, \code{quantiles}.} +\item{predicted}{numeric nxN matrix of predictive +quantiles, n (number of rows) being the number of forecasts (corresponding +to the number of observed values) and N +(number of columns) the number of quantiles per forecast. 
+If \code{observed} is just a single number, then predicted can just be a +vector of size N.} } \value{ vector with the absolute error diff --git a/man/add_coverage.Rd b/man/add_coverage.Rd index 33990a3bc..d6c82d467 100644 --- a/man/add_coverage.Rd +++ b/man/add_coverage.Rd @@ -1,40 +1,56 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarise_scores.R +% Please edit documentation in R/add_coverage.R \name{add_coverage} \alias{add_coverage} -\title{Add coverage of central prediction intervals} +\title{Add Coverage Values to Quantile-Based Forecasts} \usage{ -add_coverage(scores, by = NULL, ranges = c(50, 90)) +add_coverage(data) } \arguments{ -\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} - -\item{by}{character vector with column names to add the coverage for.} - -\item{ranges}{numeric vector of the ranges of the central prediction intervals -for which coverage values shall be added.} +\item{data}{A data.frame or data.table with predicted and observed values.} } \value{ -a data.table with unsummarised scores with columns added for the -coverage of the central prediction intervals. While the overall data.table -is still unsummarised, note that for the coverage columns some level of -summary is present according to the value specified in \code{by}. +a data.table with the input and columns "interval_coverage", +"interval_coverage_deviation", "quantile_coverage", +"quantile_coverage_deviation" added. } \description{ -Adds a column with the coverage of central prediction intervals -to unsummarised scores as produced by \code{\link[=score]{score()}} +Adds interval coverage of central prediction intervals, +quantile coverage for predictive quantiles, as well as the deviation between +desired and actual coverage to a data.table. Forecasts should be in a +quantile format (following the input requirements of \code{score()}). + +\strong{Interval coverage} + +Coverage for a given interval range is defined as the proportion of +observations that fall within the corresponding central prediction intervals. +Central prediction intervals are symmetric around the median and are formed +by two quantiles that denote the lower and upper bound. For example, the 50\% +central prediction interval is the interval between the 0.25 and 0.75 +quantiles of the predictive distribution. + +The function \code{add_coverage()} computes the coverage per central prediction +interval, so the coverage will always be either \code{TRUE} (observed value falls +within the interval) or \code{FALSE} (observed value falls outside the interval). +You can summarise the coverage values to get the proportion of observations +that fall within the central prediction intervals. + +\strong{Quantile coverage} + +Quantile coverage for a given quantile is defined as the proportion of +observed values that are smaller than the corresponding predictive quantile. +For example, the 0.5 quantile coverage is the proportion of observed values +that are smaller than the 0.5 quantile of the predictive distribution. + +\strong{Coverage deviation} -The coverage values that are added are computed according to the values -specified in \code{by}. If, for example, \code{by = "model"}, then there will be one -coverage value for every model and \code{\link[=add_coverage]{add_coverage()}} will compute the coverage -for every model across the values present in all other columns which define -the unit of a single forecast. 
+The coverage deviation is the difference between the desired coverage and the +actual coverage. For example, if the desired coverage is 90\% and the actual +coverage is 80\%, the coverage deviation is -0.1. } \examples{ library(magrittr) # pipe operator -score(example_quantile) \%>\% - add_coverage(by = c("model", "target_type")) \%>\% - summarise_scores(by = c("model", "target_type")) \%>\% - summarise_scores(fun = signif, digits = 2) +example_quantile \%>\% + add_coverage() } \keyword{scoring} diff --git a/man/ae_median_quantile.Rd b/man/ae_median_quantile.Rd index 841850235..d96965c7c 100644 --- a/man/ae_median_quantile.Rd +++ b/man/ae_median_quantile.Rd @@ -4,38 +4,38 @@ \alias{ae_median_quantile} \title{Absolute Error of the Median (Quantile-based Version)} \usage{ -ae_median_quantile(observed, predicted, quantiles = NULL) +ae_median_quantile(observed, predicted, quantile) } \arguments{ -\item{observed}{A vector with observed values of size n} +\item{observed}{numeric vector of size n with the observed values} -\item{predicted}{numeric vector with predictions, corresponding to the -quantiles in a second vector, \code{quantiles}.} +\item{predicted}{numeric nxN matrix of predictive +quantiles, n (number of rows) being the number of forecasts (corresponding +to the number of observed values) and N +(number of columns) the number of quantiles per forecast. +If \code{observed} is just a single number, then predicted can just be a +vector of size N.} -\item{quantiles}{numeric vector that denotes the quantile for the values -in \code{predicted}. Only those predictions where \code{quantiles == 0.5} will -be kept. If \code{quantiles} is \code{NULL}, then all \code{predicted} and -\code{observed} will be used (this is then the same as \code{\link[=abs_error]{abs_error()}})} +\item{quantile}{vector with quantile levels of size N} } \value{ -vector with the scoring values +numeric vector of length n with the absolute error of the median } \description{ -Absolute error of the median calculated as - +Compute the absolute error of the median calculated as \deqn{ - \textrm{abs}(\textrm{observed} - \textrm{prediction}) + \textrm{abs}(\textrm{observed} - \textrm{median prediction}) }{ abs(observed - median_prediction) } - -The function was created for internal use within \code{\link[=score]{score()}}, but can also -used as a standalone function. +The median prediction is the predicted value for which quantile == 0.5; +the function therefore requires 0.5 to be among the quantile levels in +\code{quantile}. 
} \examples{ observed <- rnorm(30, mean = 1:30) -predicted_values <- rnorm(30, mean = 1:30) -ae_median_quantile(observed, predicted_values, quantiles = 0.5) +predicted_values <- matrix(rnorm(30, mean = 1:30)) +ae_median_quantile(observed, predicted_values, quantile = 0.5) } \seealso{ \code{\link[=ae_median_sample]{ae_median_sample()}}, \code{\link[=abs_error]{abs_error()}} diff --git a/man/ae_median_sample.Rd b/man/ae_median_sample.Rd index d446b3300..1e420fcc4 100644 --- a/man/ae_median_sample.Rd +++ b/man/ae_median_sample.Rd @@ -27,7 +27,7 @@ Absolute error of the median calculated as } \examples{ observed <- rnorm(30, mean = 1:30) -predicted_values <- rnorm(30, mean = 1:30) +predicted_values <- matrix(rnorm(30, mean = 1:30)) ae_median_sample(observed, predicted_values) } \seealso{ diff --git a/man/assert_input_interval.Rd b/man/assert_input_interval.Rd new file mode 100644 index 000000000..b8d2d93d0 --- /dev/null +++ b/man/assert_input_interval.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{assert_input_interval} +\alias{assert_input_interval} +\title{Assert that inputs are correct for interval-based forecast} +\usage{ +assert_input_interval(observed, lower, upper, range) +} +\arguments{ +\item{observed}{Input to be checked. Should be a numeric vector with the +observed values of size n} + +\item{lower}{Input to be checked. Should be a numeric vector of size n that +holds the predicted value for the lower bounds of the prediction intervals.} + +\item{upper}{Input to be checked. Should be a numeric vector of size n that +holds the predicted value for the upper bounds of the prediction intervals.} + +\item{range}{Input to be checked. Should be a vector of size n that +denotes the interval range in percent. E.g. a value of 50 denotes a +(25\%, 75\%) prediction interval.} +} +\value{ +Returns NULL invisibly if the assertion was successful and throws an +error otherwise. +} +\description{ +Function assesses whether the inputs correspond to the +requirements for scoring interval-based forecasts. +} +\keyword{internal} diff --git a/man/assert_input_quantile.Rd b/man/assert_input_quantile.Rd index 87247c0ff..c9a499136 100644 --- a/man/assert_input_quantile.Rd +++ b/man/assert_input_quantile.Rd @@ -4,7 +4,7 @@ \alias{assert_input_quantile} \title{Assert that inputs are correct for quantile-based forecast} \usage{ -assert_input_quantile(observed, predicted, quantile) +assert_input_quantile(observed, predicted, quantile, unique_quantiles = TRUE) } \arguments{ \item{observed}{Input to be checked. Should be a numeric vector with the @@ -19,6 +19,9 @@ vector of size N.} \item{quantile}{Input to be checked. Should be a vector of size N that denotes the quantile levels corresponding to the columns of the prediction matrix.} + +\item{unique_quantiles}{Input to be checked. Should be TRUE (default) or +FALSE. 
Whether the quantile levels are required to be unique or not.} } \value{ Returns NULL invisibly if the assertion was successful and throws an diff --git a/man/bias_quantile.Rd b/man/bias_quantile.Rd index 4cbeb0338..a5f4b6492 100644 --- a/man/bias_quantile.Rd +++ b/man/bias_quantile.Rd @@ -4,7 +4,7 @@ \alias{bias_quantile} \title{Determines Bias of Quantile Forecasts} \usage{ -bias_quantile(observed, predicted, quantile) +bias_quantile(observed, predicted, quantile, na.rm = TRUE) } \arguments{ \item{observed}{a single observed value} @@ -15,6 +15,8 @@ that holds predictions} \item{quantile}{vector of corresponding size with the quantile levels for which predictions were made. If this does not contain the median (0.5) then the median is imputed as being the mean of the two innermost quantiles.} + +\item{na.rm}{logical. Should missing values be removed?} } \value{ scalar with the quantile bias for a single quantile prediction diff --git a/man/bias_quantile_single_vector.Rd b/man/bias_quantile_single_vector.Rd new file mode 100644 index 000000000..26ac893b5 --- /dev/null +++ b/man/bias_quantile_single_vector.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/metrics-quantile.R +\name{bias_quantile_single_vector} +\alias{bias_quantile_single_vector} +\title{Compute Bias for a Single Vector of Quantile Predictions} +\usage{ +bias_quantile_single_vector(observed, predicted, quantile, na.rm) +} +\arguments{ +\item{observed}{scalar with the observed value} + +\item{predicted}{vector of length N corresponding to the number of quantiles +that holds predictions} + +\item{quantile}{vector of corresponding size N with the quantile levels for +which predictions were made. If this does not contain the median (0.5) then +the median is imputed as being the mean of the two innermost quantiles.} + +\item{na.rm}{logical. Should missing values be removed?} +} +\value{ +scalar with the quantile bias for a single quantile prediction +} +\description{ +Internal function to compute bias for a single observed value, +a vector of predicted values and a vector of quantiles. +} diff --git a/man/check_input_interval.Rd b/man/check_input_interval.Rd new file mode 100644 index 000000000..faa9e4db5 --- /dev/null +++ b/man/check_input_interval.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{check_input_interval} +\alias{check_input_interval} +\title{Check that inputs are correct for interval-based forecast} +\usage{ +check_input_interval(observed, lower, upper, range) +} +\arguments{ +\item{observed}{Input to be checked. Should be a numeric vector with the +observed values of size n} + +\item{lower}{Input to be checked. Should be a numeric vector of size n that +holds the predicted value for the lower bounds of the prediction intervals.} + +\item{upper}{Input to be checked. Should be a numeric vector of size n that +holds the predicted value for the upper bounds of the prediction intervals.} + +\item{range}{Input to be checked. Should be a vector of size n that +denotes the interval range in percent. E.g. a value of 50 denotes a +(25\%, 75\%) prediction interval.} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Function assesses whether the inputs correspond to the +requirements for scoring interval-based forecasts. 
+} +\keyword{check-inputs} diff --git a/man/compare_two_models.Rd b/man/compare_two_models.Rd index 39780292e..1d4686a1c 100644 --- a/man/compare_two_models.Rd +++ b/man/compare_two_models.Rd @@ -22,7 +22,7 @@ compare_two_models( \item{name_model2}{character, name of the model to compare against} \item{metric}{A character vector of length one with the metric to do the -comparison on. The default is "auto", meaning that either "interval_score", +comparison on. The default is "auto", meaning that either "wis", "crps", or "brier_score" will be selected where available.} \item{one_sided}{Boolean, default is \code{FALSE}, whether two conduct a one-sided diff --git a/man/ensure_data.table.Rd b/man/ensure_data.table.Rd new file mode 100644 index 000000000..6d2457ee5 --- /dev/null +++ b/man/ensure_data.table.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{ensure_data.table} +\alias{ensure_data.table} +\title{Ensure That an Object is a Data Table} +\usage{ +ensure_data.table(data) +} +\arguments{ +\item{data}{An object to ensure is a data table} +} +\value{ +A data table +} +\description{ +This function ensures that an object is a data table. +If the object is not a data table, it is converted to one. If the object +is a data table, a copy of the object is returned. +} +\keyword{internal} diff --git a/man/example_binary.Rd b/man/example_binary.Rd index e7042d6b2..47797b8cd 100644 --- a/man/example_binary.Rd +++ b/man/example_binary.Rd @@ -19,7 +19,7 @@ A data frame with 346 rows and 10 columns: } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_binary diff --git a/man/example_continuous.Rd b/man/example_continuous.Rd index d1fba390e..354ebc5d6 100644 --- a/man/example_continuous.Rd +++ b/man/example_continuous.Rd @@ -20,7 +20,7 @@ A data frame with 13,429 rows and 10 columns: } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_continuous diff --git a/man/example_point.Rd b/man/example_point.Rd index 62af0e44f..1eb734b76 100644 --- a/man/example_point.Rd +++ b/man/example_point.Rd @@ -19,7 +19,7 @@ A data frame with } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_point diff --git a/man/example_quantile.Rd b/man/example_quantile.Rd index 00250e6d0..2582907e9 100644 --- a/man/example_quantile.Rd +++ b/man/example_quantile.Rd @@ -20,7 +20,7 @@ A data frame with } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_quantile diff --git a/man/example_quantile_forecasts_only.Rd b/man/example_quantile_forecasts_only.Rd index 3fcaf2722..d789ed1e0 100644 --- 
a/man/example_quantile_forecasts_only.Rd +++ b/man/example_quantile_forecasts_only.Rd @@ -18,7 +18,7 @@ A data frame with 7,581 rows and 9 columns: } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_quantile_forecasts_only diff --git a/man/example_truth_only.Rd b/man/example_truth_only.Rd index 46453ba97..f8ae05afa 100644 --- a/man/example_truth_only.Rd +++ b/man/example_truth_only.Rd @@ -15,7 +15,7 @@ A data frame with 140 rows and 5 columns: } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_truth_only diff --git a/man/get_range_from_quantile.Rd b/man/get_range_from_quantile.Rd new file mode 100644 index 000000000..eca15af36 --- /dev/null +++ b/man/get_range_from_quantile.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils_data_handling.R +\name{get_range_from_quantile} +\alias{get_range_from_quantile} +\title{Get Range Belonging to a Quantile} +\usage{ +get_range_from_quantile(quantile) +} +\arguments{ +\item{quantile}{a numeric vector of quantile levels of size N} +} +\value{ +a numeric vector of interval ranges of size N +} +\description{ +Every quantile can be thought of either as the lower or the +upper bound of a symmetric central prediction interval. This helper function +returns the range of the central prediction interval to which the quantile +belongs. + +Due to numeric instability that sometimes occurred in the past, ranges are +rounded to 10 decimal places. This is not a problem for the vast majority of +use cases, but it is something to be aware of. +} +\keyword{internal} diff --git a/man/interval_coverage.Rd b/man/interval_coverage.Rd new file mode 100644 index 000000000..74256cc77 --- /dev/null +++ b/man/interval_coverage.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/metrics-quantile.R, R/metrics-sample.R +\name{interval_coverage} +\alias{interval_coverage} +\alias{interval_coverage_quantile} +\alias{interval_coverage_sample} +\title{Interval Coverage (For Quantile-Based Forecasts)} +\usage{ +interval_coverage_quantile(observed, predicted, quantile, range = 50) + +interval_coverage_sample(observed, predicted, range = 50) +} +\arguments{ +\item{observed}{numeric vector of size n with the observed values} + +\item{predicted}{numeric nxN matrix of predictive +quantiles, n (number of rows) being the number of forecasts (corresponding +to the number of observed values) and N +(number of columns) the number of quantiles per forecast. +If \code{observed} is just a single number, then predicted can just be a +vector of size N.} + +\item{quantile}{vector with quantile levels of size N} + +\item{range}{A single number with the range of the prediction interval in +percent (e.g. 50 for a 50\% prediction interval) for which you want to compute +coverage.} +} +\value{ +A vector of length n with TRUE if the observed value is within the +corresponding prediction interval and FALSE otherwise. +} +\description{ +Check whether the observed value is within a given central +prediction interval. 
The prediction interval is defined by a lower and an +upper bound formed by a pair of predictive quantiles. For example, a 50\% +prediction interval is formed by the 0.25 and 0.75 quantiles of the +predictive distribution. + +To compute coverage for sample-based forecasts, +predictive samples are first converted into predictive quantiles using the +sample quantiles. +} +\examples{ +observed <- c(1, -15, 22) +predicted <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) +) +quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9) +interval_coverage_quantile(observed, predicted, quantile) +interval_coverage_sample(observed, predicted) +} diff --git a/man/interval_coverage_deviation_quantile.Rd b/man/interval_coverage_deviation_quantile.Rd new file mode 100644 index 000000000..9a6029ec7 --- /dev/null +++ b/man/interval_coverage_deviation_quantile.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/metrics-quantile.R +\name{interval_coverage_deviation_quantile} +\alias{interval_coverage_deviation_quantile} +\title{Interval Coverage Deviation (For Quantile-Based Forecasts)} +\usage{ +interval_coverage_deviation_quantile(observed, predicted, quantile) +} +\arguments{ +\item{observed}{numeric vector of size n with the observed values} + +\item{predicted}{numeric nxN matrix of predictive +quantiles, n (number of rows) being the number of forecasts (corresponding +to the number of observed values) and N +(number of columns) the number of quantiles per forecast. +If \code{observed} is just a single number, then predicted can just be a +vector of size N.} + +\item{quantile}{vector with quantile levels of size N} +} +\value{ +A numeric vector of length n with the coverage deviation for each +forecast (comprising one or multiple prediction intervals). +} +\description{ +Check the agreement between desired and actual interval coverage +of a forecast. + +The function is similar to \code{\link[=interval_coverage_quantile]{interval_coverage_quantile()}}, +but looks at all provided prediction intervals instead of only one. It +compares nominal coverage (i.e. the desired coverage) with the actual +observed coverage. + +A central symmetric prediction interval is defined by a lower and an +upper bound formed by a pair of predictive quantiles. For example, a 50\% +prediction interval is formed by the 0.25 and 0.75 quantiles of the +predictive distribution. Ideally, a forecaster should aim to cover about +50\% of all observed values with their 50\% prediction intervals, 90\% of all +observed values with their 90\% prediction intervals, and so on. + +For every prediction interval, the deviation is computed as the difference +between the observed coverage and the nominal coverage. +For a single observed value and a single prediction interval, +coverage is always either 0 or 1. This is not the case for a single observed +value and multiple prediction intervals, but it still doesn't make that much +sense to compare nominal (desired) coverage and actual coverage for a single +observation. In that sense coverage deviation only really starts to make +sense as a metric when averaged across multiple observations. + +Positive values of coverage deviation are an indication of underconfidence, +i.e. the forecaster could likely have issued a narrower forecast. Negative +values are an indication of overconfidence, i.e. the forecasts were too +narrow. 
+ +\deqn{ +\textrm{coverage deviation} = +\mathbf{1}(\textrm{observed value falls within interval}) - +\textrm{nominal coverage} +}{ +coverage deviation = +1(observed value falls within interval) - nominal coverage +} +The coverage deviation is then averaged across all prediction intervals. +The median is ignored when computing coverage deviation. +} +\examples{ +observed <- c(1, -15, 22) +predicted <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) +) +quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9) +interval_coverage_deviation_quantile(observed, predicted, quantile) +} diff --git a/man/interval_score.Rd b/man/interval_score.Rd index 9232fde21..9c0f4c567 100644 --- a/man/interval_score.Rd +++ b/man/interval_score.Rd @@ -80,7 +80,7 @@ observed <- rnorm(30, mean = 1:30) interval_range <- rep(90, 30) alpha <- (100 - interval_range) / 100 lower <- qnorm(alpha / 2, rnorm(30, mean = 1:30)) -upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 1:30)) +upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 11:40)) interval_score( observed = observed, @@ -90,7 +90,7 @@ interval_score( ) # gives a warning, as the interval_range should likely be 50 instead of 0.5 -interval_score(observed = 4, upper = 2, lower = 8, interval_range = 0.5) +interval_score(observed = 4, upper = 8, lower = 2, interval_range = 0.5) # example with missing values and separate results interval_score( diff --git a/man/metrics_quantile.Rd b/man/metrics_quantile.Rd new file mode 100644 index 000000000..ea444ee7e --- /dev/null +++ b/man/metrics_quantile.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{metrics_quantile} +\alias{metrics_quantile} +\title{Default metrics for quantile-based forecasts.} +\format{ +An object of class \code{list} of length 9. +} +\usage{ +metrics_quantile +} +\description{ +A named list with functions: +\itemize{ +\item "wis" = \code{\link[=wis]{wis()}} +\item "overprediction" = \code{\link[=overprediction]{overprediction()}} +\item "underprediction" = \code{\link[=underprediction]{underprediction()}} +\item "dispersion" = \code{\link[=dispersion]{dispersion()}} +\item "bias" = \code{\link[=bias_quantile]{bias_quantile()}} +\item "coverage_50" = \(...) {run_safely(..., range = 50, fun = \link[=interval_coverage_quantile]{interval_coverage_quantile})} +\item "coverage_90" = \(...) {run_safely(..., range = 90, fun = \link[=interval_coverage_quantile]{interval_coverage_quantile})} +\item "coverage_deviation" = \code{\link[=interval_coverage_deviation_quantile]{interval_coverage_deviation_quantile()}} +\item "ae_median" = \code{\link[=ae_median_quantile]{ae_median_quantile()}} +} +} +\keyword{info} diff --git a/man/pairwise_comparison.Rd b/man/pairwise_comparison.Rd index 9288e77fb..d30be1197 100644 --- a/man/pairwise_comparison.Rd +++ b/man/pairwise_comparison.Rd @@ -25,7 +25,7 @@ splitting) and the pairwise comparisons will be computed separately for the split data.frames.} \item{metric}{A character vector of length one with the metric to do the -comparison on. 
The default is "auto", meaning that either "wis", "crps", or "brier_score" will be selected where available.} \item{baseline}{character vector of length one that denotes the baseline diff --git a/man/pairwise_comparison_one_group.Rd b/man/pairwise_comparison_one_group.Rd index a7d902f15..df59b1472 100644 --- a/man/pairwise_comparison_one_group.Rd +++ b/man/pairwise_comparison_one_group.Rd @@ -10,7 +10,7 @@ pairwise_comparison_one_group(scores, metric, baseline, by, ...) \item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} \item{metric}{A character vector of length one with the metric to do the -comparison on. The default is "auto", meaning that either "interval_score", +comparison on. The default is "auto", meaning that either "wis", "crps", or "brier_score" will be selected where available.} \item{baseline}{character vector of length one that denotes the baseline diff --git a/man/plot_heatmap.Rd b/man/plot_heatmap.Rd index 8b1aac549..837ef4243 100644 --- a/man/plot_heatmap.Rd +++ b/man/plot_heatmap.Rd @@ -29,7 +29,7 @@ different locations. } \examples{ scores <- score(example_quantile) -scores <- summarise_scores(scores, by = c("model", "target_type", "range")) +scores <- summarise_scores(scores, by = c("model", "target_type")) plot_heatmap(scores, x = "target_type", metric = "bias") } diff --git a/man/plot_interval_coverage.Rd b/man/plot_interval_coverage.Rd index 9c7da16fe..6c4c3e985 100644 --- a/man/plot_interval_coverage.Rd +++ b/man/plot_interval_coverage.Rd @@ -21,8 +21,8 @@ ggplot object with a plot of interval coverage Plot interval coverage } \examples{ -data.table::setDTthreads(1) # only needed to avoid issues on CRAN -scores <- score(example_quantile) -scores <- summarise_scores(scores, by = c("model", "range")) -plot_interval_coverage(scores) +# data.table::setDTthreads(1) # only needed to avoid issues on CRAN +# scores <- score(example_quantile) +# scores <- summarise_scores(scores, by = c("model", "range")) +# plot_interval_coverage(scores) } diff --git a/man/plot_quantile_coverage.Rd b/man/plot_quantile_coverage.Rd index 2e6ef489e..c479fb5e3 100644 --- a/man/plot_quantile_coverage.Rd +++ b/man/plot_quantile_coverage.Rd @@ -21,7 +21,7 @@ ggplot object with a plot of interval coverage Plot quantile coverage } \examples{ -scores <- score(example_quantile) -scores <- summarise_scores(scores, by = c("model", "quantile")) -plot_quantile_coverage(scores) +# scores <- score(example_quantile) +# scores <- summarise_scores(scores, by = c("model", "quantile")) +# plot_quantile_coverage(scores) } diff --git a/man/plot_ranges.Rd b/man/plot_ranges.Rd index a4a999ff2..27922b3c0 100644 --- a/man/plot_ranges.Rd +++ b/man/plot_ranges.Rd @@ -4,7 +4,7 @@ \alias{plot_ranges} \title{Plot Metrics by Range of the Prediction Interval} \usage{ -plot_ranges(scores, y = "interval_score", x = "model", colour = "range") +plot_ranges(scores, y = "wis", x = "model", colour = "range") } \arguments{ \item{scores}{A data.frame of scores based on quantile forecasts as @@ -12,7 +12,7 @@ produced by \code{\link[=score]{score()}} or \code{\link[=summarise_scores]{summ in the \code{by} argument when running \code{\link[=summarise_scores]{summarise_scores()}}} \item{y}{The variable from the scores you want to show on the y-Axis. -This could be something like "interval_score" (the default) or "dispersion"} +This could be something like "wis" (the default) or "dispersion"} \item{x}{The variable from the scores you want to show on the x-Axis. 
Usually this will be "model"} @@ -31,13 +31,13 @@ sharpness / dispersion changes by range. } \examples{ library(ggplot2) -scores <- score(example_quantile) -scores <- summarise_scores(scores, by = c("model", "target_type", "range")) +# scores <- score(example_quantile) +# scores <- summarise_scores(scores, by = c("model", "target_type", "range")) -plot_ranges(scores, x = "model") + - facet_wrap(~target_type, scales = "free") +# plot_ranges(scores, x = "model") + +# facet_wrap(~target_type, scales = "free") # visualise dispersion instead of interval score -plot_ranges(scores, y = "dispersion", x = "model") + - facet_wrap(~target_type) +# plot_ranges(scores, y = "dispersion", x = "model") + +# facet_wrap(~target_type) } diff --git a/man/run_safely.Rd b/man/run_safely.Rd new file mode 100644 index 000000000..9e673b061 --- /dev/null +++ b/man/run_safely.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{run_safely} +\alias{run_safely} +\title{Run a function safely} +\usage{ +run_safely(..., fun) +} +\arguments{ +\item{...}{Arguments to pass to \code{fun}} + +\item{fun}{A function to execute} +} +\value{ +The result of \code{fun} or \code{NULL} if \code{fun} errors +} +\description{ +This is a wrapper function designed to run a function safely +when it is not completely clear what arguments could be passed to the +function. + +All named arguments in \code{...} that are not accepted by \code{fun} are removed. +All unnamed arguments are passed on to the function. In case \code{fun} errors, +the error will be converted to a warning and \code{run_safely} returns \code{NULL}. + +\code{run_safely} can be useful when constructing functions to be used as +metrics in \code{\link[=score]{score()}}. +} +\examples{ +f <- function(x) {x} +run_safely(2, fun = f) +run_safely(2, y = 3, fun = f) +run_safely(fun = f) +run_safely(y = 3, fun = f) +} diff --git a/man/score.Rd b/man/score.Rd index bf239ba7e..c26fbb24c 100644 --- a/man/score.Rd +++ b/man/score.Rd @@ -19,7 +19,7 @@ score(data, ...) \method{score}{scoringutils_sample}(data, metrics = metrics_sample, ...) -\method{score}{scoringutils_quantile}(data, metrics = NULL, ...) +\method{score}{scoringutils_quantile}(data, metrics = metrics_quantile, ...) } \arguments{ \item{data}{A data.frame or data.table with predicted and observed values.} diff --git a/man/score_quantile.Rd b/man/score_quantile.Rd deleted file mode 100644 index 5f51f94ec..000000000 --- a/man/score_quantile.Rd +++ /dev/null @@ -1,62 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/score_quantile.R -\name{score_quantile} -\alias{score_quantile} -\title{Evaluate forecasts in a Quantile-Based Format} -\usage{ -score_quantile( - data, - forecast_unit, - metrics, - weigh = TRUE, - count_median_twice = FALSE, - separate_results = TRUE ) -} -\arguments{ -\item{data}{A data.frame or data.table with predicted and observed values.} - -\item{forecast_unit}{A character vector with the column names that define -the unit of a single forecast, i.e. a forecast was made for a combination -of the values in \code{forecast_unit}} - -\item{metrics}{A named list of scoring functions. Names will be used as -column names in the output. 
diff --git a/man/score.Rd b/man/score.Rd
index bf239ba7e..c26fbb24c 100644
--- a/man/score.Rd
+++ b/man/score.Rd
@@ -19,7 +19,7 @@ score(data, ...)
 
 \method{score}{scoringutils_sample}(data, metrics = metrics_sample, ...)
 
-\method{score}{scoringutils_quantile}(data, metrics = NULL, ...)
+\method{score}{scoringutils_quantile}(data, metrics = metrics_quantile, ...)
 }
 \arguments{
 \item{data}{A data.frame or data.table with predicted and observed values.}
diff --git a/man/score_quantile.Rd b/man/score_quantile.Rd
deleted file mode 100644
index 5f51f94ec..000000000
--- a/man/score_quantile.Rd
+++ /dev/null
@@ -1,62 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/score_quantile.R
-\name{score_quantile}
-\alias{score_quantile}
-\title{Evaluate forecasts in a Quantile-Based Format}
-\usage{
-score_quantile(
-  data,
-  forecast_unit,
-  metrics,
-  weigh = TRUE,
-  count_median_twice = FALSE,
-  separate_results = TRUE
-)
-}
-\arguments{
-\item{data}{A data.frame or data.table with predicted and observed values.}
-
-\item{forecast_unit}{A character vector with the column names that define
-the unit of a single forecast, i.e. a forecast was made for a combination
-of the values in \code{forecast_unit}}
-
-\item{metrics}{A named list of scoring functions. Names will be used as
-column names in the output. See \code{\link[=metrics_point]{metrics_point()}}, \code{\link[=metrics_binary]{metrics_binary()}},
-\code{metrics_quantile()}, and \code{\link[=metrics_sample]{metrics_sample()}} for more information on the
-default metrics used.}
-
-\item{weigh}{if TRUE, weigh the score by alpha / 2, so it can be averaged
-into an interval score that, in the limit, corresponds to CRPS. Alpha is the
-decimal value that represents how much is outside a central prediction
-interval (e.g. for a 90 percent central prediction interval, alpha is 0.1)
-Default: \code{TRUE}.}
-
-\item{count_median_twice}{logical that controls whether or not to count the
-median twice when summarising (default is \code{FALSE}). Counting the
-median twice would conceptually treat it as a 0\% prediction interval, where
-the median is the lower as well as the upper bound. The alternative is to
-treat the median as a single quantile forecast instead of an interval. The
-interval score would then be better understood as an average of quantile
-scores.}
-
-\item{separate_results}{if \code{TRUE} (default is \code{FALSE}), then the separate
-parts of the interval score (dispersion penalty, penalties for over- and
-under-prediction get returned as separate elements of a list). If you want a
-\code{data.frame} instead, simply call \code{\link[=as.data.frame]{as.data.frame()}} on the output.}
-}
-\value{
-A data.table with appropriate scores. For more information see
-\code{\link[=score]{score()}}
-}
-\description{
-Evaluate forecasts in a Quantile-Based Format
-}
-\references{
-Bosse NI, Gruson H, Cori A, van Leeuwen E, Funk S, Abbott S
-(2022) Evaluating Forecasts with scoringutils in R.
-\doi{10.48550/arXiv.2205.07090}
-}
-\author{
-Nikos Bosse \email{nikosbosse@gmail.com}
-}
-\keyword{internal}
diff --git a/man/se_mean_sample.Rd b/man/se_mean_sample.Rd
index d7c7d332f..c2101d4df 100644
--- a/man/se_mean_sample.Rd
+++ b/man/se_mean_sample.Rd
@@ -20,14 +20,15 @@ vector with the scoring values
 Squared error of the mean calculated as
 \deqn{
-  \textrm{mean}(\textrm{observed} - \textrm{prediction})^2
+  (\textrm{observed} - \textrm{mean prediction})^2
 }{
-  mean(observed - mean_prediction)^2
+  (observed - mean prediction)^2
 }
+The mean prediction is calculated as the mean of the predictive samples.
 }
 \examples{
 observed <- rnorm(30, mean = 1:30)
-predicted_values <- rnorm(30, mean = 1:30)
+predicted_values <- matrix(rnorm(30, mean = 1:30))
 se_mean_sample(observed, predicted_values)
 }
 \seealso{
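To make the corrected formula concrete, here is a minimal sketch. It assumes `se_mean_sample()` scores each forecast separately against the row-wise mean of the predictive samples, which is what the "vector with the scoring values" return and the note above suggest:

```r
library(scoringutils)

observed <- rnorm(30, mean = 1:30)
# 30 forecasts (rows) with 10 predictive samples each (columns)
predicted <- matrix(rnorm(30 * 10, mean = 1:30), nrow = 30)

# assumption: the score is the squared error of the per-forecast sample mean
all.equal(
  se_mean_sample(observed, predicted),
  (observed - rowMeans(predicted))^2
)
```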
diff --git a/man/wis.Rd b/man/wis.Rd
new file mode 100644
index 000000000..65f099757
--- /dev/null
+++ b/man/wis.Rd
@@ -0,0 +1,143 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/metrics-quantile.R
+\name{wis}
+\alias{wis}
+\alias{dispersion}
+\alias{overprediction}
+\alias{underprediction}
+\title{Weighted Interval Score (WIS)}
+\usage{
+wis(
+  observed,
+  predicted,
+  quantile,
+  separate_results = FALSE,
+  weigh = TRUE,
+  count_median_twice = FALSE,
+  na.rm = TRUE
+)
+
+dispersion(observed, predicted, quantile, ...)
+
+overprediction(observed, predicted, quantile, ...)
+
+underprediction(observed, predicted, quantile, ...)
+}
+\arguments{
+\item{observed}{numeric vector of size n with the observed values}
+
+\item{predicted}{numeric nxN matrix of predictive
+quantiles, n (number of rows) being the number of forecasts (corresponding
+to the number of observed values) and N
+(number of columns) the number of quantiles per forecast.
+If \code{observed} is just a single number, then predicted can just be a
+vector of size N.}
+
+\item{quantile}{vector with quantile levels of size N}
+
+\item{separate_results}{if \code{TRUE} (default is \code{FALSE}), then the separate
+parts of the interval score (dispersion penalty, penalties for over- and
+under-prediction) get returned as separate elements of a list. If you want a
+\code{data.frame} instead, simply call \code{\link[=as.data.frame]{as.data.frame()}} on the output.}
+
+\item{weigh}{if TRUE, weigh the score by alpha / 2, so it can be averaged
+into an interval score that, in the limit, corresponds to CRPS. Alpha is the
+decimal value that represents how much is outside a central prediction
+interval (e.g. for a 90 percent central prediction interval, alpha is 0.1).
+Default: \code{TRUE}.}
+
+\item{count_median_twice}{if TRUE, count the median twice in the score}
+
+\item{na.rm}{if TRUE, ignore NA values when computing the score}
+
+\item{...}{Additional arguments passed on to \code{wis()} from functions
+\code{overprediction()}, \code{underprediction()} and \code{dispersion()}}
+}
+\value{
+\code{wis()}: a numeric vector with WIS values of size n (one per observation),
+or a list with separate entries if \code{separate_results} is \code{TRUE}.
+
+\code{dispersion()}: a numeric vector with dispersion values (one per observation)
+
+\code{overprediction()}: a numeric vector with overprediction values (one per
+observation)
+
+\code{underprediction()}: a numeric vector with underprediction values (one per
+observation)
+}
+\description{
+The WIS is a proper scoring rule used to evaluate forecasts in an interval- /
+quantile-based format. See Bracher et al. (2021). Smaller values are better.
+
+As the name suggests, the score assumes that a forecast comes in the form of
+one or multiple central prediction intervals. A prediction interval is
+characterised by a lower and an upper bound formed by a pair of predictive
+quantiles. For example, a 50\% central prediction interval is formed by the
+0.25 and 0.75 quantiles of the predictive distribution.
+
+\strong{Interval score}
+
+The interval score (IS) is the sum of three components:
+overprediction, underprediction and dispersion. The dispersion component is
+the width of the prediction interval, i.e. the difference between the upper
+and lower bound; at most one of the two penalty components is non-zero.
+If the observed value is below the lower bound, the underprediction penalty
+is proportional to the absolute difference between the lower bound and the
+observed value ("underprediction"). "Overprediction" is defined analogously.
+If the observed value falls within the bounds of the prediction interval,
+the interval score is simply equal to the width of the prediction interval.
+For a single interval, we therefore have:
+
+\deqn{
+\textrm{IS} = (\textrm{upper} - \textrm{lower}) + \frac{2}{\alpha}(\textrm{lower}
+ - \textrm{observed}) *
+\mathbf{1}(\textrm{observed} < \textrm{lower}) +
+\frac{2}{\alpha}(\textrm{observed} - \textrm{upper}) *
+\mathbf{1}(\textrm{observed} > \textrm{upper})
+}{
+score = (upper - lower) + 2/alpha * (lower - observed) *
+1(observed < lower) + 2/alpha * (observed - upper) *
+1(observed > upper)
+}
+where \eqn{\mathbf{1}()}{1()} is the indicator function and
+\eqn{\alpha}{alpha} is the decimal value that indicates how much is outside
+the prediction interval. For a 90\% prediction interval, for example,
+\eqn{\alpha}{alpha} is equal to 0.1. No specific distribution is assumed,
+but the range has to be symmetric (i.e. you can't use the 0.1 quantile
+as the lower bound and the 0.7 quantile as the upper).
+Non-symmetric quantiles can be scored using the function \code{\link[=quantile_score]{quantile_score()}}.
+
+Usually the interval score is weighted by a factor that makes sure that the
+average score across an increasing number of equally spaced
+quantiles converges to the continuous ranked probability score (CRPS). This
+weighted score is called the weighted interval score (WIS).
+The weight commonly used is \eqn{\alpha / 2}{alpha / 2}.
+
+\strong{Quantile score}
+
+In addition to the interval score, there also exists a quantile score (QS)
+(see \code{\link[=quantile_score]{quantile_score()}}), which is equal to the so-called pinball loss.
+The quantile score can be computed for a single quantile (whereas the
+interval score requires two quantiles that form an interval). However,
+the intuitive decomposition into overprediction, underprediction and
+dispersion does not exist for the quantile score.
+
+\strong{Two versions of the weighted interval score}
+
+There are two ways to conceptualise the weighted interval score across
+several quantiles / prediction intervals and the median.
+
+In one view, you would treat the WIS as the average of quantile scores (and
+the median as the 0.5-quantile); this is the default for \code{wis()}. In another
+view, you would treat the WIS as the average of several interval scores plus
+the difference between observed value and median forecast. The effect of
+that is that, in contrast to the first view, the median has twice as much
+weight (because it is weighted like a prediction interval, rather than like
+a single quantile). Both are valid ways to conceptualise the WIS and you
+can control the behaviour with the \code{count_median_twice} argument.
+
+\strong{WIS components}:
+WIS components can be computed individually using the functions
+\code{overprediction}, \code{underprediction}, and \code{dispersion}.
+}
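To make the weighting and the `count_median_twice` convention concrete, here is a small sketch that mirrors the "1 interval and median" unit test added in `tests/testthat/test-metrics-quantile.R` below (quantile levels 0.25/0.5/0.75, i.e. one 50% central interval, so alpha = 0.5):

```r
library(scoringutils)

observed <- c(1, -15, 22)
predicted <- rbind( # columns are the 0.25, 0.5 and 0.75 quantiles
  c(0, 1, 2),
  c(1, 2, 2),
  c(0, 3, 3)
)
quantile <- c(0.25, 0.5, 0.75)
alpha <- 0.5

# weighted interval score of the 50% interval: width * alpha / 2, plus the
# distance to the nearer bound whenever the observation falls outside
is_weighted <- (predicted[, 3] - predicted[, 1]) * (alpha / 2) +
  pmax(predicted[, 1] - observed, 0) + pmax(observed - predicted[, 3], 0)

# counting the median twice, the WIS averages the absolute error of the
# median and the weighted interval score
wis(observed, predicted, quantile, count_median_twice = TRUE)
# should equal 0.5 * (abs(observed - predicted[, 2]) + is_weighted)
```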
diff --git a/tests/testthat/_snaps/plot_correlation/plot-correlation.svg b/tests/testthat/_snaps/plot_correlation/plot-correlation.svg
index f56619980..a33e8a207 100644
[Snapshot update: the correlation heatmap now shows the metrics wis, overprediction, underprediction, dispersion, bias, coverage_50, coverage_90, coverage_deviation and ae_median in place of interval_score, dispersion, underprediction, overprediction, coverage_deviation, bias and ae_median; SVG coordinates and cell values omitted.]
diff --git a/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg b/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg
index 528aefb09..8c4222d9a 100644
[Snapshot update: scores are now summarised without "range", so each bias value appears once instead of being repeated per interval range; SVG coordinates omitted.]
diff --git a/tests/testthat/_snaps/plot_interval_coverage/plot-interval-coverage.svg b/tests/testthat/_snaps/plot_interval_coverage/plot-interval-coverage.svg
index 548878c34..91848b1dd 100644
[Snapshot update: the model legend gains an "NA" entry; SVG coordinates omitted.]
diff --git a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg
index 22e2408dc..458487011 100644
[Snapshot update: p-value labels are reordered within the tiles; the values themselves are unchanged.]
diff --git a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg
index 1ff397cfe..3a7599da4 100644
[Snapshot update: score-ratio labels are reordered within the tiles; the values themselves are unchanged.]
diff --git a/tests/testthat/_snaps/plot_quantile_coverage/plot-quantile-coverage.svg b/tests/testthat/_snaps/plot_quantile_coverage/plot-quantile-coverage.svg
index bf686eedb..76808cc67 100644
[Snapshot update: the model legend gains an "NA" entry; SVG coordinates omitted.]
diff --git a/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg b/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg
index 812e1600f..4ad667f8a 100644
[Snapshot update: point geometry reordered and shifted; no label changes; SVG coordinates omitted.]
diff --git a/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg b/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg
index 15c9ee3e3..98a9a883c 100644
[Snapshot update: the y-axis label changes from interval_score to wis; point geometry shifted; SVG coordinates omitted.]
diff --git a/tests/testthat/_snaps/plot_score_table/plot-score-table.svg b/tests/testthat/_snaps/plot_score_table/plot-score-table.svg
index 95f9fe247..25b15d296 100644
[Snapshot update: the score table now lists wis, overprediction, underprediction, dispersion, bias, coverage_50, coverage_90, coverage_deviation and ae_median (previously interval_score, dispersion, underprediction, overprediction, coverage_deviation, bias and ae_median), with values reordered and new coverage_50 / coverage_90 rows; SVG coordinates omitted.]
diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg b/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg
index 758a3c147..c315cf06d 100644
[Snapshot update: minor geometry changes only; SVG coordinates omitted.]
diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg b/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg
index 987072ca4..fea309214 100644
[Snapshot update: minor geometry changes only; SVG coordinates omitted.]
diff --git a/tests/testthat/_snaps/plot_wis/plot-wis.svg b/tests/testthat/_snaps/plot_wis/plot-wis.svg
index 5328b4779..a2bdf8653 100644
[Snapshot update: minor geometry changes only; SVG coordinates omitted.]
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
index c157fb958..a236f299d 100644
--- a/tests/testthat/setup.R
+++ b/tests/testthat/setup.R
@@ -1,8 +1,13 @@
 # load common required test packages
 library(ggplot2, quietly = TRUE)
+library(data.table)
 suppressMessages(library(magrittr))
 
-# compute quantile scores
+metrics_no_cov <- metrics_quantile[!grepl("coverage", names(metrics_quantile))]
+metrics_no_cov_no_ae <- metrics_no_cov[!grepl("ae", names(metrics_no_cov))]
+
+
+# compute scores
 scores_quantile <- suppressMessages(score(example_quantile))
 scores_continuous <- suppressMessages(score(data = example_continuous))
 scores_point <- suppressMessages(score(example_point))
diff --git a/tests/testthat/test-add_coverage.R b/tests/testthat/test-add_coverage.R
index fab1e72a1..689b8640b 100644
--- a/tests/testthat/test-add_coverage.R
+++ b/tests/testthat/test-add_coverage.R
@@ -1,31 +1,13 @@
-ex_coverage <- scores_quantile[model == "EuroCOVIDhub-ensemble"]
+ex_coverage <- example_quantile[model == "EuroCOVIDhub-ensemble"]
 
 test_that("add_coverage() works as expected", {
-  expect_error(
-    add_coverage(ex_coverage, by = c("model", "target_type"), range = c())
-  )
-  expect_error(
-    add_coverage(ex_coverage, by = c("model", "target_type")), NA
-  )
-  cov <- add_coverage(
-    scores_quantile, by = c("model", "target_type"), range = c(10, 20)
-  )
-  expect_equal(
-    grep("coverage_", colnames(cov), value = TRUE),
-    c("coverage_deviation", "coverage_10", "coverage_20")
-  )
-})
-
+  expect_no_condition(cov <- add_coverage(example_quantile))
 
-test_that("Order of `add_coverage()` and `summarise_scores()` doesn't matter", {
-  # Need to update test. Turns out the order does matter...
- # see https://github.com/epiforecasts/scoringutils/issues/367 - pw1 <- add_coverage(ex_coverage, by = "model") - pw1_sum <- summarise_scores(pw1, by = "model") - - pw2 <- summarise_scores(ex_coverage, by = "model") - pw2 <- add_coverage(pw2) + required_names <- c( + "range", "interval_coverage", "interval_coverage_deviation", + "quantile_coverage", "quantile_coverage_deviation" + ) + expect_equal(colnames(cov), c(colnames(example_quantile), required_names)) - # expect_true(all(pw1_sum == pw2, na.rm = TRUE)) - # expect_true(all(names(attributes(pw2)) == names(attributes(pw1_sum)))) + expect_equal(nrow(cov), nrow(example_quantile)) }) diff --git a/tests/testthat/test-bias.R b/tests/testthat/test-bias.R deleted file mode 100644 index a8444707d..000000000 --- a/tests/testthat/test-bias.R +++ /dev/null @@ -1,244 +0,0 @@ -test_that("bias_sample() throws an error when missing observed", { - observed <- rpois(10, lambda = 1:10) - predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error( - bias_sample(predicted = predicted), - 'argument "observed" is missing, with no default' - ) -}) - -test_that("bias_sample() throws an error when missing 'predicted'", { - observed <- rpois(10, lambda = 1:10) - predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error( - bias_sample(observed = observed), - 'argument "predicted" is missing, with no default' - ) -}) - -test_that("bias_sample() works for integer observed and predicted", { - observed <- rpois(10, lambda = 1:10) - predicted <- replicate(10, rpois(10, lambda = 1:10)) - output <- bias_sample( - observed = observed, - predicted = predicted - ) - expect_equal( - length(output), - length(observed) - ) - expect_equal( - class(output), - "numeric" - ) -}) - -test_that("bias_sample() works for continuous observed values and predicted", { - observed <- rnorm(10) - predicted <- replicate(10, rnorm(10)) - output <- bias_sample( - observed = observed, - predicted = predicted - ) - expect_equal( - length(output), - length(observed) - ) - expect_equal( - class(output), - "numeric" - ) -}) - -test_that("bias_sample() works as expected", { - observed <- rpois(30, lambda = 1:30) - predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) - expect_true(all(bias_sample(observed, predicted) == bias_sample(observed, predicted))) - - ## continuous forecasts - observed <- rnorm(30, mean = 1:30) - predicted <- replicate(200, rnorm(30, mean = 1:30)) - - scoringutils2 <- bias_sample(observed, predicted) - scoringutils <- bias_sample(observed, predicted) - - expect_equal(scoringutils, scoringutils2) -}) - -test_that("bias_quantile() handles NA values", { - predicted <- c(NA, 1, 2, 3) - quantiles <- c(0.1, 0.5, 0.9) - - expect_error( - bias_quantile(observed = 2, predicted, quantiles), - "`predicted` and `quantile` must have the same length" - ) -}) - -test_that("bias_quantile() returns NA if no predictions", { - expect_true(is.na(bias_quantile(observed = 2, numeric(0), numeric(0)))) -}) - -test_that("bias_quantile() returns correct bias if value below the median", { - predicted <- c(1, 2, 4, 5) - quantiles <- c(0.1, 0.3, 0.7, 0.9) - suppressMessages( - expect_equal(bias_quantile(observed = 1, predicted, quantiles), 0.8) - ) -}) - -test_that("bias_quantile() returns correct bias if value above median", { - predicted <- c(1, 2, 4, 5) - quantiles <- c(0.1, 0.3, 0.7, 0.9) - suppressMessages( - expect_equal(bias_quantile(observed = 5, predicted, quantiles), -0.8) - ) -}) - -test_that("bias_quantile() returns correct bias if value at the median", { - 
predicted <- c(1, 2, 3, 4) - quantiles <- c(0.1, 0.3, 0.5, 0.7) - - expect_equal(bias_quantile(observed = 3, predicted, quantiles), 0) -}) - -test_that("bias_quantile() returns 1 if true value below min prediction", { - predicted <- c(2, 3, 4, 5) - quantiles <- c(0.1, 0.3, 0.7, 0.9) - - suppressMessages( - expect_equal(bias_quantile(observed = 1, predicted, quantiles), 1) - ) -}) - -test_that("bias_quantile() returns -1 if true value above max prediction", { - predicted <- c(1, 2, 3, 4) - quantiles <- c(0.1, 0.3, 0.5, 0.7) - - expect_equal(bias_quantile(observed = 6, predicted, quantiles), -1) -}) - -test_that("bias_quantile(): quantiles must be between 0 and 1", { - predicted <- 1:4 - - # Failing example - quantiles <- c(-0.1, 0.3, 0.5, 0.8) - expect_error(bias_quantile(observed = 3, predicted, quantiles), - "quantiles must be between 0 and 1") - - # Passing counter example - quantiles <- c(0.1, 0.3, 0.5, 0.8) - expect_silent(bias_quantile(observed = 3, predicted, quantiles)) -}) - -test_that("bias_quantile(): quantiles must be increasing", { - predicted <- 1:4 - - # Failing example - quantiles <- c(0.8, 0.3, 0.5, 0.9) - expect_error(bias_quantile(observed = 3, predicted, quantiles), - "quantiles must be increasing") - - # Passing counter example - quantiles <- c(0.3, 0.5, 0.8, 0.9) - expect_silent(bias_quantile(observed = 3, predicted, quantiles)) -}) - -test_that("bias_quantile(): predictions must be increasing", { - predicted <- c(1, 2, 4, 3) - quantiles <- c(0.1, 0.3, 0.5, 0.9) - - expect_error( - bias_quantile(observed = 3, predicted, quantiles), - "predictions must be increasing" - ) - expect_silent(bias_quantile( observed = 3, 1:4, quantiles)) -}) - -test_that("bias_quantile(): quantiles must be unique", { - predicted <- 1:4 - - # Failing example - quantiles <- c(0.3, 0.3, 0.5, 0.8) - expect_error(bias_quantile(observed = 3, predicted, quantiles), - "quantiles must be increasing") - - # Passing example - quantiles <- c(0.3, 0.5, 0.8, 0.9) - expect_silent(bias_quantile(observed = 3, predicted, quantiles)) -}) - -test_that("bias_sample() approx equals bias_quantile() for many samples", { - set.seed(123) - - # Generate true value - observed <- 3 - - # Generate many sample predictions - predicted <- sample(rnorm(1000, mean = observed, sd = 2), 1000) - - # Get sample based bias - bias_sample_result <- bias_sample( - observed, matrix(predicted, nrow = 1) - ) - - # Convert predictions to quantiles - quantiles <- seq(0, 1, length.out = 100) - quantile_preds <- quantile(predicted, probs = quantiles) - - # Get quantile based bias - bias_quantile_result <- suppressMessages( - bias_quantile(observed, quantile_preds, quantiles) - ) - - # Difference should be small - expect_equal(bias_quantile_result, bias_sample_result, tolerance = 0.1) -}) - -test_that("bias_quantile() and bias_range() give the same result", { - predicted <- sort(rnorm(23)) - lower <- rev(predicted[1:12]) - upper <- predicted[12:23] - - range <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98) - quantiles <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) - observed <- rnorm(1) - - range_bias <- bias_range( - lower = lower, upper = upper, - range = range, observed = observed - ) - range_quantile <- bias_quantile( - observed = observed, - predicted = predicted, - quantile = quantiles - ) - expect_equal(range_bias, range_quantile) -}) - -test_that("bias_range() works with point forecasts", { - predicted <- 1 - observed <- 1 - range <- c(0) - - expect_equal(bias_range(predicted, predicted, range, observed), 0) -}) - 
-test_that("bias_range(): ranges must be between 0 and 100", { - lower <- 4:1 - upper <- 5:8 - - # Failing example - range <- c(-10, 0, 10, 20) - expect_error( - bias_range(lower, upper, range, observed = 3), - "range must be between 0 and 100" - ) - - # Passing counter example - range <- c(0, 10, 20, 30) - expect_silent(bias_range(lower, upper, range, observed = 3)) -}) - diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R index ecfc86653..98d784cd4 100644 --- a/tests/testthat/test-convenience-functions.R +++ b/tests/testthat/test-convenience-functions.R @@ -38,46 +38,55 @@ test_that("function transform_forecasts works", { }) +# ============================================================================ # +# `set_forecast_unit()` +# ============================================================================ # test_that("function set_forecast_unit() works", { - # some columns in the example data have duplicated information. So we can remove # these and see whether the result stays the same. - - scores1 <- suppressMessages(score(example_quantile)) - scores1 <- scores1[order(location, target_end_date, target_type, horizon, model), ] + scores1 <- scores_quantile[order(location, target_end_date, target_type, horizon, model), ] ex2 <- set_forecast_unit( example_quantile, c("location", "target_end_date", "target_type", "horizon", "model") ) - scores2 <- suppressMessages(score(ex2)) + scores2 <- score(ex2) scores2 <- scores2[order(location, target_end_date, target_type, horizon, model), ] expect_equal(scores1$interval_score, scores2$interval_score) }) +test_that("set_forecast_unit() works on input that's not a data.table", { + df <- data.frame( + a = 1:2, + b = 2:3, + c = 3:4 + ) + expect_equal( + colnames(set_forecast_unit(df, c("a", "b"))), + c("a", "b") + ) + # apparently it also works on a matrix... 
good to know :)
+  expect_equal(
+    names(set_forecast_unit(as.matrix(df), "a")),
+    "a"
+  )
+})
 
-test_that("function set_forecast_unit() gives warning when column is not there", {
+test_that("function set_forecast_unit() gives warning when column is not there", {
   expect_warning(
     set_forecast_unit(
       example_quantile,
-      c("location", "target_end_date", "target_type", "horizon", "model", "test")
+      c("location", "target_end_date", "target_type", "horizon", "model", "test1", "test2")
     )
   )
 })
 
-
 test_that("function get_forecast_unit() and set_forecast_unit() work together", {
-
-  fu_set <- c("location", "target_end_date", "target_type", "horizon", "model")
-
-  ex <- set_forecast_unit(
-    example_binary,
-    fu_set
-  )
-
+  fu_set <- c("location", "target_end_date", "target_type", "horizon", "model")
+  ex <- set_forecast_unit(example_binary, fu_set)
   fu_get <- get_forecast_unit(ex)
   expect_equal(fu_set, fu_get)
 })
diff --git a/tests/testthat/test-get_-functions.R b/tests/testthat/test-get_-functions.R
index 217e954bd..0a499b243 100644
--- a/tests/testthat/test-get_-functions.R
+++ b/tests/testthat/test-get_-functions.R
@@ -70,3 +70,42 @@ test_that("get_type() handles `NA` values", {
   expect_equal(get_type(c(1, NA, 3.2)), "continuous")
   expect_error(get_type(NA), "Can't get type: all values of are NA")
 })
+
+
+# `get_duplicate_forecasts()` ==================================================
+test_that("get_duplicate_forecasts() works as expected for quantile", {
+  expect_equal(nrow(get_duplicate_forecasts(example_quantile)), 0)
+  expect_equal(
+    nrow(
+      get_duplicate_forecasts(rbind(example_quantile, example_quantile[1000:1010]))),
+    22
+  )
+})
+
+test_that("get_duplicate_forecasts() works as expected for sample", {
+  expect_equal(nrow(get_duplicate_forecasts(example_continuous)), 0)
+  expect_equal(
+    nrow(
+      get_duplicate_forecasts(rbind(example_continuous, example_continuous[1040:1050]))),
+    22
+  )
+})
+
+
+test_that("get_duplicate_forecasts() works as expected for binary", {
+  expect_equal(nrow(get_duplicate_forecasts(example_binary)), 0)
+  expect_equal(
+    nrow(
+      get_duplicate_forecasts(rbind(example_binary, example_binary[1000:1010]))),
+    22
+  )
+})
+
+test_that("get_duplicate_forecasts() works as expected for point", {
+  expect_equal(nrow(get_duplicate_forecasts(example_point)), 0)
+  expect_equal(
+    nrow(
+      get_duplicate_forecasts(rbind(example_point, example_point[1010:1020]))),
+    22
+  )
+})
diff --git a/tests/testthat/test-get_duplicate_forecasts.R b/tests/testthat/test-get_duplicate_forecasts.R
deleted file mode 100644
index 5487ea0fb..000000000
--- a/tests/testthat/test-get_duplicate_forecasts.R
+++ /dev/null
@@ -1,37 +0,0 @@
-test_that("get_duplicate_forecasts() works as expected for quantile", {
-  expect_equal(nrow(get_duplicate_forecasts(example_quantile)), 0)
-  expect_equal(
-    nrow(
-      get_duplicate_forecasts(rbind(example_quantile, example_quantile[1000:1010]))),
-    22
-  )
-})
-
-test_that("get_duplicate_forecasts() works as expected for sample", {
-  expect_equal(nrow(get_duplicate_forecasts(example_continuous)), 0)
-  expect_equal(
-    nrow(
-      get_duplicate_forecasts(rbind(example_continuous, example_continuous[1040:1050]))),
-    22
-  )
-})
-
-
-test_that("get_duplicate_forecasts() works as expected for binary", {
-  expect_equal(nrow(get_duplicate_forecasts(example_binary)), 0)
-  expect_equal(
-    nrow(
-      get_duplicate_forecasts(rbind(example_binary, example_binary[1000:1010]))),
-    22
-  )
-})
-
-test_that("get_duplicate_forecasts() works as expected for point", {
-  expect_equal(nrow(get_duplicate_forecasts(example_binary)), 0)
-  expect_equal(
-    nrow(
-
get_duplicate_forecasts(rbind(example_point, example_point[1010:1020]))), - 22 - ) -}) - diff --git a/tests/testthat/test-interval_score.R b/tests/testthat/test-interval_score.R deleted file mode 100644 index 5f156334b..000000000 --- a/tests/testthat/test-interval_score.R +++ /dev/null @@ -1,428 +0,0 @@ -test_that("wis works, median only", { - y <- c(1, -15, 22) - lower <- upper <- c(1, 2, 3) - quantile_probs <- 0.5 - - actual <- interval_score(y, - lower = lower, upper = upper, - weigh = TRUE, - interval_range = 0 - ) - expected <- abs(y - lower) - - expect_identical(actual, expected) -}) - -test_that("WIS works within score for median forecast", { - test_data <- data.frame( - observed = c(1, -15, 22), - predicted = 1:3, - quantile = rep(c(0.5), each = 3), - model = "model1", - date = 1:3 - ) - eval <- scoringutils::score(test_data, - count_median_twice = TRUE - ) - expect_equal(eval$ae_median, eval$interval_score) -}) - -test_that("wis works, 1 interval only", { - y <- c(1, -15, 22) - lower <- c(0, 1, 0) - upper <- c(2, 2, 3) - quantile_probs <- c(0.25, 0.75) - - alpha <- 0.5 - - actual <- scoringutils::interval_score(y, - lower = lower, upper = upper, - weigh = TRUE, - interval_range = 50 - ) - expected <- (upper - lower) * (alpha / 2) + c(0, 1 - (-15), 22 - 3) - - expect_identical(actual, expected) -}) - -test_that("WIS works within score for one interval", { - test_data <- data.frame( - observed = rep(c(1, -15, 22), times = 2), - quantile = rep(c(0.25, 0.75), each = 3), - predicted = c(c(0, 1, 0), c(2, 2, 3)), - model = c("model1"), - date = rep(1:3, times = 2) - ) - - eval <- suppressMessages(scoringutils::score(test_data, - count_median_twice = TRUE - )) - - eval <- summarise_scores(eval, by = c("model", "date")) - - lower <- c(0, 1, 0) - upper <- c(2, 2, 3) - alpha <- 0.5 - - expected <- (upper - lower) * (alpha / 2) + c(0, 1 - (-15), 22 - 3) - - expect_equal(expected, eval$interval_score) -}) - -test_that("wis works, 1 interval and median", { - test_data <- data.frame( - observed = rep(c(1, -15, 22), times = 3), - quantile = rep(c(0.25, 0.5, 0.75), each = 3), - predicted = c(c(0, 1, 0), c(1, 2, 3), c(2, 2, 3)), - model = c("model1"), - date = rep(1:3, times = 3) - ) - - eval <- scoringutils::score(test_data, - count_median_twice = TRUE - ) - - eval <- summarise_scores(eval, by = c("model", "date")) - - y <- c(1, -15, 22) - quantile <- rbind(c(0, 1, 2), c(1, 2, 2), c(0, 3, 3)) - quantile_probs <- c(0.25, 0.5, 0.75) - - alpha <- 0.5 - - expected <- 0.5 * ( - abs(y - quantile[, 2]) + - (quantile[, 3] - quantile[, 1]) * (alpha / 2) + c(0, 1 - (-15), 22 - 3) - ) - - expect_identical(eval$interval_score, expected) -}) - -test_that("wis works, 2 intervals and median", { - test_data <- data.frame( - observed = rep(c(1, -15, 22), times = 5), - quantile = rep(c(0.1, 0.25, 0.5, 0.75, 0.9), each = 3), - predicted = c( - c(-1, -2, -2), c(0, 1, 0), c(1, 2, 3), - c(2, 2, 3), c(3, 4, 4) - ), - model = c("model1"), - date = rep(1:3, times = 5) - ) - - eval <- scoringutils::score(test_data, - count_median_twice = TRUE - ) - - eval <- summarise_scores(eval, by = c("model", "date")) - - y <- c(1, -15, 22) - quantile <- rbind(c(-1, 0, 1, 2, 3), c(-2, 1, 2, 2, 4), c(-2, 0, 3, 3, 4)) - quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) - - alpha1 <- 0.2 - alpha2 <- 0.5 - - expected <- (1 / 3) * ( - abs(y - quantile[, 3]) + - (quantile[, 5] - quantile[, 1]) * (alpha1 / 2) + c(0, (-2) - (-15), 22 - 4) + - (quantile[, 4] - quantile[, 2]) * (alpha2 / 2) + c(0, 1 - (-15), 22 - 3) - ) - - expect_equal( - 
as.numeric(eval$interval_score), - as.numeric(expected) - ) -}) - -# additional tests from the covidhubutils repo -test_that("wis is correct, median only - test corresponds to covidHubUtils", { - y <- c(1, -15, 22) - forecast_quantiles_matrix <- rbind( - c(-1, 0, 1, 2, 3), - c(-2, 1, 2, 2, 4), - c(-2, 0, 3, 3, 4) - ) - forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) - forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE] - forecast_quantile_probs <- forecast_quantile_probs[3] - - target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) - horizons <- c("1", "2", "1") - locations <- c("01", "01", "02") - target_variables <- rep("inc death", length(y)) - - forecast_target_end_dates <- - rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) - forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) - forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) - forecast_target_variables <- - rep(target_variables, times = ncol(forecast_quantiles_matrix)) - forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) - forecast_quantiles <- forecast_quantiles_matrix - dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) - - test_truth <- data.frame( - model = rep("truth_source", length(y)), - target_variable = target_variables, - target_end_date = target_end_dates, - location = locations, - value = y, - stringsAsFactors = FALSE - ) - - n_forecasts <- length(forecast_quantiles) - test_forecasts <- data.frame( - model = rep("m1", n_forecasts), - forecast_date = rep(as.Date("2020-01-01"), n_forecasts), - location = forecast_locations, - horizon = forecast_horizons, - temporal_resolution = rep("wk", n_forecasts), - target_variable = forecast_target_variables, - target_end_date = forecast_target_end_dates, - type = rep("quantile", n_forecasts), - quantile = forecast_quantile_probs, - value = forecast_quantiles, - stringsAsFactors = FALSE - ) - - # make a version that conforms to scoringutils format - truth_formatted <- data.table::as.data.table(test_truth) - truth_formatted[, `:=`(model = NULL)] - data.table::setnames(truth_formatted, old = "value", new = "observed") - - forecasts_formated <- data.table::as.data.table(test_forecasts) - data.table::setnames(forecasts_formated, old = "value", new = "predicted") - - data_formatted <- merge(forecasts_formated, truth_formatted) - - eval <- scoringutils::score(data_formatted, - count_median_twice = FALSE - ) - - expected <- abs(y - forecast_quantiles_matrix[, 1]) - - expect_equal(eval$interval_score, expected) -}) - - -test_that("wis is correct, 1 interval only - test corresponds to covidHubUtils", { - y <- c(1, -15, 22) - forecast_quantiles_matrix <- rbind( - c(-1, 0, 1, 2, 3), - c(-2, 1, 2, 2, 4), - c(-2, 0, 3, 3, 4) - ) - forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) - forecast_quantiles_matrix <- forecast_quantiles_matrix[, c(1, 5), drop = FALSE] - forecast_quantile_probs <- forecast_quantile_probs[c(1, 5)] - - target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) - horizons <- c("1", "2", "1") - locations <- c("01", "01", "02") - target_variables <- rep("inc death", length(y)) - - forecast_target_end_dates <- - rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) - forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) - forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) - forecast_target_variables <- - rep(target_variables, times = ncol(forecast_quantiles_matrix)) - forecast_quantile_probs <- 
rep(forecast_quantile_probs, each = length(y)) - forecast_quantiles <- forecast_quantiles_matrix - dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) - - test_truth <- data.frame( - model = rep("truth_source", length(y)), - target_variable = target_variables, - target_end_date = target_end_dates, - location = locations, - value = y, - stringsAsFactors = FALSE - ) - - n_forecasts <- length(forecast_quantiles) - test_forecasts <- data.frame( - model = rep("m1", n_forecasts), - forecast_date = rep(as.Date("2020-01-01"), n_forecasts), - location = forecast_locations, - horizon = forecast_horizons, - temporal_resolution = rep("wk", n_forecasts), - target_variable = forecast_target_variables, - target_end_date = forecast_target_end_dates, - type = rep("quantile", n_forecasts), - quantile = forecast_quantile_probs, - value = forecast_quantiles, - stringsAsFactors = FALSE - ) - - # make a version that conforms to scoringutils format - truth_formatted <- data.table::as.data.table(test_truth) - truth_formatted[, `:=`(model = NULL)] - data.table::setnames(truth_formatted, old = "value", new = "observed") - - forecasts_formated <- data.table::as.data.table(test_forecasts) - data.table::setnames(forecasts_formated, old = "value", new = "predicted") - - data_formatted <- merge(forecasts_formated, truth_formatted) - - eval <- suppressMessages(scoringutils::score(data_formatted, - count_median_twice = FALSE - )) - - eval <- summarise_scores(eval, - by = c( - "model", "location", "target_variable", - "target_end_date", "forecast_date", "horizon" - ) - ) - - alpha1 <- 0.2 - expected <- (forecast_quantiles_matrix[, 2] - forecast_quantiles_matrix[, 1]) * (alpha1 / 2) + - c(0, (-2) - (-15), 22 - 4) - - expect_equal(eval$interval_score, expected) -}) - - -test_that("wis is correct, 2 intervals and median - test corresponds to covidHubUtils", { - y <- c(1, -15, 22) - forecast_quantiles_matrix <- rbind( - c(-1, 0, 1, 2, 3), - c(-2, 1, 2, 2, 4), - c(-2, 0, 3, 3, 4) - ) - forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) - - target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) - horizons <- c("1", "2", "1") - locations <- c("01", "01", "02") - target_variables <- rep("inc death", length(y)) - - forecast_target_end_dates <- - rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) - forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) - forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) - forecast_target_variables <- - rep(target_variables, times = ncol(forecast_quantiles_matrix)) - forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) - forecast_quantiles <- forecast_quantiles_matrix - dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) - - test_truth <- data.frame( - model = rep("truth_source", length(y)), - target_variable = target_variables, - target_end_date = target_end_dates, - location = locations, - value = y, - stringsAsFactors = FALSE - ) - - n_forecasts <- length(forecast_quantiles) - test_forecasts <- data.frame( - model = rep("m1", n_forecasts), - forecast_date = rep(as.Date("2020-01-01"), n_forecasts), - location = forecast_locations, - horizon = forecast_horizons, - temporal_resolution = rep("wk", n_forecasts), - target_variable = forecast_target_variables, - target_end_date = forecast_target_end_dates, - type = rep("quantile", n_forecasts), - quantile = forecast_quantile_probs, - value = forecast_quantiles, - stringsAsFactors = FALSE - ) - - # make a version that conforms to scoringutils format - 
truth_formatted <- data.table::as.data.table(test_truth) - truth_formatted[, `:=`(model = NULL)] - data.table::setnames(truth_formatted, old = "value", new = "observed") - - forecasts_formated <- data.table::as.data.table(test_forecasts) - data.table::setnames(forecasts_formated, old = "value", new = "predicted") - - data_formatted <- merge(forecasts_formated, truth_formatted) - - eval <- scoringutils::score(data_formatted, - count_median_twice = FALSE - ) - - eval <- summarise_scores(eval, - by = c( - "model", "location", "target_variable", - "target_end_date", "forecast_date", "horizon" - ) - ) - - alpha1 <- 0.2 - alpha2 <- 0.5 - expected <- (1 / 2.5) * ( - 0.5 * abs(y - forecast_quantiles_matrix[, 3]) + - (forecast_quantiles_matrix[, 5] - forecast_quantiles_matrix[, 1]) * (alpha1 / 2) + c(0, (-2) - (-15), 22 - 4) + - (forecast_quantiles_matrix[, 4] - forecast_quantiles_matrix[, 2]) * (alpha2 / 2) + c(0, 1 - (-15), 22 - 3) - ) - - expect_equal(eval$interval_score, expected) -}) - -test_that("Quantlie score and interval score yield the same result, weigh = FALSE", { - observed <- rnorm(10, mean = 1:10) - alphas <- c(0.1, 0.5, 0.9) - - for (alpha in alphas) { - lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10)) - upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 1:10)) - - w <- FALSE - is <- interval_score( - observed = observed, - lower = lower, - upper = upper, - interval_range = (1 - alpha) * 100, - weigh = w - ) - - qs_lower <- quantile_score(observed, - predicted = lower, - quantile = alpha / 2, - weigh = w - ) - qs_upper <- quantile_score(observed, - predicted = upper, - quantile = 1 - alpha / 2, - weigh = w - ) - expect_equal((qs_lower + qs_upper) / 2, is) - } -}) - -test_that("Quantlie score and interval score yield the same result, weigh = TRUE", { - observed <- rnorm(10, mean = 1:10) - alphas <- c(0.1, 0.5, 0.9) - - for (alpha in alphas) { - lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10)) - upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 1:10)) - - w <- TRUE - is <- interval_score( - observed = observed, - lower = lower, - upper = upper, - interval_range = (1 - alpha) * 100, - weigh = w - ) - - qs_lower <- quantile_score(observed, - predicted = lower, - quantile = alpha / 2, - weigh = w - ) - qs_upper <- quantile_score(observed, - predicted = upper, - quantile = 1 - alpha / 2, - weigh = w - ) - expect_equal((qs_lower + qs_upper) / 2, is) - } -}) diff --git a/tests/testthat/test-lower-level-check-functions.R b/tests/testthat/test-metrics-binary.R similarity index 52% rename from tests/testthat/test-lower-level-check-functions.R rename to tests/testthat/test-metrics-binary.R index 8d73aa528..9c49e7050 100644 --- a/tests/testthat/test-lower-level-check-functions.R +++ b/tests/testthat/test-metrics-binary.R @@ -1,45 +1,13 @@ -test_that("Lower-level input check functions work", { - observed <- rpois(30, lambda = 1:30) - predicted <- replicate(20, rpois(n = 30, lambda = 1:30)) - expect_equal(length(crps_sample(observed, predicted)), 30) - - # should error when wrong prediction type is given - predicted2 <- rpois(30, lambda = 1) - expect_error(crps_sample(observed, predicted2), - "Assertion on 'predicted' failed: Must be of type 'matrix', not 'integer'", - fixed = TRUE - ) - - # predictions have wrong number of rows - predicted3 <- replicate(20, rpois(n = 31, lambda = 1)) - expect_error( - crps_sample(observed, predicted3), - "Assertion on 'predicted' failed: Must have exactly 30 rows, but has 31 rows.", - # "Mismatch: 'observed' has length `30`, but 'predicted' has `31` rows.", - 
fixed = TRUE - ) - - # error with missing argument - expect_error(crps_sample(predicted = predicted), - 'argument "observed" is missing, with no default', - fixed = TRUE - ) - - # checks work for binary forecasts - observed <- factor(sample(c(0, 1), size = 10, replace = TRUE)) - predicted <- runif(n = 10) - expect_equal(length(brier_score(observed, predicted)), 10) - - # predictions are not between 0 and 1 - predicted2 <- predicted + 2 - expect_error( - brier_score(observed, predicted2), - "Assertion on 'predicted' failed: Element 1 is not <= 1.", - fixed = TRUE - ) -}) - - +observed <- factor(rbinom(10, size = 1, prob = 0.5)) +predicted <- c(0.425, 0.55, 0.541, 0.52, 0.13, 0.469, 0.86, 0.22, 0.74, 0.9) +df <- data.table( + observed = observed, + predicted = predicted, + model = "m1", + id = 1:10 +) + +# test input handling test_that("function throws an error when missing observed or predicted", { observed <- sample(c(0, 1), size = 10, replace = TRUE) predicted <- replicate( @@ -83,17 +51,6 @@ test_that("function throws an error for wrong format of `observed`", { }) test_that("function throws an error for wrong format of predictions", { - observed <- factor(sample(c(0, 1), size = 10, replace = TRUE)) - predicted <- runif(10, min = 0, max = 3) - expect_error( - brier_score( - observed = observed, - predicted = predicted - ), - #"For a binary forecast, all predictions should be probabilities between 0 or 1." - "Assertion on 'predicted' failed: Element 1 is not <= 1." - ) - predicted <- runif(10, min = 0, max = 1) expect_error( brier_score( @@ -114,3 +71,59 @@ test_that("function throws an error for wrong format of predictions", { fixed = TRUE ) }) + +test_that("Input checking for binary forecasts works", { + # everything correct + expect_no_condition( + scoringutils:::assert_input_binary(observed, predicted) + ) + + # predicted > 1 + expect_error( + scoringutils:::assert_input_binary(observed, predicted + 1), + "Assertion on 'predicted' failed: Element 1 is not <= 1." + ) + + # predicted < 0 + expect_error( + scoringutils:::assert_input_binary(observed, predicted - 1), + "Assertion on 'predicted' failed: Element 1 is not >= 0." + ) + + # observed value not factor + expect_error( + scoringutils:::assert_input_binary(1:10, predicted), + "Assertion on 'observed' failed: Must be of type 'factor', not 'integer'." + ) + + # observed value has not 2 levels + expect_error( + scoringutils:::assert_input_binary(factor(1:10), predicted), + "Assertion on 'observed' failed: Must have exactly 2 levels." + ) + + # observed is a single number and does not have the same length as predicted + expect_error( + scoringutils:::assert_input_binary(factor(1), predicted), + "`observed` and `predicted` need to be of same length when scoring binary forecasts", + fixed = TRUE + ) + + # predicted is a matrix + expect_error( + scoringutils:::assert_input_binary(observed, matrix(predicted)), + "Assertion on 'predicted' failed: Must be of type 'atomic vector', not 'matrix'." 
+  )
+})
+
+test_that("Binary metrics work within and outside of `score()`", {
+  result <- score(df)
+  expect_equal(
+    brier_score(observed, predicted),
+    result$brier_score
+  )
+  expect_equal(
+    logs_binary(observed, predicted),
+    result$log_score
+  )
+})
diff --git a/tests/testthat/test-absolute_error.R b/tests/testthat/test-metrics-point.R
similarity index 82%
rename from tests/testthat/test-absolute_error.R
rename to tests/testthat/test-metrics-point.R
index f61493b25..2f64226df 100644
--- a/tests/testthat/test-absolute_error.R
+++ b/tests/testthat/test-metrics-point.R
@@ -1,25 +1,20 @@
-test_that("absolute error (sample based) works", {
-  observed <- rnorm(30, mean = 1:30)
-  predicted_values <- rnorm(30, mean = 1:30)
-
-  scoringutils <- ae_median_sample(observed, predicted_values)
-
-  ae <- abs(observed - predicted_values)
-  expect_equal(ae, scoringutils)
-})
-
-
-# covidHubUtils-tests
+# covidHubUtils-tests on absolute error ========================================
+# tests are adapted from the package
+# covidHubUtils, https://github.com/reichlab/covidHubUtils/
+y <- c(1, -15, 22)
+forecast_quantiles_matrix <- rbind(
+  c(-1, 0, 1, 2, 3),
+  c(-2, 1, 2, 2, 4),
+  c(-2, 0, 3, 3, 4)
+)
+forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9)
+
+target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7)
+horizons <- c("1", "2", "1")
+locations <- c("01", "01", "02")
+target_variables <- rep("inc death", length(y))
 
 test_that("abs error is correct within score, point forecast only", {
-  # test is adapted from the package covidHubUtils, https://github.com/reichlab/covidHubUtils/
-  y <- c(1, -15, 22)
-
-  target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7)
-  horizons <- c("1", "2", "1")
-  locations <- c("01", "01", "02")
-  target_variables <- rep("inc death", length(y))
-
   forecast_target_end_dates <-
     rep(target_end_dates, times = 1)
   forecast_horizons <- rep(horizons, times = 1)
@@ -73,21 +68,9 @@
 })
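The deleted sample-based check above still conveys the underlying definition: the absolute error of the median is the distance between the observed value and the median of the predictive samples. A minimal sketch, assuming `ae_median_sample()` accepts an n x N sample matrix and summarises each row by its median (as the other `_sample` metrics do):

```r
library(scoringutils)

observed <- rnorm(30, mean = 1:30)
# 30 forecasts (rows) with 20 predictive samples each (columns)
predicted <- matrix(rnorm(30 * 20, mean = 1:30), nrow = 30)

# assumption: row-wise median of the samples, then absolute error
all.equal(
  ae_median_sample(observed, predicted),
  abs(observed - apply(predicted, 1, median))
)
```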
 
 test_that("abs error is correct, point and median forecasts different", {
-  y <- c(1, -15, 22)
-  forecast_quantiles_matrix <- rbind(
-    c(-1, 0, 1, 2, 3),
-    c(-2, 1, 2, 2, 4),
-    c(-2, 0, 3, 3, 4)
-  )
-  forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9)
   forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE]
   forecast_quantile_probs <- forecast_quantile_probs[3]
 
-  target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7)
-  horizons <- c("1", "2", "1")
-  locations <- c("01", "01", "02")
-  target_variables <- rep("inc death", length(y))
-
   forecast_target_end_dates <-
     rep(target_end_dates, times = 1 + ncol(forecast_quantiles_matrix))
   forecast_horizons <- rep(horizons, times = 1 + ncol(forecast_quantiles_matrix))
@@ -145,21 +128,9 @@
 })
 
 test_that("abs error is correct, point and median forecasts same", {
-  y <- c(1, -15, 22)
-  forecast_quantiles_matrix <- rbind(
-    c(-1, 0, 1, 2, 3),
-    c(-2, 1, 2, 2, 4),
-    c(-2, 0, 3, 3, 4)
-  )
-  forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9)
   forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE]
   forecast_quantile_probs <- forecast_quantile_probs[3]
 
   forecast_target_end_dates <-
     rep(target_end_dates, times = 1 + ncol(forecast_quantiles_matrix))
   forecast_horizons <- rep(horizons, times = 1 + ncol(forecast_quantiles_matrix))
@@ -196,7 +167,6 @@
     stringsAsFactors = FALSE
   )
 
-  # bring in scoringutils format
   truth_scoringutils <- data.table::as.data.table(test_truth)
   fc_scoringutils <- data.table::as.data.table(test_forecasts)
 
@@ -221,7 +191,5 @@
   )
 
   expected <- abs(y - point_forecast)
-
-  # expect_equal(actual$abs_error, expected)
   expect_equal(eval$ae_point, expected)
 })
diff --git a/tests/testthat/test-metrics-quantile.R b/tests/testthat/test-metrics-quantile.R
new file mode 100644
index 000000000..8dd6d6a22
--- /dev/null
+++ b/tests/testthat/test-metrics-quantile.R
@@ -0,0 +1,793 @@
+observed <- c(1, -15, 22)
+predicted <- rbind(
+  c(-1, 0, 1, 2, 3),
+  c(-2, 1, 2, 2, 4),
+  c(-2, 0, 3, 3, 4)
+)
+quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9)
+
+# covidHubUtils test:
+y <- c(1, -15, 22)
+forecast_quantiles_matrix <- rbind(
+  c(-1, 0, 1, 2, 3),
+  c(-2, 1, 2, 2, 4),
+  c(-2, 0, 3, 3, 4)
+)
+forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9)
+
+
+# ============================================================================ #
+# Input handling ===============================================================
+# ============================================================================ #
+test_that("Input checking for quantile forecasts works", {
+  # everything correct
+  expect_no_condition(
+    scoringutils:::assert_input_quantile(observed, predicted, quantile)
+  )
+
+  # quantile > 1
+  expect_error(
+    scoringutils:::assert_input_quantile(observed, predicted, quantile + 1),
+    "Assertion on 'quantile' failed: Element 1 is not <= 1."
+  )
+
+  # quantile < 0
+  expect_error(
+    scoringutils:::assert_input_quantile(observed, predicted, quantile - 1),
+    "Assertion on 'quantile' failed: Element 1 is not >= 0."
+  )
+
+  # 10 observations, but only 3 forecasts
+  expect_error(
+    scoringutils:::assert_input_quantile(1:10, predicted, quantile),
+    "Assertion on 'predicted' failed: Must have exactly 10 rows, but has 3 rows."
+  )
+
+  # observed value is a factor
+  expect_error(
+    scoringutils:::assert_input_quantile(factor(1:10), predicted, quantile),
+    "Assertion on 'observed' failed: Must be of type 'numeric', not 'factor'."
+  )
+
+  # observed is a single number and does not have the same length as predicted
+  # There seems to be an issue with the error message: there is one \n too many
+  # such that the test fails when executed alone, but works when executed
+  # together with others.
+  expect_error(
+    scoringutils:::assert_input_quantile(1, predicted, quantile),
+    "Assertion failed. One of the following must apply:\n * check_numeric_vector(predicted): Must be of type 'atomic vector',\n * not 'matrix'\n * check_matrix(predicted): Must have exactly 1 rows, but has 3 rows",
+    fixed = TRUE
+  )
+
+  # predicted is a vector
+  expect_error(
+    scoringutils:::assert_input_quantile(observed, as.vector(predicted), quantile),
+    "Assertion on 'predicted' failed: Must be of type 'matrix', not 'double'."
+  )
+})
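These assertions encode the input convention documented in `man/wis.Rd` above: `predicted` is an n x N matrix with one row per observed value, and only a single observation may instead be paired with a plain vector of quantiles. A short sketch:

```r
library(scoringutils)

observed <- c(1, -15, 22)
predicted <- rbind( # 3 forecasts (rows) x 5 quantile levels (columns)
  c(-1, 0, 1, 2, 3),
  c(-2, 1, 2, 2, 4),
  c(-2, 0, 3, 3, 4)
)
quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9)

wis(observed, predicted, quantile) # one WIS value per forecast

# a single observation may use a vector of length N instead of a 1 x N matrix
wis(observed = 1, predicted = c(-1, 0, 1, 2, 3), quantile = quantile)
```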
+
+
+# ============================================================================ #
+# wis ==========================================================================
+# ============================================================================ #
+test_that("wis works standalone, median only", {
+  y <- c(1, -15, 22)
+  lower <- upper <- predicted_quantile <- c(1, 2, 3)
+  quantile_probs <- 0.5
+
+  actual <- interval_score(y,
+    lower = lower, upper = upper,
+    weigh = TRUE,
+    interval_range = 0
+  )
+
+  actual_wis <- wis(
+    observed = y,
+    predicted = matrix(predicted_quantile),
+    quantile = quantile_probs
+  )
+
+  expected <- abs(y - lower)
+
+  expect_identical(actual, expected)
+  expect_identical(actual_wis, expected)
+})
+
+test_that("`wis()` works within score for median forecast", {
+  test_data <- data.frame(
+    observed = c(1, -15, 22),
+    predicted = 1:3,
+    quantile = rep(c(0.5), each = 3),
+    model = "model1",
+    date = 1:3
+  )
+  eval <- score(
+    test_data,
+    count_median_twice = TRUE, metrics = metrics_no_cov
+  )
+  expect_equal(eval$ae_median, eval$wis)
+})
+
+test_that("`wis()` equals `interval_score()`, 1 interval only", {
+  y <- c(1, -15, 22)
+  lower <- c(0, 1, 0)
+  upper <- c(2, 2, 3)
+  quantile_probs <- c(0.25, 0.75)
+  predicted <- matrix(c(lower, upper), ncol = 2)
+
+  alpha <- 0.5
+  expected <- (upper - lower) * (alpha / 2) + c(0, 1 - (-15), 22 - 3)
+
+  actual <- interval_score(
+    y,
+    lower = lower, upper = upper,
+    weigh = TRUE,
+    interval_range = 50
+  )
+
+  actual_wis <- wis(
+    observed = y,
+    predicted = predicted,
+    quantile = quantile_probs
+  )
+
+  expect_identical(actual, expected)
+  expect_identical(actual_wis, expected)
+})
+
+test_that("wis() works within score for one interval", {
+  test_data <- data.frame(
+    observed = rep(c(1, -15, 22), times = 2),
+    quantile = rep(c(0.25, 0.75), each = 3),
+    predicted = c(c(0, 1, 0), c(2, 2, 3)),
+    model = c("model1"),
+    date = rep(1:3, times = 2)
+  )
+
+  eval <- score(
+    test_data,
+    count_median_twice = TRUE, metrics = list(wis = wis)
+  )
+
+  eval <- summarise_scores(eval, by = c("model", "date"))
+
+  lower <- c(0, 1, 0)
+  upper <- c(2, 2, 3)
+  alpha <- 0.5
+
+  expected <- (upper - lower) * (alpha / 2) + c(0, 1 - (-15), 22 - 3)
+
+  expect_equal(expected, eval$wis)
+})
+
+test_that("`wis()` works, 1 interval and median", {
+  test_data <- data.frame(
+    observed = rep(c(1, -15, 22), times = 3),
+    quantile = rep(c(0.25, 0.5, 0.75), each = 3),
+    predicted = c(c(0, 1, 0), c(1, 2, 3), c(2, 2, 3)),
+    model = c("model1"),
+    date = rep(1:3, times = 3)
+  )
+
+  eval <- score(
+    test_data,
+    count_median_twice = TRUE, metrics = metrics_no_cov
+  )
+
+  eval <- summarise_scores(eval, by = c("model", "date"))
+
+  y <- c(1, -15, 22)
+  quantile <- rbind(c(0, 1, 2), c(1, 2, 2), c(0, 3, 3))
+  quantile_probs <- c(0.25, 0.5, 0.75)
+
+  alpha <- 0.5
+  expected <- 0.5 * (
+    abs(y - quantile[, 2]) +
+      (quantile[, 3] - quantile[, 1]) * (alpha / 2) + c(0, 1 - (-15), 22 - 3)
+  )
+
+  actual_wis <- wis(
+    observed = y,
+    predicted = quantile,
+    quantile = quantile_probs,
+    count_median_twice = TRUE
+  )
+
+  expect_identical(eval$wis, expected)
+  expect_identical(actual_wis, expected)
+})
+
+test_that("wis works, 2 intervals and median", {
+  test_data <- data.frame(
+    observed = rep(c(1, -15, 22), times = 5),
+    quantile = rep(c(0.1, 0.25, 0.5, 0.75, 0.9), each = 3),
+    predicted = c(
+      c(-1, -2, -2), c(0, 1, 0), c(1, 2, 3),
+      c(2, 2, 3), c(3, 4, 4)
+    ),
+    model = c("model1"),
+    date = rep(1:3, times = 5)
+  )
+
+  eval <- score(
+    test_data,
+    count_median_twice = TRUE, metrics = metrics_no_cov
+  )
+
+  eval <- summarise_scores(eval, by = c("model", "date"))
+
+  quantile <- forecast_quantiles_matrix
+  quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9)
+
+  alpha1 <- 0.2
+  alpha2 <- 0.5
+
+  expected <- (1 / 3) * (
+    abs(y - quantile[, 3]) +
+      (quantile[, 5] - quantile[, 1]) * (alpha1 / 2) + c(0, (-2) - (-15), 22 - 4) +
+      (quantile[, 4] - quantile[, 2]) * (alpha2 / 2) + c(0, 1 - (-15), 22 - 3)
+  )
+
+  actual_wis <- wis(
+    observed = y,
+    predicted = quantile,
+    quantile = quantile_probs,
+    count_median_twice = TRUE
+  )
+
+  expect_equal(
+    as.numeric(eval$wis),
+    as.numeric(expected)
+  )
+  expect_identical(actual_wis, expected)
+})
+
+# additional tests from the covidHubUtils repo
+test_that("wis is correct, median only - test corresponds to covidHubUtils", {
+  forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE]
+  forecast_quantile_probs <- forecast_quantile_probs[3]
+
+  target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7)
+  horizons <- c("1", "2", "1")
+  locations <- c("01", "01", "02")
+  target_variables <- rep("inc death", length(y))
+
+  forecast_target_end_dates <-
+    rep(target_end_dates, times = ncol(forecast_quantiles_matrix))
+  forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix))
+  forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix))
+  forecast_target_variables <-
+    rep(target_variables, times = ncol(forecast_quantiles_matrix))
+  forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y))
+  forecast_quantiles <- forecast_quantiles_matrix
+  dim(forecast_quantiles) <- prod(dim(forecast_quantiles))
+
+  test_truth <- data.frame(
+    model = rep("truth_source", length(y)),
+    target_variable = target_variables,
+    target_end_date = target_end_dates,
+    location = locations,
+    value = y,
+    stringsAsFactors = FALSE
+  )
+
+  n_forecasts <- length(forecast_quantiles)
+  test_forecasts <- data.frame(
+    model = rep("m1", n_forecasts),
+    forecast_date = rep(as.Date("2020-01-01"), n_forecasts),
+    location = forecast_locations,
+    horizon = forecast_horizons,
+    temporal_resolution = rep("wk", n_forecasts),
+    target_variable = forecast_target_variables,
+    target_end_date = forecast_target_end_dates,
+    type = rep("quantile", n_forecasts),
+    quantile = forecast_quantile_probs,
+    value = forecast_quantiles,
+    stringsAsFactors = FALSE
+  )
+
+  # make a version that conforms to scoringutils format
+  truth_formatted <- data.table::as.data.table(test_truth)
+  truth_formatted[, `:=`(model = NULL)]
+  data.table::setnames(truth_formatted, old = "value", new = "observed")
+
+  forecasts_formatted <- data.table::as.data.table(test_forecasts)
+  data.table::setnames(forecasts_formatted, old = "value", new = "predicted")
+
+  data_formatted <- merge(forecasts_formatted, truth_formatted)
+
+  eval <- score(
+    data_formatted,
+    count_median_twice = FALSE, metrics = metrics_no_cov
+  )
+
+  expected <- abs(y - forecast_quantiles_matrix[, 1])
+
+  actual_wis <- wis(
+    observed = y,
+    predicted = matrix(forecast_quantiles_matrix),
+    quantile = 0.5,
+    count_median_twice = FALSE
+  )
+
+  expect_equal(eval$wis, expected)
+  expect_equal(actual_wis, expected)
+})
+
+test_that("wis is correct, 1 interval only - test corresponds to covidHubUtils", {
+  forecast_quantiles_matrix <- forecast_quantiles_matrix[, c(1, 5), drop = FALSE]
+  forecast_quantile_probs <- forecast_quantile_probs[c(1, 5)]
+
+  target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7)
+  horizons <- c("1", "2", "1")
+  locations <- c("01", "01", "02")
+  target_variables <- rep("inc death", length(y))
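+
+  # as in the test above: truth and forecasts are built as separate
+  # covidHubUtils-style data.frames and later merged into the scoringutils
+  # format with `observed` and `predicted` columns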
+
+  forecast_target_end_dates <-
+    rep(target_end_dates, times = ncol(forecast_quantiles_matrix))
+  forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix))
+  forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix))
+  forecast_target_variables <-
+    rep(target_variables, times = ncol(forecast_quantiles_matrix))
+  forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y))
+  forecast_quantiles <- forecast_quantiles_matrix
+  dim(forecast_quantiles) <- prod(dim(forecast_quantiles))
+
+  test_truth <- data.frame(
+    model = rep("truth_source", length(y)),
+    target_variable = target_variables,
+    target_end_date = target_end_dates,
+    location = locations,
+    value = y,
+    stringsAsFactors = FALSE
+  )
+
+  n_forecasts <- length(forecast_quantiles)
+  test_forecasts <- data.frame(
+    model = rep("m1", n_forecasts),
+    forecast_date = rep(as.Date("2020-01-01"), n_forecasts),
+    location = forecast_locations,
+    horizon = forecast_horizons,
+    temporal_resolution = rep("wk", n_forecasts),
+    target_variable = forecast_target_variables,
+    target_end_date = forecast_target_end_dates,
+    type = rep("quantile", n_forecasts),
+    quantile = forecast_quantile_probs,
+    value = forecast_quantiles,
+    stringsAsFactors = FALSE
+  )
+
+  # make a version that conforms to scoringutils format
+  truth_formatted <- data.table::as.data.table(test_truth)
+  truth_formatted[, `:=`(model = NULL)]
+  data.table::setnames(truth_formatted, old = "value", new = "observed")
+
+  forecasts_formatted <- data.table::as.data.table(test_forecasts)
+  data.table::setnames(forecasts_formatted, old = "value", new = "predicted")
+
+  data_formatted <- merge(forecasts_formatted, truth_formatted)
+
+  eval <- suppressMessages(score(data_formatted,
+    count_median_twice = FALSE, metrics = metrics_no_cov_no_ae
+  ))
+
+  eval <- summarise_scores(eval,
+    by = c(
+      "model", "location", "target_variable",
+      "target_end_date", "forecast_date", "horizon"
+    )
+  )
+
+  alpha1 <- 0.2
+  expected <- (forecast_quantiles_matrix[, 2] - forecast_quantiles_matrix[, 1]) * (alpha1 / 2) +
+    c(0, (-2) - (-15), 22 - 4)
+
+  actual_wis <- wis(
+    observed = y,
+    predicted = forecast_quantiles_matrix,
+    quantile = c(0.1, 0.9),
+    count_median_twice = FALSE
+  )
+
+  expect_equal(eval$wis, expected)
+  expect_equal(actual_wis, expected)
+})
+
+test_that("wis is correct, 2 intervals and median - test corresponds to covidHubUtils", {
+  target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7)
+  horizons <- c("1", "2", "1")
+  locations <- c("01", "01", "02")
+  target_variables <- rep("inc death", length(y))
+
+  forecast_target_end_dates <-
+    rep(target_end_dates, times = ncol(forecast_quantiles_matrix))
+  forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix))
+  forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix))
+  forecast_target_variables <-
+    rep(target_variables, times = ncol(forecast_quantiles_matrix))
+  forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y))
+  forecast_quantiles <- forecast_quantiles_matrix
+  dim(forecast_quantiles) <- prod(dim(forecast_quantiles))
+
+  test_truth <- data.frame(
+    model = rep("truth_source", length(y)),
+    target_variable = target_variables,
+    target_end_date = target_end_dates,
+    location = locations,
+    value = y,
+    stringsAsFactors = FALSE
+  )
+
+  n_forecasts <- length(forecast_quantiles)
+  test_forecasts <- data.frame(
+    model = rep("m1", n_forecasts),
+    forecast_date = rep(as.Date("2020-01-01"), n_forecasts),
+    location = forecast_locations,
+    horizon = forecast_horizons,
+    temporal_resolution = rep("wk", n_forecasts),
+    target_variable = forecast_target_variables,
+    target_end_date = forecast_target_end_dates,
+    type = rep("quantile", n_forecasts),
+    quantile = forecast_quantile_probs,
+    value = forecast_quantiles,
+    stringsAsFactors = FALSE
+  )
+
+  # make a version that conforms to scoringutils format
+  truth_formatted <- data.table::as.data.table(test_truth)
+  truth_formatted[, `:=`(model = NULL)]
+  data.table::setnames(truth_formatted, old = "value", new = "observed")
+
+  forecasts_formatted <- data.table::as.data.table(test_forecasts)
+  data.table::setnames(forecasts_formatted, old = "value", new = "predicted")
+
+  data_formatted <- merge(forecasts_formatted, truth_formatted)
+
+  eval <- score(data_formatted,
+    count_median_twice = FALSE, metrics = metrics_no_cov
+  )
+
+  eval <- summarise_scores(eval,
+    by = c(
+      "model", "location", "target_variable",
+      "target_end_date", "forecast_date", "horizon"
+    )
+  )
+
+  alpha1 <- 0.2
+  alpha2 <- 0.5
+  expected <- (1 / 2.5) * (
+    0.5 * abs(y - forecast_quantiles_matrix[, 3]) +
+      (forecast_quantiles_matrix[, 5] - forecast_quantiles_matrix[, 1]) * (alpha1 / 2) + c(0, (-2) - (-15), 22 - 4) +
+      (forecast_quantiles_matrix[, 4] - forecast_quantiles_matrix[, 2]) * (alpha2 / 2) + c(0, 1 - (-15), 22 - 3)
+  )
+
+  actual_wis <- wis(
+    observed = y,
+    predicted = forecast_quantiles_matrix,
+    quantile = c(0.1, 0.25, 0.5, 0.75, 0.9),
+    count_median_twice = FALSE
+  )
+
+  expect_equal(eval$wis, expected)
+  expect_equal(actual_wis, expected)
+})
+
+test_that("Quantile score and interval score yield the same result, weigh = FALSE", {
+  observed <- rnorm(10, mean = 1:10)
+  alphas <- c(0.1, 0.5, 0.9)
+
+  for (alpha in alphas) {
+    lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10))
+    upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 11:20))
+
+    w <- FALSE
+    is <- interval_score(
+      observed = observed,
+      lower = lower,
+      upper = upper,
+      interval_range = (1 - alpha) * 100,
+      weigh = w
+    )
+
+    wis <- wis(
+      observed = observed,
+      predicted = cbind(lower, upper),
+      quantile = c(alpha / 2, 1 - alpha / 2),
+      count_median_twice = FALSE,
+      weigh = w
+    )
+
+    qs_lower <- quantile_score(observed,
+      predicted = lower,
+      quantile = alpha / 2,
+      weigh = w
+    )
+    qs_upper <- quantile_score(observed,
+      predicted = upper,
+      quantile = 1 - alpha / 2,
+      weigh = w
+    )
+    expect_equal((qs_lower + qs_upper) / 2, is)
+    expect_equal(wis, is)
+  }
+})
+
+
+# ============================================================================ #
+# Quantile score ============================================================= #
+# ============================================================================ #
+test_that("Quantile score and interval score yield the same result, weigh = TRUE", {
+  observed <- rnorm(10, mean = 1:10)
+  alphas <- c(0.1, 0.5, 0.9)
+
+  for (alpha in alphas) {
+    lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10))
+    upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 11:20))
+
+    w <- TRUE
+    is <- interval_score(
+      observed = observed,
+      lower = lower,
+      upper = upper,
+      interval_range = (1 - alpha) * 100,
+      weigh = w
+    )
+
+    wis <- wis(
+      observed = observed,
+      predicted = cbind(lower, upper),
+      quantile = c(alpha / 2, 1 - alpha / 2),
+      count_median_twice = FALSE,
+      weigh = w
+    )
+
+    qs_lower <- quantile_score(observed,
+      predicted = lower,
+      quantile = alpha / 2,
+      weigh = w
+    )
+    qs_upper <- quantile_score(observed,
+      predicted = upper,
+      quantile = 1 - alpha / 2,
+      weigh = w
+    )
+    expect_equal((qs_lower + qs_upper) / 2, is)
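+    # the two pinball losses at levels alpha / 2 and 1 - alpha / 2 average to
+    # the interval score of the corresponding central interval, so a
+    # single-interval wis() must reproduce interval_score() exactly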
+    expect_equal(wis, is)
+  }
+})
+
+test_that("wis works with separate results", {
+  wis <- wis(
+    observed = y,
+    predicted = forecast_quantiles_matrix,
+    quantile = forecast_quantile_probs,
+    separate_results = TRUE
+  )
+  expect_equal(wis$wis, wis$dispersion + wis$overprediction + wis$underprediction)
+})
+
+
+# ============================================================================ #
+# overprediction, underprediction, dispersion ================================ #
+# ============================================================================ #
+test_that("wis is the sum of overprediction, underprediction, dispersion", {
+  wis <- wis(
+    observed = y,
+    predicted = forecast_quantiles_matrix,
+    quantile = forecast_quantile_probs
+  )
+
+  d <- dispersion(y, forecast_quantiles_matrix, forecast_quantile_probs)
+  o <- overprediction(y, forecast_quantiles_matrix, forecast_quantile_probs)
+  u <- underprediction(y, forecast_quantiles_matrix, forecast_quantile_probs)
+
+  expect_equal(wis, d + o + u)
+})
+
+
+# ============================================================================ #
+# `interval_coverage_quantile` =============================================== #
+# ============================================================================ #
+test_that("interval_coverage_quantile works", {
+  expect_equal(
+    interval_coverage_quantile(observed, predicted, quantile, range = 50),
+    c(TRUE, FALSE, FALSE)
+  )
+})
+
+test_that("interval_coverage_quantile rejects wrong inputs", {
+  expect_error(
+    interval_coverage_quantile(observed, predicted, quantile, range = c(50, 0)),
+    "Assertion on 'range' failed: Must have length 1."
+  )
+})
+
+
+# ============================================================================ #
+# `interval_coverage_deviation_quantile` ===================================== #
+# ============================================================================ #
+test_that("interval_coverage_deviation_quantile works", {
+  existing_ranges <- unique(get_range_from_quantile(quantile))
+  expect_equal(existing_ranges, c(80, 50, 0))
+
+  cov_50 <- interval_coverage_quantile(observed, predicted, quantile, range = c(50))
+  cov_80 <- interval_coverage_quantile(observed, predicted, quantile, range = c(80))
+  manual <- 0.5 * (cov_50 - 0.5) + 0.5 * (cov_80 - 0.8)
+
+  expect_equal(
+    interval_coverage_deviation_quantile(observed, predicted, quantile),
+    manual
+  )
+})
+
+
+# ============================================================================ #
+# `bias_quantile` ============================================================ #
+# ============================================================================ #
+test_that("bias_quantile() works as expected", {
+  predicted <- c(1, 2, 3)
+  quantiles <- c(0.1, 0.5, 0.9)
+  expect_equal(
+    bias_quantile(observed = 2, predicted, quantiles),
+    0
+  )
+  predicted <- c(0, 1, 2)
+  quantiles <- c(0.1, 0.5, 0.9)
+  expect_equal(
+    bias_quantile(observed = 2, predicted, quantiles),
+    -0.8
+  )
+
+  predicted <- c(
+    705.500, 1127.000, 4006.250, 4341.500, 4709.000, 4821.996,
+    5340.500, 5451.000, 5703.500, 6087.014, 6329.500, 6341.000,
+    6352.500, 6594.986, 6978.500, 7231.000, 7341.500, 7860.004,
+    7973.000, 8340.500, 8675.750, 11555.000, 11976.500
+  )
+
+  quantile <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99)
+
+  observed <- 8062
+  expect_equal(bias_quantile(observed, predicted, quantile), -0.8)
+})
+
+test_that("bias_quantile handles matrix input", {
+  observed <- seq(10, 0, length.out = 4)
+  predicted <- matrix(1:12, ncol = 3)
+  quantiles <- c(0.1, 0.5, 0.9)
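+  # assumed convention, matching the vector tests above: each matrix row is
+  # one forecast and each column one quantile level, so one bias value is
+  # returned per row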
+  expect_equal(
+    bias_quantile(observed, predicted, quantiles),
+    c(-1.0, -0.8, 0.8, 1.0)
+  )
+})
+
+
+test_that("bias_quantile() handles vector that is too long", {
+  predicted <- c(NA, 1, 2, 3)
+  quantiles <- c(0.1, 0.5, 0.9)
+
+  expect_error(
+    bias_quantile(observed = 2, predicted, quantiles),
+    "Assertion on 'quantile' failed: Must have length 4, but has length 3."
+  )
+})
+
+test_that("bias_quantile() handles NA values", {
+  predicted <- c(NA, 1, 2)
+  quantiles <- c(0.1, 0.5, 0.9)
+  expect_equal(
+    bias_quantile(observed = 2, predicted, quantiles),
+    -0.8
+  )
+  predicted <- c(0, 1, 2)
+  quantiles <- c(0.1, 0.5, NA)
+  expect_equal(
+    bias_quantile(observed = 2, predicted, quantiles),
+    -1
+  )
+  expect_equal(
+    bias_quantile(observed = 2, predicted, quantiles, na.rm = FALSE),
+    NA_real_
+  )
+})
+
+test_that("bias_quantile() errors if no predictions", {
+  expect_error(
+    bias_quantile(observed = 2, numeric(0), numeric(0)),
+    "Assertion on 'quantile' failed: Must have length >= 1, but has length 0"
+  )
+})
+
+test_that("bias_quantile() returns correct bias if value below the median", {
+  predicted <- c(1, 2, 4, 5)
+  quantiles <- c(0.1, 0.3, 0.7, 0.9)
+  suppressMessages(
+    expect_equal(bias_quantile(observed = 1, predicted, quantiles), 0.8)
+  )
+})
+
+test_that("bias_quantile() returns correct bias if value above median", {
+  predicted <- c(1, 2, 4, 5)
+  quantiles <- c(0.1, 0.3, 0.7, 0.9)
+  suppressMessages(
+    expect_equal(bias_quantile(observed = 5, predicted, quantiles), -0.8)
+  )
+})
+
+test_that("bias_quantile() returns correct bias if value at the median", {
+  predicted <- c(1, 2, 3, 4)
+  quantiles <- c(0.1, 0.3, 0.5, 0.7)
+
+  expect_equal(bias_quantile(observed = 3, predicted, quantiles), 0)
+})
+
+test_that("bias_quantile() returns 1 if true value below min prediction", {
+  predicted <- c(2, 3, 4, 5)
+  quantiles <- c(0.1, 0.3, 0.7, 0.9)
+
+  suppressMessages(
+    expect_equal(bias_quantile(observed = 1, predicted, quantiles), 1)
+  )
+})
+
+test_that("bias_quantile() returns -1 if true value above max prediction", {
+  predicted <- c(1, 2, 3, 4)
+  quantiles <- c(0.1, 0.3, 0.5, 0.7)
+
+  expect_equal(bias_quantile(observed = 6, predicted, quantiles), -1)
+})
+
+test_that("bias_quantile(): quantiles must be between 0 and 1", {
+  predicted <- 1:4
+
+  # Failing example
+  quantiles <- c(-0.1, 0.3, 0.5, 0.8)
+  expect_error(
+    bias_quantile(observed = 3, predicted, quantiles),
+    "Assertion on 'quantile' failed: Element 1 is not >= 0."
+  )
+
+  # Passing counter example
+  quantiles <- c(0.1, 0.3, 0.5, 0.8)
+  expect_silent(bias_quantile(observed = 3, predicted, quantiles))
+})
+
+test_that("bias_quantile(): quantiles must be increasing", {
+  predicted <- 1:4
+
+  # Failing example
+  quantiles <- c(0.8, 0.3, 0.5, 0.9)
+  expect_error(
+    bias_quantile(observed = 3, predicted, quantiles),
+    "Predictions must not be decreasing with increasing quantile level"
+  )
+
+  # Passing counter example
+  quantiles <- c(0.3, 0.5, 0.8, 0.9)
+  expect_silent(bias_quantile(observed = 3, predicted, quantiles))
+})
+
+test_that("bias_quantile(): predictions must be increasing", {
+  predicted <- c(1, 2, 4, 3)
+  quantiles <- c(0.1, 0.3, 0.5, 0.9)
+
+  expect_error(
+    bias_quantile(observed = 3, predicted, quantiles),
+    "Predictions must not be decreasing with increasing quantile level"
+  )
+  expect_silent(bias_quantile(observed = 3, 1:4, quantiles))
+})
+
+test_that("bias_quantile(): quantiles must be unique", {
+  predicted <- 1:4
+
+  # Failing example
+  quantiles <- c(0.3, 0.3, 0.5, 0.8)
+  expect_error(
+    bias_quantile(observed = 3, predicted, quantiles),
+    "Assertion on 'quantile' failed: Contains duplicated values, position 2."
+  )
+
+  # Passing example
+  quantiles <- c(0.3, 0.5, 0.8, 0.9)
+  expect_silent(bias_quantile(observed = 3, predicted, quantiles))
+})
diff --git a/tests/testthat/test-metrics-range.R b/tests/testthat/test-metrics-range.R
new file mode 100644
index 000000000..bd0290e9f
--- /dev/null
+++ b/tests/testthat/test-metrics-range.R
@@ -0,0 +1,45 @@
+test_that("bias_quantile() and bias_range() give the same result", {
+  predicted <- sort(rnorm(23))
+  lower <- rev(predicted[1:12])
+  upper <- predicted[12:23]
+
+  range <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98)
+  quantiles <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99)
+  observed <- rnorm(1)
+
+  range_bias <- bias_range(
+    lower = lower, upper = upper,
+    range = range, observed = observed
+  )
+  range_quantile <- bias_quantile(
+    observed = observed,
+    predicted = predicted,
+    quantile = quantiles
+  )
+  expect_equal(range_bias, range_quantile)
+})
+
+test_that("bias_range() works with point forecasts", {
+  predicted <- 1
+  observed <- 1
+  range <- c(0)
+
+  expect_equal(bias_range(predicted, predicted, range, observed), 0)
+})
+
+test_that("bias_range(): ranges must be between 0 and 100", {
+  lower <- 4:1
+  upper <- 5:8
+
+  # Failing example
+  range <- c(-10, 0, 10, 20)
+  expect_error(
+    bias_range(lower, upper, range, observed = 3),
+    "range must be between 0 and 100"
+  )
+
+  # Passing counter example
+  range <- c(0, 10, 20, 30)
+  expect_silent(bias_range(lower, upper, range, observed = 3))
+})
+
diff --git a/tests/testthat/test-metrics-sample.R b/tests/testthat/test-metrics-sample.R
new file mode 100644
index 000000000..ded7b52ae
--- /dev/null
+++ b/tests/testthat/test-metrics-sample.R
@@ -0,0 +1,146 @@
+test_that("Input handling", {
+  observed <- rpois(30, lambda = 1:30)
+  predicted <- replicate(20, rpois(n = 30, lambda = 1:30))
+  expect_equal(length(crps_sample(observed, predicted)), 30)
+
+  # should error when wrong prediction type is given
+  predicted2 <- rpois(30, lambda = 1)
+  expect_error(crps_sample(observed, predicted2),
+    "Assertion on 'predicted' failed: Must be of type 'matrix', not 'integer'",
+    fixed = TRUE
+  )
+
+  # predictions have wrong number of rows
+  predicted3 <- replicate(20, rpois(n = 31, lambda = 1))
+  expect_error(
+    crps_sample(observed, predicted3),
+    "Assertion on 'predicted' failed: Must have exactly 30 rows, but has 31 rows.",
"Mismatch: 'observed' has length `30`, but 'predicted' has `31` rows.", + fixed = TRUE + ) + + # error with missing argument + expect_error(crps_sample(predicted = predicted), + 'argument "observed" is missing, with no default', + fixed = TRUE + ) +}) + + + +test_that("bias_sample() throws an error when missing observed", { + observed <- rpois(10, lambda = 1:10) + predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) + + expect_error( + bias_sample(predicted = predicted), + 'argument "observed" is missing, with no default' + ) +}) + +test_that("bias_sample() throws an error when missing 'predicted'", { + observed <- rpois(10, lambda = 1:10) + predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) + + expect_error( + bias_sample(observed = observed), + 'argument "predicted" is missing, with no default' + ) +}) + +test_that("bias_sample() works for integer observed and predicted", { + observed <- rpois(10, lambda = 1:10) + predicted <- replicate(10, rpois(10, lambda = 1:10)) + output <- bias_sample( + observed = observed, + predicted = predicted + ) + expect_equal( + length(output), + length(observed) + ) + expect_equal( + class(output), + "numeric" + ) +}) + +test_that("bias_sample() works for continuous observed values and predicted", { + observed <- rnorm(10) + predicted <- replicate(10, rnorm(10)) + output <- bias_sample( + observed = observed, + predicted = predicted + ) + expect_equal( + length(output), + length(observed) + ) + expect_equal( + class(output), + "numeric" + ) +}) + +test_that("bias_sample() works as expected", { + observed <- rpois(30, lambda = 1:30) + predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) + expect_true(all(bias_sample(observed, predicted) == bias_sample(observed, predicted))) + + ## continuous forecasts + observed <- rnorm(30, mean = 1:30) + predicted <- replicate(200, rnorm(30, mean = 1:30)) + + scoringutils2 <- bias_sample(observed, predicted) + scoringutils <- bias_sample(observed, predicted) + + expect_equal(scoringutils, scoringutils2) +}) + + +test_that("bias_sample() approx equals bias_quantile() for many samples", { + set.seed(123) + + # Generate true value + observed <- 3 + + # Generate many sample predictions + predicted <- sample(rnorm(1000, mean = observed, sd = 2), 1000) + + # Get sample based bias + bias_sample_result <- bias_sample( + observed, matrix(predicted, nrow = 1) + ) + + # Convert predictions to quantiles + quantiles <- seq(0, 1, length.out = 100) + quantile_preds <- quantile(predicted, probs = quantiles) + + # Get quantile based bias + bias_quantile_result <- suppressMessages( + bias_quantile(observed, quantile_preds, quantiles) + ) + + # Difference should be small + expect_equal(bias_quantile_result, bias_sample_result, tolerance = 0.1) +}) + + +# `ae_median_sample` =========================================================== +test_that("ae_median_sample works", { + observed <- rnorm(30, mean = 1:30) + predicted_values <- rnorm(30, mean = 1:30) + scoringutils <- ae_median_sample(observed, matrix(predicted_values)) + ae <- abs(observed - predicted_values) + expect_equal(ae, scoringutils) +}) + +# `mad_sample()` =============================================================== +test_that("function throws an error when missing 'predicted'", { + predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) + + expect_error( + mad_sample() + ) +}) + diff --git a/tests/testthat/test-pairwise_comparison.R b/tests/testthat/test-pairwise_comparison.R index 3d0120adb..74275a06a 100644 --- a/tests/testthat/test-pairwise_comparison.R +++ 
+++ b/tests/testthat/test-pairwise_comparison.R
@@ -53,7 +53,7 @@ test_that("pairwise_comparison() works", {
   )
 
   # evaluate the toy forecasts, once with and once without a baseline model specified
-  eval <- suppressMessages(score(data_formatted))
+  eval <- score(data_formatted)
 
   # check with relative skills
   eval_without_rel_skill <- summarise_scores(
@@ -85,7 +85,7 @@ test_that("pairwise_comparison() works", {
   # prepare scores for the code Johannes Bracher wrote
   scores_johannes <- data.table::copy(eval_without_baseline) # doesn't matter which one
   data.table::setnames(scores_johannes,
-    old = c("location", "target_end_date", "interval_score"),
+    old = c("location", "target_end_date", "wis"),
     new = c("unit", "timezero", "wis")
   )
@@ -238,7 +238,7 @@ test_that("pairwise_comparison() works", {
     model = rep(c("model1", "model2", "model3"), each = 10),
     date = as.Date("2020-01-01") + rep(1:5, each = 2),
     location = c(1, 2),
-    interval_score = (abs(rnorm(30))),
+    wis = (abs(rnorm(30))),
     ae_median = (abs(rnorm(30)))
   )
diff --git a/tests/testthat/test-plot_heatmap.R b/tests/testthat/test-plot_heatmap.R
index 9118ff21f..d6453bd45 100644
--- a/tests/testthat/test-plot_heatmap.R
+++ b/tests/testthat/test-plot_heatmap.R
@@ -1,9 +1,7 @@
 library(ggplot2, quietly = TRUE)
 
 test_that("plot_heatmap() works as expected", {
-  scores <- suppressMessages(
-    summarise_scores(scores_quantile, by = c("model", "target_type", "range"))
-  )
+  scores <- summarise_scores(scores_quantile, by = c("model", "target_type"))
   p <- plot_heatmap(scores, x = "target_type", metric = "bias")
   expect_s3_class(p, "ggplot")
   skip_on_cran()
diff --git a/tests/testthat/test-plot_interval_coverage.R b/tests/testthat/test-plot_interval_coverage.R
index 04f203b03..0e885219f 100644
--- a/tests/testthat/test-plot_interval_coverage.R
+++ b/tests/testthat/test-plot_interval_coverage.R
@@ -1,11 +1,8 @@
-library(ggplot2, quietly = TRUE)
-
 test_that("plot_interval_coverage() works as expected", {
-  scores <- suppressMessages(
-    summarise_scores(scores_quantile, by = c("model", "range"))
-  )
-  p <- plot_interval_coverage(scores)
+  coverage <- add_coverage(example_quantile) |>
+    summarise_scores(by = c("model", "range"))
+  p <- plot_interval_coverage(coverage)
   expect_s3_class(p, "ggplot")
   skip_on_cran()
-  vdiffr::expect_doppelganger("plot_interval_coverage", p)
+  suppressWarnings(vdiffr::expect_doppelganger("plot_interval_coverage", p))
 })
diff --git a/tests/testthat/test-plot_quantile_coverage.R b/tests/testthat/test-plot_quantile_coverage.R
index 6c3593c04..060b9be26 100644
--- a/tests/testthat/test-plot_quantile_coverage.R
+++ b/tests/testthat/test-plot_quantile_coverage.R
@@ -1,9 +1,9 @@
 test_that("plot_quantile_coverage() works as expected", {
-  scores <- suppressMessages(
-    summarise_scores(scores_quantile, by = c("model", "quantile"))
-  )
-  p <- plot_quantile_coverage(scores)
+  coverage <- add_coverage(example_quantile) |>
+    summarise_scores(by = c("model", "quantile"))
+
+  p <- plot_quantile_coverage(coverage)
   expect_s3_class(p, "ggplot")
   skip_on_cran()
-  vdiffr::expect_doppelganger("plot_quantile_coverage", p)
+  suppressWarnings(vdiffr::expect_doppelganger("plot_quantile_coverage", p))
 })
diff --git a/tests/testthat/test-plot_ranges.R b/tests/testthat/test-plot_ranges.R
index e9ae5575b..b4dec124e 100644
--- a/tests/testthat/test-plot_ranges.R
+++ b/tests/testthat/test-plot_ranges.R
@@ -1,13 +1,18 @@
-sum_scores <- suppressMessages(
-  summarise_scores(scores_quantile, by = c("model", "target_type", "range"))
-)
+m <- modifyList(metrics_no_cov_no_ae, list("bias" = NULL))
list("bias" = NULL)) + +sum_scores <- copy(example_quantile) %>% + .[, interval_range := scoringutils:::get_range_from_quantile(quantile)] |> + score(metrics = m) |> + summarise_scores(by = c("model", "target_type", "interval_range")) + +sum_scores[, range := interval_range] test_that("plot_ranges() works as expected with interval score", { p <- plot_ranges(sum_scores, x = "model") + - facet_wrap(~target_type, scales = "free") + facet_wrap(~target_type, scales = "free") expect_s3_class(p, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger("plot_ranges_interval", p) + suppressWarnings(vdiffr::expect_doppelganger("plot_ranges_interval", p)) }) test_that("plot_ranges() works as expected with dispersion", { @@ -15,5 +20,5 @@ test_that("plot_ranges() works as expected with dispersion", { facet_wrap(~target_type) expect_s3_class(p, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger("plot_ranges_dispersion", p) + suppressWarnings(vdiffr::expect_doppelganger("plot_ranges_dispersion", p)) }) diff --git a/tests/testthat/test-plot_score_table.R b/tests/testthat/test-plot_score_table.R index 8336de7a9..5ffc0b029 100644 --- a/tests/testthat/test-plot_score_table.R +++ b/tests/testthat/test-plot_score_table.R @@ -1,7 +1,6 @@ test_that("plot_score_table() works as expected", { p <- suppressMessages( scores_quantile %>% - add_coverage(by = c("model")) %>% summarise_scores(by = c("model")) %>% summarise_scores(by = c("model"), fun = signif, digits = 1) %>% plot_score_table() diff --git a/tests/testthat/test-score.R b/tests/testthat/test-score.R index 9b4040723..a67575984 100644 --- a/tests/testthat/test-score.R +++ b/tests/testthat/test-score.R @@ -168,17 +168,13 @@ test_that("score.scoringutils_point() errors with only NA values", { test_that("score_quantile correctly handles separate results = FALSE", { df <- example_quantile[model == "EuroCOVIDhub-ensemble" & target_type == "Cases" & location == "DE"] - eval <- suppressMessages( - score( - df[!is.na(predicted)], - separate_results = FALSE - ) - ) + eval <- score(df[!is.na(predicted)], separate_results = FALSE) expect_equal( nrow(eval) > 1, TRUE ) + expect_true(all(names(metrics_quantile) %in% colnames(eval))) }) @@ -191,10 +187,12 @@ test_that("score() quantile produces desired metrics", { quantile = rep(c(0.1, 0.9), times = 10) ) - out <- suppressMessages(score(data = data)) + out <- suppressWarnings(suppressMessages( + score(data = data, metrics = metrics_no_cov)) + ) metric_names <- c( "dispersion", "underprediction", "overprediction", - "bias", "ae_median", "coverage_deviation" + "bias", "ae_median" ) expect_true(all(metric_names %in% colnames(out))) @@ -227,27 +225,16 @@ test_that("all quantile and range formats yield the same result", { expect_equal(sort(eval1$ae_median), sort(ae)) }) -test_that("function produces output even if only some metrics are chosen", { - example <- scoringutils::example_quantile - - eval <- suppressMessages(score(example, metrics = "coverage")) - - expect_equal( - nrow(eval) > 1, - TRUE - ) -}) - test_that("WIS is the same with other metrics omitted or included", { eval <- suppressMessages(score(example_quantile, - metrics = "interval_score" + metrics = list("wis" = wis) )) eval2 <- scores_quantile expect_equal( - sum(eval$interval_score), - sum(eval2$interval_score) + sum(eval$wis), + sum(eval2$wis) ) }) diff --git a/tests/testthat/test-sharpness.R b/tests/testthat/test-sharpness.R deleted file mode 100644 index 12dcc4c9f..000000000 --- a/tests/testthat/test-sharpness.R +++ /dev/null @@ -1,7 +0,0 @@ 
-test_that("function throws an error when missing 'predicted'", { - predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error( - mad_sample() - ) -}) diff --git a/tests/testthat/test-summarise_scores.R b/tests/testthat/test-summarise_scores.R index dc3de70e3..dbd2cde4b 100644 --- a/tests/testthat/test-summarise_scores.R +++ b/tests/testthat/test-summarise_scores.R @@ -1,6 +1,4 @@ test_that("summarise_scores() works without any arguments", { - expect_true("quantile" %in% names(scores_quantile)) - summarised_scores <- summarise_scores(scores_quantile) expect_false("quantile" %in% names(summarised_scores)) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index a909c7d46..5d8b37c3f 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -36,6 +36,19 @@ test_that("get_protected_columns() returns the correct result", { }) +test_that("run_safely() works as expected", { + f <- function(x) {x} + expect_equal(run_safely(2, fun = f), 2) + expect_equal(run_safely(2, y = 3, fun = f), 2) + expect_warning( + run_safely(fun = f), + 'Function execution failed, returning NULL. Error: argument "x" is missing, with no default', + fixed = TRUE + ) + expect_equal(suppressWarnings(run_safely(y = 3, fun = f)), NULL) +}) + + # test_that("prediction_is_quantile() correctly identifies quantile predictions", { # data <- data.frame( # predicted = 1:3, diff --git a/tests/testthat/test-utils_data_handling.R b/tests/testthat/test-utils_data_handling.R index 6e53cdbaa..4521c1f73 100644 --- a/tests/testthat/test-utils_data_handling.R +++ b/tests/testthat/test-utils_data_handling.R @@ -22,7 +22,7 @@ test_that("range_long_to_quantile works", { -test_that("quantile_to_interval works", { +test_that("quantile_to_interval.data.frame() works", { quantile <- data.frame( date = as.Date("2020-01-01") + 1:10, model = "model1", @@ -30,7 +30,6 @@ test_that("quantile_to_interval works", { predicted = c(2:11, 4:13), quantile = rep(c(0.25, 0.75), each = 10) ) - long <- data.frame( date = as.Date("2020-01-01") + 1:10, model = "model1", @@ -39,19 +38,38 @@ test_that("quantile_to_interval works", { range = 50, boundary = rep(c("lower", "upper"), each = 10) ) - long2 <- as.data.frame(quantile_to_interval( quantile, keep_quantile_col = FALSE )) - data.table::setcolorder(long2, names(long)) - # for some reason this is needed to pass the unit tests on gh actions long2$boundary <- as.character(long2$boundary) long$boundary <- as.character(long$boundary) - expect_equal(long, as.data.frame(long2)) + + # check that it handles NA values + setDT(quantile) + quantile[c(1, 3, 11, 13), c("observed", "predicted", "quantile") := NA] + # in this instance, a problem appears because there is an NA value both + # for the upper and lower bound. 
+  expect_message(
+    quantile_to_interval(
+      quantile,
+      keep_quantile_col = FALSE,
+      format = "wide"
+    ),
+    "Aggregate function missing, defaulting to 'length'"
+  )
+
+  quantile <- quantile[-c(1, 3), ]
+  wide2 <- scoringutils:::quantile_to_interval(
+    quantile,
+    keep_quantile_col = FALSE,
+    format = "wide"
+  )
+  expect_equal(nrow(wide2), 10)
+  expect_false("NA" %in% colnames(wide2))
+  expect_equal(sum(wide2$lower, na.rm = TRUE), 59)
 })
@@ -72,7 +90,7 @@ test_that("sample_to_quantiles works", {
     predicted = rep(2:11, each = 2) + c(0, 2)
   )
 
-  quantile2 <- scoringutils::sample_to_quantile(samples, quantiles = c(0.25, 0.75))
+  quantile2 <- sample_to_quantile(samples, quantiles = c(0.25, 0.75))
 
   expect_equal(quantile, as.data.frame(quantile2))
 
diff --git a/vignettes/scoring-forecasts-directly.Rmd b/vignettes/scoring-forecasts-directly.Rmd
index d7a68736f..9272f5d27 100644
--- a/vignettes/scoring-forecasts-directly.Rmd
+++ b/vignettes/scoring-forecasts-directly.Rmd
@@ -207,7 +207,7 @@ observed <- rnorm(30, mean = 1:30)
 interval_range <- 90
 alpha <- (100 - interval_range) / 100
 lower <- qnorm(alpha / 2, rnorm(30, mean = 1:30))
-upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 1:30))
+upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 11:40))
 
 interval_score(
   observed = observed,
diff --git a/vignettes/scoringutils.Rmd b/vignettes/scoringutils.Rmd
index d5dc97bc9..4ae09b2ac 100644
--- a/vignettes/scoringutils.Rmd
+++ b/vignettes/scoringutils.Rmd
@@ -223,7 +223,6 @@ For quantile-based forecasts we are often interested in specific coverage-levels
 
 ```{r}
 score(example_quantile) %>%
-  add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>%
   summarise_scores(by = c("model", "target_type")) %>%
   summarise_scores(fun = signif, digits = 2)
 ```
@@ -304,20 +303,20 @@ example_quantile[quantile %in% seq(0.1, 0.9, 0.1), ] %>%
   facet_grid(model ~ target_type)
 ```
 
-Another way to look at calibration are interval coverage and quantile coverage. Interval coverage is the percentage of true values that fall inside a given central prediction interval. Quantile coverage is the percentage of observed values that fall below a given quantile level.
+
-In order to plot interval coverage, you need to include "range" in the `by` argument to `summarise_scores()`. The green area on the plot marks conservative behaviour, i.e. your empirical coverage is greater than it nominally need be (e.g. 55% of true values covered by all 50% central prediction intervals.)
+
-```{r}
+```{r, eval=FALSE, include=FALSE}
 example_quantile %>%
   score() %>%
   summarise_scores(by = c("model", "range")) %>%
   plot_interval_coverage()
 ```
 
-To visualise quantile coverage, you need to include "quantile" in `by`. Again, the green area corresponds to conservative forecasts, where central prediction intervals would cover more than needed.
+
-```{r}
+```{r, eval=FALSE, include=FALSE}
 example_quantile %>%
   score() %>%
   summarise_scores(by = c("model", "quantile")) %>%
   plot_quantile_coverage()
 ```
@@ -377,11 +376,11 @@ example_quantile %>%
   plot_correlation()
 ```
 
-### Scores by interval ranges
+
-If you would like to see how different forecast interval ranges contribute to average scores, you can visualise scores by interval range:
+
-```{r}
+```{r, eval = FALSE, include = FALSE}
 example_quantile %>%
   score() %>%
   summarise_scores(by = c("model", "range", "target_type")) %>%
@@ -400,8 +399,7 @@ example_integer %>%
   sample_to_quantile(
     quantiles = c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99)
   ) %>%
-  score() %>%
-  add_coverage(by = c("model", "target_type"))
+  score()
 ```
 
 ## Available metrics
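To make the reworked interfaces concrete, here is a minimal usage sketch. It is an illustration based on the tests above, not code from this changeset, and it assumes a development version of scoringutils that exports `wis()` and the reworked `add_coverage()`:

```r
library(scoringutils)

# coverage is now attached to the raw forecast data and then summarised,
# rather than being computed inside score()
example_quantile |>
  add_coverage() |>
  summarise_scores(by = c("model", "range")) |>
  plot_interval_coverage()

# wis() is exported and can be called directly on a matrix of predictive
# quantiles; these inputs mirror the "1 interval and median" test above
wis(
  observed = c(1, -15, 22),
  predicted = rbind(c(0, 1, 2), c(1, 2, 2), c(0, 3, 3)),
  quantile = c(0.25, 0.5, 0.75)
)
```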