From 1f51495b4085aa3a4ce217593cc062404361896e Mon Sep 17 00:00:00 2001
From: Evan Ray
Date: Tue, 20 Aug 2024 10:00:31 -0400
Subject: [PATCH 01/31] initial draft of score_model_out

---
 NAMESPACE              |  1 +
 R/score_model_out.R    | 36 ++++++++++++++++++++++++++++++++++++
 R/validate.R           | 28 ++++++++++++++++++++++++++++
 man/score_model_out.Rd | 30 ++++++++++++++++++++++++++++++
 4 files changed, 95 insertions(+)
 create mode 100644 R/score_model_out.R
 create mode 100644 man/score_model_out.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 6ae9268..f267329 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,2 +1,3 @@
 # Generated by roxygen2: do not edit by hand
+export(score_model_out)
diff --git a/R/score_model_out.R b/R/score_model_out.R
new file mode 100644
index 0000000..d4b35ad
--- /dev/null
+++ b/R/score_model_out.R
@@ -0,0 +1,36 @@
+#' Score model output predictions against observed data
+#'
+#' @param model_out_tbl Model output tibble with predictions
+#' @param target_observations Observed 'ground truth' data to be compared against predictions
+#' @param metrics Optional list of scoring metrics to computeq
+#' @param output_type_id_order For ordinal variables in pmf format, this is a vector of levels for pmf forecasts, in
+#' increasing order of the levels. For all other output types, this is ignored.
+#'
+#' @return forecast_quantile
+#'
+#' @export
+score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, by = NULL,
+                            output_type_id_order = NULL) {
+  # check that model_out_tbl has a single output_type that is supported by this package
+  output_type <- validate_output_type(model_out_tbl)
+
+  # assemble data for scoringutils
+  if (output_type == "quantile") {
+    su_data <- transform_quantile_model_out(model_out_tbl, target_observations)
+    if (is.null(metrics)) {
+      metrics <- scoringutils::metrics_quantile()
+    }
+  } else if (output_type == "pmf") {
+    su_data <- transform_pmf_model_out(model_out_tbl, target_observations, output_type_id_order)
+    if (is.null(metrics)) {
+      metrics <- scoringutils::metrics_nominal()
+    }
+  } else if (output_type %in% c("mean", "median")) {
+    su_data <- transform_point_model_out(model_out_tbl, target_observations, output_type)
+  }
+
+  # compute scores
+  scores <- scoringutils::score(su_data, metrics)
+
+  return(scores)
+}
diff --git a/R/validate.R b/R/validate.R
index 2f0cb13..e4efcf9 100644
--- a/R/validate.R
+++ b/R/validate.R
@@ -55,3 +55,31 @@ validate_model_out_target_obs <- function(model_out_tbl, target_observations) {
   return(model_out_tbl)
 }
+
+
+#' Check that model_out_tble has a single `output_type` that is one of the
+#' `output_types` that is supported by this function.
+#' +#' @return if valid, the output_type in model_out_tbl +#' +#' @noRd +validate_output_type <- function(model_out_tbl) { + output_type <- unique(model_out_tbl$output_type) + if (length(output_type) != 1) { + cli::cli_abort( + "model_out_tbl must contain a single output_type, but it has multiple: + {.val {output_type}}" + ) + } + + supported_types <- c("mean", "median", "pmf", "quantile") + if (!output_type %in% supported_types) { + cli::cli_abort( + "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; + hubEvals currently only supports the following types: + {.val {supported_types}}" + ) + } + + return(output_type) +} diff --git a/man/score_model_out.Rd b/man/score_model_out.Rd new file mode 100644 index 0000000..fb2c174 --- /dev/null +++ b/man/score_model_out.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/score_model_out.R +\name{score_model_out} +\alias{score_model_out} +\title{Score model output predictions against observed data} +\usage{ +score_model_out( + model_out_tbl, + target_observations, + metrics = NULL, + by = NULL, + output_type_id_order = NULL +) +} +\arguments{ +\item{model_out_tbl}{Model output tibble with predictions} + +\item{target_observations}{Observed 'ground truth' data to be compared against predictions} + +\item{metrics}{Optional list of scoring metrics to computeq} + +\item{output_type_id_order}{For ordinal variables in pmf format, this is a vector of levels for pmf forecasts, in +increasing order of the levels. For all other output types, this is ignored.} +} +\value{ +forecast_quantile +} +\description{ +Score model output predictions against observed data +} From e101808fcd8a8bff8ea5fe6417b1b59b7004ebc2 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Thu, 22 Aug 2024 11:32:46 -0400 Subject: [PATCH 02/31] some partial progress on score_model_out --- R/score_model_out.R | 119 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 112 insertions(+), 7 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index d4b35ad..b46dc6f 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -1,20 +1,46 @@ -#' Score model output predictions against observed data +#' Score model output predictions with a single `output_type` against observed data #' #' @param model_out_tbl Model output tibble with predictions -#' @param target_observations Observed 'ground truth' data to be compared against predictions -#' @param metrics Optional list of scoring metrics to computeq -#' @param output_type_id_order For ordinal variables in pmf format, this is a vector of levels for pmf forecasts, in -#' increasing order of the levels. For all other output types, this is ignored. +#' @param target_observations Observed 'ground truth' data to be compared to +#' predictions +#' @param metrics Optional list of scoring metrics to compute. See details for more. +#' @param summarize boolean indicator of whether summaries of forecast scores +#' should be computed. Defaults to `TRUE`. +#' @param by character vector naming columns to summarize by. For example, +#' specifying `by = "model_id"` will compute average scores for each model. The +#' default, `NULL`, will summarize across `output_type_id` levels (e.g., +#' quantile levels or categories for a pmf forecast) to produce a single row of +#' scores for each combination of model and prediction task. +#' @param output_type_id_order For ordinal variables in pmf format, this is a +#' vector of levels for pmf forecasts, in increasing order of the levels. 
For +#' all other output types, this is ignored. +#' +#' @details If `metrics` is `NULL` (the default), this function chooses +#' appropriate metrics based on the `output_type` contained in the `model_out_tbl`: +#' - For `output_type == "quantile"`, we use the default metrics provided by +#' `scoringutils::metrics_quantile()`: "wis", "overprediction", "underprediction", +#' "dispersion", "bias", "interval_coverage_50", "interval_coverage_90", +#' "interval_coverage_deviation", and "ae_median" +#' - For `output_type == "pmf"` and `output_type_id_order` is `NULL` (indicating +#' that the predicted variable is a nominal variable), we use the default metrics +#' provided by `scoringutils::metrics_nominal()` +#' - For `output_type == "median"`, we use "ae_point" +#' - For `output_type == "mean"`, we use "se_point" #' #' @return forecast_quantile #' #' @export -score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, by = NULL, +score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, + summarize = TRUE, by = NULL, output_type_id_order = NULL) { # check that model_out_tbl has a single output_type that is supported by this package + # also, retrieve that output_type output_type <- validate_output_type(model_out_tbl) - # assemble data for scoringutils + # get/validate the scoring metrics + metrics <- get_metrics(metrics, output_type, output_type_id_order) + + # assemble data for scoringutils and select output_type-specific metrics if (output_type == "quantile") { su_data <- transform_quantile_model_out(model_out_tbl, target_observations) if (is.null(metrics)) { @@ -27,10 +53,89 @@ score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, } } else if (output_type %in% c("mean", "median")) { su_data <- transform_point_model_out(model_out_tbl, target_observations, output_type) + if (is.null(metrics)) { + metrics <- scoringutils::metrics_point( + select = ifelse(output_type == "mean", "se_point", "ae_point") + ) + } } # compute scores scores <- scoringutils::score(su_data, metrics) + # switch back to hubverse convention of "model_id" rather than "model" + scores <- dplyr::rename(scores, model_id = "model") + + # if requested, summarize scores + if (summarize) { + scores <- scoringutils::summarize_scores(scores = scores, by = by) + } + return(scores) } + + +#' Get metrics if user didn't specify anything; otherwise, process +#' and validate user inputs +#' +#' @inheritParams score_model_out +#' +#' @return a list of metric functions as required by scoringutils::score() +#' +#' @noRd +get_metrics <- function(metrics, output_type, output_type_id_order) { + if (is.null(metrics)) { + return(get_metrics_default(output_type, output_type_id_order)) + } else if (is.character(metrics)) { + return(get_metrics_character(metrics)) + } else { + # We do a minimal preliminary check that the provided `metrics` is a list of + # functions, leaving detailed checks to scoringutils + checkmate::assert_list(metrics, types = "function") + return(metrics) + } +} + + +#' Default metrics if user didn't specify anything +#' +#' @inheritParams score_model_out +#' +#' @return a list of metric functions as required by scoringutils::score() +#' +#' @noRd +get_metrics_default <- function(output_type, output_type_id_order) { + if (output_type == "quantile") { + metrics <- scoringutils::metrics_quantile() + } else if (output_type == "pmf") { + metrics <- scoringutils::metrics_nominal() + } else if (output_type == "mean") { + metrics <- scoringutils::metrics_point(select 
= "se_point") + } else if (output_type == "median") { + metrics <- scoringutils::metrics_point(select = "ae_point") + } else { + # we have already validated `output_type`, so this case should not be + # triggered; this case is just double checking in case we add something new + # later. + supported_types <- c("mean", "median", "pmf", "quantile") + cli::cli_abort( + "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; + hubEvals currently only supports the following types: + {.val {supported_types}}" + ) + } + + return(metrics) +} + + +#' Convert character vector of metrics to list of functions +#' +#' @inheritParams score_model_out +#' +#' @return a list of metric functions as required by scoringutils::score() +#' +#' @noRd +get_metrics_character <- function(metrics) { + +} From 448e8303f63d602837e4d5e00f645212b2a2de55 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Thu, 22 Aug 2024 16:49:04 -0400 Subject: [PATCH 03/31] updates to score_model_out --- R/score_model_out.R | 136 +++++-- man/score_model_out.Rd | 68 +++- tests/testthat/test-score_model_out.R | 491 ++++++++++++++++++++++++++ 3 files changed, 664 insertions(+), 31 deletions(-) create mode 100644 tests/testthat/test-score_model_out.R diff --git a/R/score_model_out.R b/R/score_model_out.R index b46dc6f..c5ba9f2 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -7,10 +7,8 @@ #' @param summarize boolean indicator of whether summaries of forecast scores #' should be computed. Defaults to `TRUE`. #' @param by character vector naming columns to summarize by. For example, -#' specifying `by = "model_id"` will compute average scores for each model. The -#' default, `NULL`, will summarize across `output_type_id` levels (e.g., -#' quantile levels or categories for a pmf forecast) to produce a single row of -#' scores for each combination of model and prediction task. +#' specifying `by = "model_id"` (the default) will compute average scores for +#' each model. #' @param output_type_id_order For ordinal variables in pmf format, this is a #' vector of levels for pmf forecasts, in increasing order of the levels. For #' all other output types, this is ignored. @@ -23,15 +21,36 @@ #' "interval_coverage_deviation", and "ae_median" #' - For `output_type == "pmf"` and `output_type_id_order` is `NULL` (indicating #' that the predicted variable is a nominal variable), we use the default metrics -#' provided by `scoringutils::metrics_nominal()` +#' provided by `scoringutils::metrics_nominal()`, currently just "log_score" #' - For `output_type == "median"`, we use "ae_point" #' - For `output_type == "mean"`, we use "se_point" #' +#' It is also possible to directly provide a list of metrics, e.g. as would be +#' created by one of those function calls. Alternatively, a character vector of +#' scoring metrics can be provided. In this case, the following options are supported: +#' - `output_type == "median"` and `output_type == "median"`: +#' - "ae": absolute error of a point prediction (generally recommended for the median) +#' - "se": squared error of a point prediction (generally recommended for the mean) +#' - `output_type == "quantile"`: +#' - "ae_median": absolute error of the predictive median (i.e., the quantile at probability level 0.5) +#' - "wis": weighted interval score (WIS) of a collection of quantile predictions +#' - "overprediction": The component of WIS measuring the extent to which +#' predictions fell above the observation. 
+#' - "underprediction": The component of WIS measuring the extent to which +#' predictions fell below the observation. +#' - "dispersion": The component of WIS measuring the dispersion of forecast +#' distributions. +#' - "interval_coverage_XX": interval coverage at the "XX" level. For example, +#' "interval_coverage_95" is the 95% interval coverage rate, which would be calculated +#' based on quantiles at the probability levels 0.025 and 0.975. +#' - `output_type == "pmf"`: +#' - "log_score": log score +#' #' @return forecast_quantile #' #' @export score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, - summarize = TRUE, by = NULL, + summarize = TRUE, by = "model_id", output_type_id_order = NULL) { # check that model_out_tbl has a single output_type that is supported by this package # also, retrieve that output_type @@ -43,28 +62,24 @@ score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, # assemble data for scoringutils and select output_type-specific metrics if (output_type == "quantile") { su_data <- transform_quantile_model_out(model_out_tbl, target_observations) - if (is.null(metrics)) { - metrics <- scoringutils::metrics_quantile() - } } else if (output_type == "pmf") { su_data <- transform_pmf_model_out(model_out_tbl, target_observations, output_type_id_order) - if (is.null(metrics)) { - metrics <- scoringutils::metrics_nominal() - } } else if (output_type %in% c("mean", "median")) { su_data <- transform_point_model_out(model_out_tbl, target_observations, output_type) - if (is.null(metrics)) { - metrics <- scoringutils::metrics_point( - select = ifelse(output_type == "mean", "se_point", "ae_point") - ) - } } # compute scores scores <- scoringutils::score(su_data, metrics) - # switch back to hubverse convention of "model_id" rather than "model" - scores <- dplyr::rename(scores, model_id = "model") + # switch back to hubverse naming conventions for model name + scores <- dplyr::rename( + scores, + model_id = "model" + ) + + # if present, drop predicted and observed columns + drop_cols <- c("predicted", "observed") + scores <- scores[!colnames(scores) %in% drop_cols] # if requested, summarize scores if (summarize) { @@ -87,10 +102,10 @@ get_metrics <- function(metrics, output_type, output_type_id_order) { if (is.null(metrics)) { return(get_metrics_default(output_type, output_type_id_order)) } else if (is.character(metrics)) { - return(get_metrics_character(metrics)) + return(get_metrics_character(metrics, output_type)) } else { # We do a minimal preliminary check that the provided `metrics` is a list of - # functions, leaving detailed checks to scoringutils + # functions, leaving further checks to scoringutils checkmate::assert_list(metrics, types = "function") return(metrics) } @@ -116,7 +131,7 @@ get_metrics_default <- function(output_type, output_type_id_order) { } else { # we have already validated `output_type`, so this case should not be # triggered; this case is just double checking in case we add something new - # later. + # later, to ensure we update this function. 
supported_types <- c("mean", "median", "pmf", "quantile") cli::cli_abort( "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; @@ -136,6 +151,79 @@ get_metrics_default <- function(output_type, output_type_id_order) { #' @return a list of metric functions as required by scoringutils::score() #' #' @noRd -get_metrics_character <- function(metrics) { - +get_metrics_character <- function(metrics, output_type) { + if (output_type == "quantile") { + # split into metrics for interval coverage and others + interval_metric_inds <- grepl(pattern = "^interval_coverage_[[:digit:]][[:digit:]]$", metrics) + interval_metrics <- metrics[interval_metric_inds] + other_metrics <- metrics[!interval_metric_inds] + + # validate metrics + valid_metrics <- c("ae_median", "wis", "overprediction", "underprediction", "dispersion") + invalid_metrics <- other_metrics[!other_metrics %in% valid_metrics] + valid_metrics <- c(valid_metrics, "interval_coverage_XY") + if (length(invalid_metrics) > 0) { + cli::cli_abort( + c( + "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for", + " `output_type` {.val {output_type}}: {.val {invalid_metrics}};", + " supported metrics include {.val {valid_metrics}}", + " where `XY` denotes the coverage level, e.g. \"interval_coverage_95\"." + ) + ) + } + + # assemble metric functions + interval_metric_fns <- lapply( + interval_metrics, + function(metric) { + level <- as.integer(substr(metric, 19, 20)) + return(purrr::partial(scoringutils::interval_coverage, interval_range = level)) + } + ) + names(interval_metric_fns) <- interval_metrics + + other_metric_fns <- scoringutils::metrics_quantile(select = other_metrics) + + metric_fns <- c(other_metric_fns, interval_metric_fns)[metrics] + return(metric_fns) + } else if (output_type == "pmf") { + valid_metrics <- c("log_score") + invalid_metrics <- metrics[!metrics %in% valid_metrics] + if (length(invalid_metrics) > 0) { + cli::cli_abort( + c( + "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for", + " `output_type` {.val{output_type}}: {.val {invalid_metrics}};", + " supported metrics include {.val {valid_metrics}}." + ) + ) + } + metrics <- scoringutils::metrics_nominal(select = metrics) + } else if (output_type %in% c("median", "mean")) { + valid_metrics <- c("ae_point", "se_point") + invalid_metrics <- metrics[!metrics %in% valid_metrics] + if (length(invalid_metrics) > 0) { + cli::cli_abort( + c( + "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for", + " `output_type` {.val{output_type}}: {.val {invalid_metrics}};", + " supported metrics include {.val {valid_metrics}}." + ) + ) + } + metrics <- scoringutils::metrics_point(select = metrics) + } else { + # we have already validated `output_type`, so this case should not be + # triggered; this case is just double checking in case we add something new + # later, to ensure we update this function. 
+ supported_types <- c("mean", "median", "pmf", "quantile") + cli::cli_abort( + "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; + hubEvals currently only supports the following types: + {.val {supported_types}}" + ) + } + + return(metrics) } diff --git a/man/score_model_out.Rd b/man/score_model_out.Rd index fb2c174..69b31dc 100644 --- a/man/score_model_out.Rd +++ b/man/score_model_out.Rd @@ -2,29 +2,83 @@ % Please edit documentation in R/score_model_out.R \name{score_model_out} \alias{score_model_out} -\title{Score model output predictions against observed data} +\title{Score model output predictions with a single \code{output_type} against observed data} \usage{ score_model_out( model_out_tbl, target_observations, metrics = NULL, - by = NULL, + summarize = TRUE, + by = "model_id", output_type_id_order = NULL ) } \arguments{ \item{model_out_tbl}{Model output tibble with predictions} -\item{target_observations}{Observed 'ground truth' data to be compared against predictions} +\item{target_observations}{Observed 'ground truth' data to be compared to +predictions} -\item{metrics}{Optional list of scoring metrics to computeq} +\item{metrics}{Optional list of scoring metrics to compute. See details for more.} -\item{output_type_id_order}{For ordinal variables in pmf format, this is a vector of levels for pmf forecasts, in -increasing order of the levels. For all other output types, this is ignored.} +\item{summarize}{boolean indicator of whether summaries of forecast scores +should be computed. Defaults to \code{TRUE}.} + +\item{by}{character vector naming columns to summarize by. For example, +specifying \code{by = "model_id"} (the default) will compute average scores for +each model.} + +\item{output_type_id_order}{For ordinal variables in pmf format, this is a +vector of levels for pmf forecasts, in increasing order of the levels. For +all other output types, this is ignored.} } \value{ forecast_quantile } \description{ -Score model output predictions against observed data +Score model output predictions with a single \code{output_type} against observed data +} +\details{ +If \code{metrics} is \code{NULL} (the default), this function chooses +appropriate metrics based on the \code{output_type} contained in the \code{model_out_tbl}: +\itemize{ +\item For \code{output_type == "quantile"}, we use the default metrics provided by +\code{scoringutils::metrics_quantile()}: "wis", "overprediction", "underprediction", +"dispersion", "bias", "interval_coverage_50", "interval_coverage_90", +"interval_coverage_deviation", and "ae_median" +\item For \code{output_type == "pmf"} and \code{output_type_id_order} is \code{NULL} (indicating +that the predicted variable is a nominal variable), we use the default metrics +provided by \code{scoringutils::metrics_nominal()}, currently just "log_score" +\item For \code{output_type == "median"}, we use "ae_point" +\item For \code{output_type == "mean"}, we use "se_point" +} + +It is also possible to directly provide a list of metrics, e.g. as would be +created by one of those function calls. Alternatively, a character vector of +scoring metrics can be provided. 
In this case, the following options are supported: +\itemize{ +\item \code{output_type == "median"} and \code{output_type == "median"}: +\itemize{ +\item "ae": absolute error of a point prediction (generally recommended for the median) +\item "se": squared error of a point prediction (generally recommended for the mean) +} +\item \code{output_type == "quantile"}: +\itemize{ +\item "ae_median": absolute error of the predictive median (i.e., the quantile at probability level 0.5) +\item "wis": weighted interval score (WIS) of a collection of quantile predictions +\item "overprediction": The component of WIS measuring the extent to which +predictions fell above the observation. +\item "underprediction": The component of WIS measuring the extent to which +predictions fell below the observation. +\item "dispersion": The component of WIS measuring the dispersion of forecast +distributions. +\item "interval_coverage_XX": interval coverage at the "XX" level. For example, +"interval_coverage_95" is the 95\% interval coverage rate, which would be calculated +based on quantiles at the probability levels 0.025 and 0.975. +} +\item \code{output_type == "pmf"}: +\itemize{ +\item "log_score": log score +} +} } diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R new file mode 100644 index 0000000..9f18d4a --- /dev/null +++ b/tests/testthat/test-score_model_out.R @@ -0,0 +1,491 @@ +test_that("score_model_out succeeds with valid inputs: mean output_type, default metrics, by all", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), + target_observations = forecast_target_observations, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "mean") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "mean"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + se = (.data[["value"]] - .data[["observation"]])^2 + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + se_point = mean(.data[["se"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$se_point.x, merged_scores$se_point.y) +}) + + +test_that("score_model_out succeeds with valid inputs: mean output_type, default metrics, summarize FALSE", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), + target_observations = forecast_target_observations, + summarize = FALSE + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "mean") |> + 
dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "mean"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + se = (.data[["value"]] - .data[["observation"]])^2 + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + se_point = mean(.data[["se"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$se_point.x, merged_scores$se_point.y) +}) + + +test_that("score_model_out succeeds with valid inputs: mean output_type, character metrics, custom by", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), + target_observations = forecast_target_observations, + metrics = c("ae_point", "se_point"), + by = c("model_id", "location") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "mean") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "mean"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + ae = abs(.data[["value"]] - .data[["observation"]]), + se = (.data[["value"]] - .data[["observation"]])^2 + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location") + ))) |> + dplyr::summarize( + ae_point = mean(.data[["ae"]]), + se_point = mean(.data[["se"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$ae_point.x, merged_scores$ae_point.y) + expect_equal(merged_scores$se_point.x, merged_scores$se_point.y) +}) + + +test_that("score_model_out succeeds with valid inputs: mean output_type, function metrics, custom by", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), + target_observations = forecast_target_observations, + metrics = scoringutils::metrics_point(), + by = c("model_id", "location") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "mean") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "mean"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + ae = abs(.data[["value"]] - .data[["observation"]]), + se = (.data[["value"]] - .data[["observation"]])^2, + pe = abs(.data[["value"]] - .data[["observation"]]) / abs(.data[["observation"]]) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + 
c("model_id", "location") + ))) |> + dplyr::summarize( + ae_point = mean(.data[["ae"]]), + se_point = mean(.data[["se"]]), + ape = mean(.data[["pe"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$ae_point.x, merged_scores$ae_point.y) + expect_equal(merged_scores$se_point.x, merged_scores$se_point.y) + expect_equal(merged_scores$ape.x, merged_scores$ape.y) +}) + + +test_that("score_model_out succeeds with valid inputs: median output_type, default metrics, summarize FALSE", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "median"), + target_observations = forecast_target_observations, + summarize = FALSE + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "median") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "median"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + ae = abs(.data[["value"]] - .data[["observation"]]) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + ae_point = mean(.data[["ae"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$ae_point.x, merged_scores$ae_point.y) +}) + + +test_that("score_model_out succeeds with valid inputs: quantile output_type, wis and interval metrics, custom by", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), + by = c("model_id", "location") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "quantile") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "quantile") |> + dplyr::select(-dplyr::all_of(c("output_type", "output_type_id"))), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + output_type_id = as.numeric(.data[["output_type_id"]]), + qs = ifelse( + .data[["observation"]] >= .data[["value"]], + .data[["output_type_id"]] * (.data[["observation"]] - .data[["value"]]), + (1 - .data[["output_type_id"]]) * (.data[["value"]] - .data[["observation"]]) + ), + q_coverage_80_lower = ifelse( + .data[["output_type_id"]] == 0.1, + .data[["observation"]] >= .data[["value"]], + NA_real_ + ), + q_coverage_80_upper = 
ifelse( + .data[["output_type_id"]] == 0.9, + .data[["observation"]] <= .data[["value"]], + NA_real_ + ), + q_coverage_90_lower = ifelse( + .data[["output_type_id"]] == 0.05, + .data[["observation"]] >= .data[["value"]], + NA_real_ + ), + q_coverage_90_upper = ifelse( + .data[["output_type_id"]] == 0.95, + .data[["observation"]] <= .data[["value"]], + NA_real_ + ) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + wis = 2 * mean(.data[["qs"]]), + interval_coverage_80 = (sum(.data[["q_coverage_80_lower"]], na.rm = TRUE) == 1) * + (sum(.data[["q_coverage_80_upper"]], na.rm = TRUE) == 1), + interval_coverage_90 = (sum(.data[["q_coverage_90_lower"]], na.rm = TRUE) == 1) * + (sum(.data[["q_coverage_90_upper"]], na.rm = TRUE) == 1) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location") + ))) |> + dplyr::summarize( + wis = mean(.data[["wis"]]), + interval_coverage_80 = mean(.data[["interval_coverage_80"]], na.rm = TRUE), + interval_coverage_90 = mean(.data[["interval_coverage_90"]], na.rm = TRUE), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$wis.x, merged_scores$wis.y) + expect_equal(merged_scores$interval_coverage_80.x, merged_scores$interval_coverage_80.y) + expect_equal(merged_scores$interval_coverage_90.x, merged_scores$interval_coverage_90.y) +}) + + +test_that("score_model_out succeeds with valid inputs: quantile output_type, wis/interval metrics, summarize FALSE", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), + summarize = FALSE + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "quantile") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "quantile") |> + dplyr::select(-dplyr::all_of(c("output_type", "output_type_id"))), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + output_type_id = as.numeric(.data[["output_type_id"]]), + qs = ifelse( + .data[["observation"]] >= .data[["value"]], + .data[["output_type_id"]] * (.data[["observation"]] - .data[["value"]]), + (1 - .data[["output_type_id"]]) * (.data[["value"]] - .data[["observation"]]) + ), + q_coverage_80_lower = ifelse( + .data[["output_type_id"]] == 0.1, + .data[["observation"]] >= .data[["value"]], + NA_real_ + ), + q_coverage_80_upper = ifelse( + .data[["output_type_id"]] == 0.9, + .data[["observation"]] <= .data[["value"]], + NA_real_ + ), + q_coverage_90_lower = ifelse( + .data[["output_type_id"]] == 0.05, + .data[["observation"]] >= .data[["value"]], + NA_real_ + ), + q_coverage_90_upper = ifelse( + .data[["output_type_id"]] == 0.95, + .data[["observation"]] <= .data[["value"]], + NA_real_ + ) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", 
"reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + wis = 2 * mean(.data[["qs"]]), + interval_coverage_80 = as.logical((sum(.data[["q_coverage_80_lower"]], na.rm = TRUE) == 1) * + (sum(.data[["q_coverage_80_upper"]], na.rm = TRUE) == 1)), + interval_coverage_90 = as.logical((sum(.data[["q_coverage_90_lower"]], na.rm = TRUE) == 1) * + (sum(.data[["q_coverage_90_upper"]], na.rm = TRUE) == 1)) + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$wis.x, merged_scores$wis.y) + expect_equal(merged_scores$interval_coverage_80.x, merged_scores$interval_coverage_80.y) + expect_equal(merged_scores$interval_coverage_90.x, merged_scores$interval_coverage_90.y) +}) + + +test_that("score_model_out succeeds with valid inputs: nominal pmf output_type, default metrics, custom by", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "pmf"), + target_observations = forecast_target_observations, + by = c("model_id", "location") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "pmf") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "pmf") |> + dplyr::select(-dplyr::all_of(c("output_type"))), + by = c("location", "target_end_date", "target", "output_type_id") + ) |> + dplyr::filter(.data[["observation"]] == 1) |> + dplyr::mutate( + log_score = -1 * log(.data[["value"]]) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location") + ))) |> + dplyr::summarize( + log_score = mean(.data[["log_score"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$ae_point.x, merged_scores$ae_point.y) +}) + + +test_that("score_model_out errors when model_out_tbl has multiple output_types", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs, + target_observations = forecast_target_observations + ), + regexp = "model_out_tbl must contain a single output_type, but it has multiple" + ) +}) + + +test_that("score_model_out errors when invalid interval levels are requested", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = "interval_level_5" + ), + regexp = "unsupported metric" + ) 
+ + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = "interval_level_100" + ), + regexp = "unsupported metric" + ) + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = "interval_level_XY" + ), + regexp = "unsupported metric" + ) +}) + + +test_that("score_model_out errors when invalid metrics are requested", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = "log_score" + ), + regexp = "unsupported metric" + ) + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = list(5, 6, "asdf") + ), + regexp = "Assertion on 'metrics' failed" + ) +}) + + + +test_that("score_model_out errors when an unsupported output_type is provided", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "cdf"), + target_observations = forecast_target_observations, + metrics = "log_score" + ), + regexp = "only supports the following types" + ) +}) From b92891a1953988cc893fb27256998694ee1527e8 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Thu, 22 Aug 2024 16:52:56 -0400 Subject: [PATCH 04/31] appease the linter --- R/score_model_out.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index c5ba9f2..6fdcf4f 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -132,7 +132,7 @@ get_metrics_default <- function(output_type, output_type_id_order) { # we have already validated `output_type`, so this case should not be # triggered; this case is just double checking in case we add something new # later, to ensure we update this function. - supported_types <- c("mean", "median", "pmf", "quantile") + supported_types <- c("mean", "median", "pmf", "quantile") # nolint object_use_linter cli::cli_abort( "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; hubEvals currently only supports the following types: @@ -217,7 +217,7 @@ get_metrics_character <- function(metrics, output_type) { # we have already validated `output_type`, so this case should not be # triggered; this case is just double checking in case we add something new # later, to ensure we update this function. 
- supported_types <- c("mean", "median", "pmf", "quantile") + supported_types <- c("mean", "median", "pmf", "quantile") # nolint object_use_linter cli::cli_abort( "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; hubEvals currently only supports the following types: From d6a6b362bfd994f51328f6540aa0194744e7e003 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Thu, 22 Aug 2024 16:54:02 -0400 Subject: [PATCH 05/31] update package imports --- DESCRIPTION | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 66558a6..387e8e9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,10 +38,12 @@ Description: basic tools for scoring hubverse forecasts. License: MIT + file LICENSE Encoding: UTF-8 -Imports: +Imports: + checkmate, cli, dplyr, hubUtils, + purrr, rlang, scoringutils (>= 1.2.2.9000) Remotes: From 28724e4d9da1e8ecf5eb6bb3281c534b4a69da2b Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:41:39 -0400 Subject: [PATCH 06/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 9f18d4a..d1704d7 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -1,4 +1,5 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, default metrics, by all", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 41c8c509dbb5cb46f4a398ce2dd13fcdbf3b8624 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:41:58 -0400 Subject: [PATCH 07/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index d1704d7..8585c32 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -40,6 +40,7 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, default test_that("score_model_out succeeds with valid inputs: mean output_type, default metrics, summarize FALSE", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From fd06ad7271c222270fa195004ac36879e536990b Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:42:06 -0400 Subject: [PATCH 08/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. 
Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 8585c32..5fd43f3 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -81,6 +81,7 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, default test_that("score_model_out succeeds with valid inputs: mean output_type, character metrics, custom by", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From fa64a8eda9d2df8f31d1afdf81ee6365624045c0 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:42:12 -0400 Subject: [PATCH 09/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 5fd43f3..15a05b8 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -126,6 +126,7 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, charact test_that("score_model_out succeeds with valid inputs: mean output_type, function metrics, custom by", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 7e3ce73d04e1c72ad185124cbdd85d54199c6479 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:42:25 -0400 Subject: [PATCH 10/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 15a05b8..6d16b4e 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -174,6 +174,7 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, functio test_that("score_model_out succeeds with valid inputs: median output_type, default metrics, summarize FALSE", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 5b8eb40dd0d349cf4f9315cdc6b9f9d342ece82f Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:51:20 -0400 Subject: [PATCH 11/31] Update R/score_model_out.R Co-authored-by: Zhian N. 
Kamvar --- R/score_model_out.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index 6fdcf4f..6d51a39 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -60,13 +60,13 @@ score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, metrics <- get_metrics(metrics, output_type, output_type_id_order) # assemble data for scoringutils and select output_type-specific metrics - if (output_type == "quantile") { - su_data <- transform_quantile_model_out(model_out_tbl, target_observations) - } else if (output_type == "pmf") { - su_data <- transform_pmf_model_out(model_out_tbl, target_observations, output_type_id_order) - } else if (output_type %in% c("mean", "median")) { - su_data <- transform_point_model_out(model_out_tbl, target_observations, output_type) - } + su_data <- switch(output_type, + quantile = transform_quantile_model_out(model_out_tbl, target_observations), + pmf = transform_pmf_model_out(model_out_tbl, target_observations, output_type_id_order), + mean = transform_point_model_out(model_out_tbl, target_observations, output_type), + median = transform_point_model_out(model_out_tbl, target_observations, output_type), + NULL # default, should not happen because of the validation above + ) # compute scores scores <- scoringutils::score(su_data, metrics) From a68fabbfc7e71a15e3900a46ec3e2dc8202c9b4c Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:54:42 -0400 Subject: [PATCH 12/31] Update R/score_model_out.R Co-authored-by: Zhian N. Kamvar --- R/score_model_out.R | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index 6d51a39..d6d9a78 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -120,15 +120,14 @@ get_metrics <- function(metrics, output_type, output_type_id_order) { #' #' @noRd get_metrics_default <- function(output_type, output_type_id_order) { - if (output_type == "quantile") { - metrics <- scoringutils::metrics_quantile() - } else if (output_type == "pmf") { - metrics <- scoringutils::metrics_nominal() - } else if (output_type == "mean") { - metrics <- scoringutils::metrics_point(select = "se_point") - } else if (output_type == "median") { - metrics <- scoringutils::metrics_point(select = "ae_point") - } else { + metrics <- switch(output_type, + quantile = scoringutils::metrics_quantile(), + pmf = scoringutils::metrics_nominal(), + mean = scoringutils::metrics_point(select = "se_point"), + median = scoringutils::metrics_point(select = "ae_point"), + NULL # default + ) + if (is.null(metrics)) { # we have already validated `output_type`, so this case should not be # triggered; this case is just double checking in case we add something new # later, to ensure we update this function. From a6a9c3b3d6695afe84725dc35f63f30b47bc4c84 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:00:01 -0400 Subject: [PATCH 13/31] Update R/score_model_out.R Co-authored-by: Zhian N. 
Kamvar --- R/score_model_out.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index d6d9a78..8993741 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -185,7 +185,7 @@ get_metrics_character <- function(metrics, output_type) { other_metric_fns <- scoringutils::metrics_quantile(select = other_metrics) metric_fns <- c(other_metric_fns, interval_metric_fns)[metrics] - return(metric_fns) + metrics <- metric_fns } else if (output_type == "pmf") { valid_metrics <- c("log_score") invalid_metrics <- metrics[!metrics %in% valid_metrics] From 7c8e86d59dce8fa960eba79750f4c2d5962eb5c3 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:11:35 -0400 Subject: [PATCH 14/31] Update R/score_model_out.R Co-authored-by: Zhian N. Kamvar --- R/score_model_out.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index 8993741..ea4877b 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -192,8 +192,8 @@ get_metrics_character <- function(metrics, output_type) { if (length(invalid_metrics) > 0) { cli::cli_abort( c( - "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for", - " `output_type` {.val{output_type}}: {.val {invalid_metrics}};", + "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for + `output_type` {.val {output_type}}: {.val {invalid_metrics}};", " supported metrics include {.val {valid_metrics}}." ) ) From 47f29a297916fd6908bc57358fa6adac31251a6b Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:13:22 -0400 Subject: [PATCH 15/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 6d16b4e..f417e0d 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -215,6 +215,7 @@ test_that("score_model_out succeeds with valid inputs: median output_type, defau test_that("score_model_out succeeds with valid inputs: quantile output_type, wis and interval metrics, custom by", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From fed85acc8b06ed9d31b5cd40b5f977b90007ec3d Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:13:40 -0400 Subject: [PATCH 16/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. 
Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index f417e0d..3890d08 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -369,6 +369,7 @@ test_that("score_model_out succeeds with valid inputs: quantile output_type, wis test_that("score_model_out succeeds with valid inputs: nominal pmf output_type, default metrics, custom by", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 4c6eabca84823db65f1077287ccadc64254c32e4 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:13:53 -0400 Subject: [PATCH 17/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 3890d08..269c879 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -484,6 +484,7 @@ test_that("score_model_out errors when invalid metrics are requested", { test_that("score_model_out errors when an unsupported output_type is provided", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 6c443e6c823f958e1aaf4d1bf883fd37a1c0b3e5 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:14:08 -0400 Subject: [PATCH 18/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 269c879..0c0276a 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -459,6 +459,7 @@ test_that("score_model_out errors when invalid interval levels are requested", { test_that("score_model_out errors when invalid metrics are requested", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 778be41cc0b1a35f9339ac13a207fb6fea91df4c Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:14:29 -0400 Subject: [PATCH 19/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. 
Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 0c0276a..05ba525 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -412,6 +412,7 @@ test_that("score_model_out succeeds with valid inputs: nominal pmf output_type, test_that("score_model_out errors when model_out_tbl has multiple output_types", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From f27b164afb888d1242f88f5c55f88e64c9d13054 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:14:49 -0400 Subject: [PATCH 20/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 05ba525..7533f78 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -427,6 +427,7 @@ test_that("score_model_out errors when model_out_tbl has multiple output_types", test_that("score_model_out errors when invalid interval levels are requested", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 23675188f62b9d1540f2be233b5a64d61b1e7ba3 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:14:59 -0400 Subject: [PATCH 21/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 7533f78..17ab677 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -297,6 +297,7 @@ test_that("score_model_out succeeds with valid inputs: quantile output_type, wis test_that("score_model_out succeeds with valid inputs: quantile output_type, wis/interval metrics, summarize FALSE", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 2c740d710ce74049b3a3dc6391fbc6b55a20dabf Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:15:50 -0400 Subject: [PATCH 22/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. 
Kamvar --- tests/testthat/test-score_model_out.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 17ab677..b8f8300 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -352,7 +352,8 @@ test_that("score_model_out succeeds with valid inputs: quantile output_type, wis interval_coverage_80 = as.logical((sum(.data[["q_coverage_80_lower"]], na.rm = TRUE) == 1) * (sum(.data[["q_coverage_80_upper"]], na.rm = TRUE) == 1)), interval_coverage_90 = as.logical((sum(.data[["q_coverage_90_lower"]], na.rm = TRUE) == 1) * - (sum(.data[["q_coverage_90_upper"]], na.rm = TRUE) == 1)) + (sum(.data[["q_coverage_90_upper"]], na.rm = TRUE) == 1)), + .groups = "drop" ) # same column names, number of rows, and score values From 79a48285b628ce5ead6243c1549d4fe5ffb2457d Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:32:03 -0400 Subject: [PATCH 23/31] updates from review: document lack of checks for statistical validity of metric functions, split out validation for output_type --- R/score_model_out.R | 18 +++++++++--------- R/validate.R | 9 +++++++-- man/score_model_out.Rd | 11 ++++++++--- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index ea4877b..2d97b84 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -25,9 +25,8 @@ #' - For `output_type == "median"`, we use "ae_point" #' - For `output_type == "mean"`, we use "se_point" #' -#' It is also possible to directly provide a list of metrics, e.g. as would be -#' created by one of those function calls. Alternatively, a character vector of -#' scoring metrics can be provided. In this case, the following options are supported: +#' Alternatively, a character vector of scoring metrics can be provided. In this +#' case, the following options are supported: #' - `output_type == "median"` and `output_type == "median"`: #' - "ae": absolute error of a point prediction (generally recommended for the median) #' - "se": squared error of a point prediction (generally recommended for the mean) @@ -46,6 +45,12 @@ #' - `output_type == "pmf"`: #' - "log_score": log score #' +#' For more flexibility, it is also possible to directly provide a list of +#' functions to compute the desired metrics, e.g. as would be created by one of +#' the `scoringutils::metrics_*` methods. Note that in this case, `hubEvals` +#' only validates that a list of functions has been provided; no checks for the +#' statistical validity of these metric functions are done. +#' #' @return forecast_quantile #' #' @export @@ -216,12 +221,7 @@ get_metrics_character <- function(metrics, output_type) { # we have already validated `output_type`, so this case should not be # triggered; this case is just double checking in case we add something new # later, to ensure we update this function. 
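A rough sketch of how the "interval_coverage_XY" metric names handled above can be turned into scoring functions for the quantile branch. This is illustrative only: it assumes the development version of scoringutils targeted by this package exports interval_coverage() with an interval_range argument and that purrr is available; the package's actual helper may differ.

# Sketch, not package code: build interval coverage metric functions from names
interval_metrics <- c("interval_coverage_80", "interval_coverage_90")
coverage_levels <- as.numeric(sub("interval_coverage_", "", interval_metrics))
interval_metric_fns <- lapply(
  coverage_levels,
  function(level) purrr::partial(scoringutils::interval_coverage, interval_range = level)
)
# names are needed so that c(other_metric_fns, interval_metric_fns)[metrics]
# can select functions by metric name, as in the function above
names(interval_metric_fns) <- interval_metrics
other_metric_fns <- scoringutils::metrics_quantile(select = c("wis", "ae_median"))
metric_fns <- c(other_metric_fns, interval_metric_fns)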
- supported_types <- c("mean", "median", "pmf", "quantile") # nolint object_use_linter - cli::cli_abort( - "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; - hubEvals currently only supports the following types: - {.val {supported_types}}" - ) + error_if_invalid_output_type(output_type) } return(metrics) diff --git a/R/validate.R b/R/validate.R index 0f00d61..07db0c7 100644 --- a/R/validate.R +++ b/R/validate.R @@ -83,6 +83,13 @@ validate_output_type <- function(model_out_tbl) { ) } + error_if_invalid_output_type(output_type) + + return(output_type) +} + + +error_if_invalid_output_type <- function(output_type) { supported_types <- c("mean", "median", "pmf", "quantile") if (!output_type %in% supported_types) { cli::cli_abort( @@ -91,6 +98,4 @@ validate_output_type <- function(model_out_tbl) { {.val {supported_types}}" ) } - - return(output_type) } diff --git a/man/score_model_out.Rd b/man/score_model_out.Rd index 69b31dc..6fcd864 100644 --- a/man/score_model_out.Rd +++ b/man/score_model_out.Rd @@ -53,9 +53,8 @@ provided by \code{scoringutils::metrics_nominal()}, currently just "log_score" \item For \code{output_type == "mean"}, we use "se_point" } -It is also possible to directly provide a list of metrics, e.g. as would be -created by one of those function calls. Alternatively, a character vector of -scoring metrics can be provided. In this case, the following options are supported: +Alternatively, a character vector of scoring metrics can be provided. In this +case, the following options are supported: \itemize{ \item \code{output_type == "median"} and \code{output_type == "median"}: \itemize{ @@ -81,4 +80,10 @@ based on quantiles at the probability levels 0.025 and 0.975. \item "log_score": log score } } + +For more flexibility, it is also possible to directly provide a list of +functions to compute the desired metrics, e.g. as would be created by one of +the \verb{scoringutils::metrics_*} methods. Note that in this case, \code{hubEvals} +only validates that a list of functions has been provided; no checks for the +statistical validity of these metric functions are done. 
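At this point in the series the documentation above still allows a list of metric functions to be passed straight through to scoringutils (a later patch restricts `metrics` to character vectors). A minimal usage sketch under that behaviour, assuming a quantile-format `model_out_tbl` and matching `target_observations` already exist:

scores <- score_model_out(
  model_out_tbl = model_out_tbl,
  target_observations = target_observations,
  metrics = scoringutils::metrics_quantile(select = c("wis", "interval_coverage_90")),
  by = "model_id"
)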
} From 6e09adc24d9bcbd97cf3c4c91747e132ff2e9fba Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:35:24 -0400 Subject: [PATCH 24/31] fix up comment spacing --- tests/testthat/test-score_model_out.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index b8f8300..a77c764 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -81,7 +81,7 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, default test_that("score_model_out succeeds with valid inputs: mean output_type, character metrics, custom by", { - # Forecast data from HubExamples: + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 5d45a2ac8cd9e849359b8ce877e1e05de73db0a9 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:50:27 -0400 Subject: [PATCH 25/31] refactor metric validation messaging --- R/score_model_out.R | 54 +++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index 2d97b84..7d93901 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -165,17 +165,12 @@ get_metrics_character <- function(metrics, output_type) { # validate metrics valid_metrics <- c("ae_median", "wis", "overprediction", "underprediction", "dispersion") invalid_metrics <- other_metrics[!other_metrics %in% valid_metrics] - valid_metrics <- c(valid_metrics, "interval_coverage_XY") - if (length(invalid_metrics) > 0) { - cli::cli_abort( - c( - "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for", - " `output_type` {.val {output_type}}: {.val {invalid_metrics}};", - " supported metrics include {.val {valid_metrics}}", - " where `XY` denotes the coverage level, e.g. \"interval_coverage_95\"." - ) - ) - } + error_if_invalid_metrics( + valid_metrics = c(valid_metrics, "interval_coverage_XY"), + invalid_metrics = invalid_metrics, + output_type = output_type, + comment = c("i" = "NOTE: `XY` denotes the coverage level, e.g. {.val interval_coverage_95}.") + ) # assemble metric functions interval_metric_fns <- lapply( @@ -194,28 +189,14 @@ get_metrics_character <- function(metrics, output_type) { } else if (output_type == "pmf") { valid_metrics <- c("log_score") invalid_metrics <- metrics[!metrics %in% valid_metrics] - if (length(invalid_metrics) > 0) { - cli::cli_abort( - c( - "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for - `output_type` {.val {output_type}}: {.val {invalid_metrics}};", - " supported metrics include {.val {valid_metrics}}." - ) - ) - } + error_if_invalid_metrics(valid_metrics, invalid_metrics, output_type) + metrics <- scoringutils::metrics_nominal(select = metrics) } else if (output_type %in% c("median", "mean")) { valid_metrics <- c("ae_point", "se_point") invalid_metrics <- metrics[!metrics %in% valid_metrics] - if (length(invalid_metrics) > 0) { - cli::cli_abort( - c( - "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for", - " `output_type` {.val{output_type}}: {.val {invalid_metrics}};", - " supported metrics include {.val {valid_metrics}}." 
- ) - ) - } + error_if_invalid_metrics(valid_metrics, invalid_metrics, output_type) + metrics <- scoringutils::metrics_point(select = metrics) } else { # we have already validated `output_type`, so this case should not be @@ -226,3 +207,18 @@ get_metrics_character <- function(metrics, output_type) { return(metrics) } + + +error_if_invalid_metrics <- function(valid_metrics, invalid_metrics, output_type, comment = NULL) { + n <- length(invalid_metrics) + if (n > 0) { + cli::cli_abort( + c( + "`metrics` had {n} unsupported metric{?s} for + {.arg output_type} {.val {output_type}}: {.strong {.val {invalid_metrics}}}; + supported metrics include {.val {valid_metrics}}.", + comment + ) + ) + } +} From ba70640318ef09da2db117ce91dcb61d199f5d33 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 12:32:18 -0400 Subject: [PATCH 26/31] add examples for score_model_out --- DESCRIPTION | 2 ++ R/score_model_out.R | 33 ++++++++++++++++++++++++++++----- man/score_model_out.Rd | 28 ++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 387e8e9..c711112 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -48,6 +48,7 @@ Imports: scoringutils (>= 1.2.2.9000) Remotes: epiforecasts/scoringutils, + hubverse-org/hubExamples, hubverse-org/hubUtils Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.1 @@ -57,5 +58,6 @@ Depends: LazyData: true Config/Needs/website: hubverse-org/hubStyle Suggests: + hubExamples, testthat (>= 3.0.0) Config/testthat/edition: 3 diff --git a/R/score_model_out.R b/R/score_model_out.R index 7d93901..b98fe8d 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -51,6 +51,32 @@ #' only validates that a list of functions has been provided; no checks for the #' statistical validity of these metric functions are done. #' +#' @examplesIf requireNamespace("hubExamples", quietly = TRUE) +#' # compute WIS and interval coverage rates at 80% and 90% levels based on +#' # quantile forecasts, summarized by the mean score for each model +#' quantile_scores <- score_model_out( +#' model_out_tbl = hubExamples::forecast_outputs |> +#' dplyr::filter(.data[["output_type"]] == "quantile"), +#' target_observations = hubExamples::forecast_target_observations, +#' metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), +#' by = c("model_id") +#' ) +#' quantile_scores +#' +#' # compute log scores based on pmf predictions for categorical targets, +#' # summarized by the mean score for each combination of model and location. +#' # Note: if the model_out_tbl had forecasts for multiple targets using a +#' # pmf output_type with different bins, it would be necessary to score the +#' # predictions for those targets separately. 
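The error_if_invalid_metrics() helper added above relies on cli's inline markup and pluralization. A small, self-contained illustration of the same message construction (hypothetical metric names, not package code):

invalid_metrics <- c("crps", "brier_score")  # hypothetical unsupported names
valid_metrics <- "log_score"
cli::cli_abort(
  "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for
  {.arg output_type} {.val pmf}: {.strong {.val {invalid_metrics}}};
  supported metrics include {.val {valid_metrics}}."
)
# {?s} pluralizes from the preceding interpolated count, and {.val}/{.arg}/
# {.strong} apply cli's inline styling when the error message is printed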
+#' pmf_scores <- score_model_out( +#' model_out_tbl = hubExamples::forecast_outputs |> +#' dplyr::filter(.data[["output_type"]] == "pmf"), +#' target_observations = hubExamples::forecast_target_observations, +#' metrics = "log_score", +#' by = c("model_id", "location", "horizon") +#' ) +#' head(pmf_scores) +#' #' @return forecast_quantile #' #' @export @@ -64,7 +90,7 @@ score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, # get/validate the scoring metrics metrics <- get_metrics(metrics, output_type, output_type_id_order) - # assemble data for scoringutils and select output_type-specific metrics + # assemble data for scoringutils su_data <- switch(output_type, quantile = transform_quantile_model_out(model_out_tbl, target_observations), pmf = transform_pmf_model_out(model_out_tbl, target_observations, output_type_id_order), @@ -77,10 +103,7 @@ score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, scores <- scoringutils::score(su_data, metrics) # switch back to hubverse naming conventions for model name - scores <- dplyr::rename( - scores, - model_id = "model" - ) + scores <- dplyr::rename(scores, model_id = "model") # if present, drop predicted and observed columns drop_cols <- c("predicted", "observed") diff --git a/man/score_model_out.Rd b/man/score_model_out.Rd index 6fcd864..15fefb2 100644 --- a/man/score_model_out.Rd +++ b/man/score_model_out.Rd @@ -87,3 +87,31 @@ the \verb{scoringutils::metrics_*} methods. Note that in this case, \code{hubEva only validates that a list of functions has been provided; no checks for the statistical validity of these metric functions are done. } +\examples{ +\dontshow{if (requireNamespace("hubExamples", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# compute WIS and interval coverage rates at 80\% and 90\% levels based on +# quantile forecasts, summarized by the mean score for each model +quantile_scores <- score_model_out( + model_out_tbl = hubExamples::forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = hubExamples::forecast_target_observations, + metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), + by = c("model_id") +) +quantile_scores + +# compute log scores based on pmf predictions for categorical targets, +# summarized by the mean score for each combination of model and location. +# Note: if the model_out_tbl had forecasts for multiple targets using a +# pmf output_type with different bins, it would be necessary to score the +# predictions for those targets separately. 
+pmf_scores <- score_model_out( + model_out_tbl = hubExamples::forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "pmf"), + target_observations = hubExamples::forecast_target_observations, + metrics = "log_score", + by = c("model_id", "location", "horizon") +) +head(pmf_scores) +\dontshow{\}) # examplesIf} +} From 7df18036d9fb640bfcc3f2275ebf10a3de630e05 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Wed, 28 Aug 2024 11:50:09 -0400 Subject: [PATCH 27/31] Update R/score_model_out.R Co-authored-by: Nikos Bosse <37978797+nikosbosse@users.noreply.github.com> --- R/score_model_out.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index b98fe8d..c6748fa 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -4,7 +4,7 @@ #' @param target_observations Observed 'ground truth' data to be compared to #' predictions #' @param metrics Optional list of scoring metrics to compute. See details for more. -#' @param summarize boolean indicator of whether summaries of forecast scores +#' @param summarize Boolean indicator of whether summaries of forecast scores #' should be computed. Defaults to `TRUE`. #' @param by character vector naming columns to summarize by. For example, #' specifying `by = "model_id"` (the default) will compute average scores for From a9c27956fed4cf1b477f07fe2f2fe819b3f4b41f Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Wed, 28 Aug 2024 11:50:14 -0400 Subject: [PATCH 28/31] Update R/score_model_out.R Co-authored-by: Nikos Bosse <37978797+nikosbosse@users.noreply.github.com> --- R/score_model_out.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index c6748fa..6be76e5 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -6,7 +6,7 @@ #' @param metrics Optional list of scoring metrics to compute. See details for more. #' @param summarize Boolean indicator of whether summaries of forecast scores #' should be computed. Defaults to `TRUE`. -#' @param by character vector naming columns to summarize by. For example, +#' @param by Character vector naming columns to summarize by. For example, #' specifying `by = "model_id"` (the default) will compute average scores for #' each model. 
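The summarize = FALSE path covered by the tests can be sketched with the same hubExamples data used in the package examples. Illustrative only; it skips the averaging over `by` groups and returns unsummarized scores:

raw_scores <- score_model_out(
  model_out_tbl = hubExamples::forecast_outputs |>
    dplyr::filter(.data[["output_type"]] == "quantile"),
  target_observations = hubExamples::forecast_target_observations,
  metrics = c("wis", "interval_coverage_80", "interval_coverage_90"),
  summarize = FALSE
)
head(raw_scores)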
#' @param output_type_id_order For ordinal variables in pmf format, this is a From 18ea11aa4561e80654bef58748003679145c4ca7 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Thu, 29 Aug 2024 11:51:18 -0400 Subject: [PATCH 29/31] updates to score_model_out docs --- R/score_model_out.R | 29 +++++++++++++++-------------- man/score_model_out.Rd | 15 +++++++-------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index b98fe8d..059e194 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -15,22 +15,23 @@ #' #' @details If `metrics` is `NULL` (the default), this function chooses #' appropriate metrics based on the `output_type` contained in the `model_out_tbl`: -#' - For `output_type == "quantile"`, we use the default metrics provided by -#' `scoringutils::metrics_quantile()`: "wis", "overprediction", "underprediction", -#' "dispersion", "bias", "interval_coverage_50", "interval_coverage_90", -#' "interval_coverage_deviation", and "ae_median" -#' - For `output_type == "pmf"` and `output_type_id_order` is `NULL` (indicating -#' that the predicted variable is a nominal variable), we use the default metrics -#' provided by `scoringutils::metrics_nominal()`, currently just "log_score" -#' - For `output_type == "median"`, we use "ae_point" -#' - For `output_type == "mean"`, we use "se_point" +#' \itemize{ +#' \item For `output_type == "quantile"`, we use the default metrics provided by +#' `scoringutils::metrics_quantile()`: `r names(scoringutils::metrics_quantile())` +#' \item For `output_type == "pmf"` and `output_type_id_order` is `NULL` (indicating +#' that the predicted variable is a nominal variable), we use the default metric +#' provided by `scoringutils::metrics_nominal()`, +#' `r names(scoringutils::metrics_nominal())` +#' \item For `output_type == "median"`, we use "ae_point" +#' \item For `output_type == "mean"`, we use "se_point" +#' } #' #' Alternatively, a character vector of scoring metrics can be provided. In this #' case, the following options are supported: -#' - `output_type == "median"` and `output_type == "median"`: -#' - "ae": absolute error of a point prediction (generally recommended for the median) -#' - "se": squared error of a point prediction (generally recommended for the mean) -#' - `output_type == "quantile"`: +#' - `output_type == "median"` and `output_type == "mean"`: +#' - "ae_point": absolute error of a point prediction (generally recommended for the median) +#' - "se_point": squared error of a point prediction (generally recommended for the mean) +#' - `output_type == "quantile"`: #' - "ae_median": absolute error of the predictive median (i.e., the quantile at probability level 0.5) #' - "wis": weighted interval score (WIS) of a collection of quantile predictions #' - "overprediction": The component of WIS measuring the extent to which @@ -42,7 +43,7 @@ #' - "interval_coverage_XX": interval coverage at the "XX" level. For example, #' "interval_coverage_95" is the 95% interval coverage rate, which would be calculated #' based on quantiles at the probability levels 0.025 and 0.975. 
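The inline `r ...` chunks introduced above are evaluated by roxygen2 (the package enables markdown via Roxygen: list(markdown = TRUE)) when the documentation is rebuilt, so the listed default metric names stay in sync with scoringutils. A minimal illustration of the mechanism:

# roxygen2 evaluates the backtick-r expression when devtools::document() runs,
# inserting the current default metric names into the generated .Rd file:
#' @details
#' The default quantile metrics are
#' `r names(scoringutils::metrics_quantile())`.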
-#' - `output_type == "pmf"`: +#' - `output_type == "pmf"`: #' - "log_score": log score #' #' For more flexibility, it is also possible to directly provide a list of diff --git a/man/score_model_out.Rd b/man/score_model_out.Rd index 15fefb2..f7f8dba 100644 --- a/man/score_model_out.Rd +++ b/man/score_model_out.Rd @@ -43,12 +43,11 @@ If \code{metrics} is \code{NULL} (the default), this function chooses appropriate metrics based on the \code{output_type} contained in the \code{model_out_tbl}: \itemize{ \item For \code{output_type == "quantile"}, we use the default metrics provided by -\code{scoringutils::metrics_quantile()}: "wis", "overprediction", "underprediction", -"dispersion", "bias", "interval_coverage_50", "interval_coverage_90", -"interval_coverage_deviation", and "ae_median" +\code{scoringutils::metrics_quantile()}: wis, overprediction, underprediction, dispersion, bias, interval_coverage_50, interval_coverage_90, interval_coverage_deviation, ae_median \item For \code{output_type == "pmf"} and \code{output_type_id_order} is \code{NULL} (indicating -that the predicted variable is a nominal variable), we use the default metrics -provided by \code{scoringutils::metrics_nominal()}, currently just "log_score" +that the predicted variable is a nominal variable), we use the default metric +provided by \code{scoringutils::metrics_nominal()}, +log_score \item For \code{output_type == "median"}, we use "ae_point" \item For \code{output_type == "mean"}, we use "se_point" } @@ -56,10 +55,10 @@ provided by \code{scoringutils::metrics_nominal()}, currently just "log_score" Alternatively, a character vector of scoring metrics can be provided. In this case, the following options are supported: \itemize{ -\item \code{output_type == "median"} and \code{output_type == "median"}: +\item \code{output_type == "median"} and \code{output_type == "mean"}: \itemize{ -\item "ae": absolute error of a point prediction (generally recommended for the median) -\item "se": squared error of a point prediction (generally recommended for the mean) +\item "ae_point": absolute error of a point prediction (generally recommended for the median) +\item "se_point": squared error of a point prediction (generally recommended for the mean) } \item \code{output_type == "quantile"}: \itemize{ From 587c92202633b3f2f45aeec2cbed95e881b1dd5a Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Wed, 11 Sep 2024 16:53:28 -0400 Subject: [PATCH 30/31] do not support lists of functions in score_model_out --- R/score_model_out.R | 17 +++----- tests/testthat/test-score_model_out.R | 60 +++++---------------------- 2 files changed, 16 insertions(+), 61 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index adfbd28..a2c8525 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -3,7 +3,7 @@ #' @param model_out_tbl Model output tibble with predictions #' @param target_observations Observed 'ground truth' data to be compared to #' predictions -#' @param metrics Optional list of scoring metrics to compute. See details for more. +#' @param metrics Optional character vector of scoring metrics to compute. See details for more. #' @param summarize Boolean indicator of whether summaries of forecast scores #' should be computed. Defaults to `TRUE`. #' @param by Character vector naming columns to summarize by. For example, @@ -46,12 +46,6 @@ #' - `output_type == "pmf"`: #' - "log_score": log score #' -#' For more flexibility, it is also possible to directly provide a list of -#' functions to compute the desired metrics, e.g. 
as would be created by one of -#' the `scoringutils::metrics_*` methods. Note that in this case, `hubEvals` -#' only validates that a list of functions has been provided; no checks for the -#' statistical validity of these metric functions are done. -#' #' @examplesIf requireNamespace("hubExamples", quietly = TRUE) #' # compute WIS and interval coverage rates at 80% and 90% levels based on #' # quantile forecasts, summarized by the mean score for each model @@ -60,7 +54,7 @@ #' dplyr::filter(.data[["output_type"]] == "quantile"), #' target_observations = hubExamples::forecast_target_observations, #' metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), -#' by = c("model_id") +#' by = "model_id" #' ) #' quantile_scores #' @@ -133,10 +127,9 @@ get_metrics <- function(metrics, output_type, output_type_id_order) { } else if (is.character(metrics)) { return(get_metrics_character(metrics, output_type)) } else { - # We do a minimal preliminary check that the provided `metrics` is a list of - # functions, leaving further checks to scoringutils - checkmate::assert_list(metrics, types = "function") - return(metrics) + cli::cli_abort( + "{.arg metrics} must be either `NULL` or a character vector of supported metrics." + ) } } diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index a77c764..8987ee9 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -125,54 +125,6 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, charact }) -test_that("score_model_out succeeds with valid inputs: mean output_type, function metrics, custom by", { - # Forecast data from HubExamples: - load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs - load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations - - act_scores <- score_model_out( - model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), - target_observations = forecast_target_observations, - metrics = scoringutils::metrics_point(), - by = c("model_id", "location") - ) - - exp_scores <- forecast_outputs |> - dplyr::filter(.data[["output_type"]] == "mean") |> - dplyr::left_join( - forecast_target_observations |> - dplyr::filter(.data[["output_type"]] == "mean"), - by = c("location", "target_end_date", "target") - ) |> - dplyr::mutate( - ae = abs(.data[["value"]] - .data[["observation"]]), - se = (.data[["value"]] - .data[["observation"]])^2, - pe = abs(.data[["value"]] - .data[["observation"]]) / abs(.data[["observation"]]) - ) |> - dplyr::group_by(dplyr::across(dplyr::all_of( - c("model_id", "location") - ))) |> - dplyr::summarize( - ae_point = mean(.data[["ae"]]), - se_point = mean(.data[["se"]]), - ape = mean(.data[["pe"]]), - .groups = "drop" - ) - - # same column names, number of rows, and score values - expect_equal(colnames(act_scores), colnames(exp_scores)) - expect_equal(nrow(act_scores), nrow(exp_scores)) - merged_scores <- dplyr::full_join( - act_scores, exp_scores, - by = c("model_id", "location") - ) - expect_equal(nrow(act_scores), nrow(merged_scores)) - expect_equal(merged_scores$ae_point.x, merged_scores$ae_point.y) - expect_equal(merged_scores$se_point.x, merged_scores$se_point.y) - expect_equal(merged_scores$ape.x, merged_scores$ape.y) -}) - - test_that("score_model_out succeeds with valid inputs: median output_type, default metrics, summarize FALSE", { # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets 
forecast_outputs @@ -482,7 +434,17 @@ test_that("score_model_out errors when invalid metrics are requested", { target_observations = forecast_target_observations, metrics = list(5, 6, "asdf") ), - regexp = "Assertion on 'metrics' failed" + regexp = "`metrics` must be either `NULL` or a character vector of supported metrics." + ) + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), + target_observations = forecast_target_observations, + metrics = scoringutils::metrics_point(), + by = c("model_id", "location") + ), + regexp = "`metrics` must be either `NULL` or a character vector of supported metrics." ) }) From 78f729535c7a16c5c9751ce32c14e1358c9ba44a Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Wed, 11 Sep 2024 16:56:57 -0400 Subject: [PATCH 31/31] update docs --- man/score_model_out.Rd | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/man/score_model_out.Rd b/man/score_model_out.Rd index f7f8dba..8c42a92 100644 --- a/man/score_model_out.Rd +++ b/man/score_model_out.Rd @@ -19,12 +19,12 @@ score_model_out( \item{target_observations}{Observed 'ground truth' data to be compared to predictions} -\item{metrics}{Optional list of scoring metrics to compute. See details for more.} +\item{metrics}{Optional character vector of scoring metrics to compute. See details for more.} -\item{summarize}{boolean indicator of whether summaries of forecast scores +\item{summarize}{Boolean indicator of whether summaries of forecast scores should be computed. Defaults to \code{TRUE}.} -\item{by}{character vector naming columns to summarize by. For example, +\item{by}{Character vector naming columns to summarize by. For example, specifying \code{by = "model_id"} (the default) will compute average scores for each model.} @@ -79,12 +79,6 @@ based on quantiles at the probability levels 0.025 and 0.975. \item "log_score": log score } } - -For more flexibility, it is also possible to directly provide a list of -functions to compute the desired metrics, e.g. as would be created by one of -the \verb{scoringutils::metrics_*} methods. Note that in this case, \code{hubEvals} -only validates that a list of functions has been provided; no checks for the -statistical validity of these metric functions are done. } \examples{ \dontshow{if (requireNamespace("hubExamples", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} @@ -95,7 +89,7 @@ quantile_scores <- score_model_out( dplyr::filter(.data[["output_type"]] == "quantile"), target_observations = hubExamples::forecast_target_observations, metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), - by = c("model_id") + by = "model_id" ) quantile_scores