From 1f51495b4085aa3a4ce217593cc062404361896e Mon Sep 17 00:00:00 2001
From: Evan Ray
Date: Tue, 20 Aug 2024 10:00:31 -0400
Subject: [PATCH 01/31] initial draft of score_model_out

---
 NAMESPACE              |  1 +
 R/score_model_out.R    | 36 ++++++++++++++++++++++++++++++++++++
 R/validate.R           | 28 ++++++++++++++++++++++++++++
 man/score_model_out.Rd | 30 ++++++++++++++++++++++++++++++
 4 files changed, 95 insertions(+)
 create mode 100644 R/score_model_out.R
 create mode 100644 man/score_model_out.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 6ae9268..f267329 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,2 +1,3 @@
 # Generated by roxygen2: do not edit by hand
+export(score_model_out)
diff --git a/R/score_model_out.R b/R/score_model_out.R
new file mode 100644
index 0000000..d4b35ad
--- /dev/null
+++ b/R/score_model_out.R
@@ -0,0 +1,36 @@
+#' Score model output predictions against observed data
+#'
+#' @param model_out_tbl Model output tibble with predictions
+#' @param target_observations Observed 'ground truth' data to be compared against predictions
+#' @param metrics Optional list of scoring metrics to computeq
+#' @param output_type_id_order For ordinal variables in pmf format, this is a vector of levels for pmf forecasts, in
+#' increasing order of the levels. For all other output types, this is ignored.
+#'
+#' @return forecast_quantile
+#'
+#' @export
+score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, by = NULL,
+                            output_type_id_order = NULL) {
+  # check that model_out_tbl has a single output_type that is supported by this package
+  output_type <- validate_output_type(model_out_tbl)
+
+  # assemble data for scoringutils
+  if (output_type == "quantile") {
+    su_data <- transform_quantile_model_out(model_out_tbl, target_observations)
+    if (is.null(metrics)) {
+      metrics <- scoringutils::metrics_quantile()
+    }
+  } else if (output_type == "pmf") {
+    su_data <- transform_pmf_model_out(model_out_tbl, target_observations, output_type_id_order)
+    if (is.null(metrics)) {
+      metrics <- scoringutils::metrics_nominal()
+    }
+  } else if (output_type %in% c("mean", "median")) {
+    su_data <- transform_point_model_out(model_out_tbl, target_observations, output_type)
+  }
+
+  # compute scores
+  scores <- scoringutils::score(su_data, metrics)
+
+  return(scores)
+}
diff --git a/R/validate.R b/R/validate.R
index 2f0cb13..e4efcf9 100644
--- a/R/validate.R
+++ b/R/validate.R
@@ -55,3 +55,31 @@ validate_model_out_target_obs <- function(model_out_tbl, target_observations) {
   return(model_out_tbl)
 }
+
+
+#' Check that model_out_tble has a single `output_type` that is one of the
+#' `output_types` that is supported by this function.
+#' +#' @return if valid, the output_type in model_out_tbl +#' +#' @noRd +validate_output_type <- function(model_out_tbl) { + output_type <- unique(model_out_tbl$output_type) + if (length(output_type) != 1) { + cli::cli_abort( + "model_out_tbl must contain a single output_type, but it has multiple: + {.val {output_type}}" + ) + } + + supported_types <- c("mean", "median", "pmf", "quantile") + if (!output_type %in% supported_types) { + cli::cli_abort( + "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; + hubEvals currently only supports the following types: + {.val {supported_types}}" + ) + } + + return(output_type) +} diff --git a/man/score_model_out.Rd b/man/score_model_out.Rd new file mode 100644 index 0000000..fb2c174 --- /dev/null +++ b/man/score_model_out.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/score_model_out.R +\name{score_model_out} +\alias{score_model_out} +\title{Score model output predictions against observed data} +\usage{ +score_model_out( + model_out_tbl, + target_observations, + metrics = NULL, + by = NULL, + output_type_id_order = NULL +) +} +\arguments{ +\item{model_out_tbl}{Model output tibble with predictions} + +\item{target_observations}{Observed 'ground truth' data to be compared against predictions} + +\item{metrics}{Optional list of scoring metrics to computeq} + +\item{output_type_id_order}{For ordinal variables in pmf format, this is a vector of levels for pmf forecasts, in +increasing order of the levels. For all other output types, this is ignored.} +} +\value{ +forecast_quantile +} +\description{ +Score model output predictions against observed data +} From e101808fcd8a8bff8ea5fe6417b1b59b7004ebc2 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Thu, 22 Aug 2024 11:32:46 -0400 Subject: [PATCH 02/31] some partial progress on score_model_out --- R/score_model_out.R | 119 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 112 insertions(+), 7 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index d4b35ad..b46dc6f 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -1,20 +1,46 @@ -#' Score model output predictions against observed data +#' Score model output predictions with a single `output_type` against observed data #' #' @param model_out_tbl Model output tibble with predictions -#' @param target_observations Observed 'ground truth' data to be compared against predictions -#' @param metrics Optional list of scoring metrics to computeq -#' @param output_type_id_order For ordinal variables in pmf format, this is a vector of levels for pmf forecasts, in -#' increasing order of the levels. For all other output types, this is ignored. +#' @param target_observations Observed 'ground truth' data to be compared to +#' predictions +#' @param metrics Optional list of scoring metrics to compute. See details for more. +#' @param summarize boolean indicator of whether summaries of forecast scores +#' should be computed. Defaults to `TRUE`. +#' @param by character vector naming columns to summarize by. For example, +#' specifying `by = "model_id"` will compute average scores for each model. The +#' default, `NULL`, will summarize across `output_type_id` levels (e.g., +#' quantile levels or categories for a pmf forecast) to produce a single row of +#' scores for each combination of model and prediction task. +#' @param output_type_id_order For ordinal variables in pmf format, this is a +#' vector of levels for pmf forecasts, in increasing order of the levels. 
For +#' all other output types, this is ignored. +#' +#' @details If `metrics` is `NULL` (the default), this function chooses +#' appropriate metrics based on the `output_type` contained in the `model_out_tbl`: +#' - For `output_type == "quantile"`, we use the default metrics provided by +#' `scoringutils::metrics_quantile()`: "wis", "overprediction", "underprediction", +#' "dispersion", "bias", "interval_coverage_50", "interval_coverage_90", +#' "interval_coverage_deviation", and "ae_median" +#' - For `output_type == "pmf"` and `output_type_id_order` is `NULL` (indicating +#' that the predicted variable is a nominal variable), we use the default metrics +#' provided by `scoringutils::metrics_nominal()` +#' - For `output_type == "median"`, we use "ae_point" +#' - For `output_type == "mean"`, we use "se_point" #' #' @return forecast_quantile #' #' @export -score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, by = NULL, +score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, + summarize = TRUE, by = NULL, output_type_id_order = NULL) { # check that model_out_tbl has a single output_type that is supported by this package + # also, retrieve that output_type output_type <- validate_output_type(model_out_tbl) - # assemble data for scoringutils + # get/validate the scoring metrics + metrics <- get_metrics(metrics, output_type, output_type_id_order) + + # assemble data for scoringutils and select output_type-specific metrics if (output_type == "quantile") { su_data <- transform_quantile_model_out(model_out_tbl, target_observations) if (is.null(metrics)) { @@ -27,10 +53,89 @@ score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, } } else if (output_type %in% c("mean", "median")) { su_data <- transform_point_model_out(model_out_tbl, target_observations, output_type) + if (is.null(metrics)) { + metrics <- scoringutils::metrics_point( + select = ifelse(output_type == "mean", "se_point", "ae_point") + ) + } } # compute scores scores <- scoringutils::score(su_data, metrics) + # switch back to hubverse convention of "model_id" rather than "model" + scores <- dplyr::rename(scores, model_id = "model") + + # if requested, summarize scores + if (summarize) { + scores <- scoringutils::summarize_scores(scores = scores, by = by) + } + return(scores) } + + +#' Get metrics if user didn't specify anything; otherwise, process +#' and validate user inputs +#' +#' @inheritParams score_model_out +#' +#' @return a list of metric functions as required by scoringutils::score() +#' +#' @noRd +get_metrics <- function(metrics, output_type, output_type_id_order) { + if (is.null(metrics)) { + return(get_metrics_default(output_type, output_type_id_order)) + } else if (is.character(metrics)) { + return(get_metrics_character(metrics)) + } else { + # We do a minimal preliminary check that the provided `metrics` is a list of + # functions, leaving detailed checks to scoringutils + checkmate::assert_list(metrics, types = "function") + return(metrics) + } +} + + +#' Default metrics if user didn't specify anything +#' +#' @inheritParams score_model_out +#' +#' @return a list of metric functions as required by scoringutils::score() +#' +#' @noRd +get_metrics_default <- function(output_type, output_type_id_order) { + if (output_type == "quantile") { + metrics <- scoringutils::metrics_quantile() + } else if (output_type == "pmf") { + metrics <- scoringutils::metrics_nominal() + } else if (output_type == "mean") { + metrics <- scoringutils::metrics_point(select 
= "se_point") + } else if (output_type == "median") { + metrics <- scoringutils::metrics_point(select = "ae_point") + } else { + # we have already validated `output_type`, so this case should not be + # triggered; this case is just double checking in case we add something new + # later. + supported_types <- c("mean", "median", "pmf", "quantile") + cli::cli_abort( + "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; + hubEvals currently only supports the following types: + {.val {supported_types}}" + ) + } + + return(metrics) +} + + +#' Convert character vector of metrics to list of functions +#' +#' @inheritParams score_model_out +#' +#' @return a list of metric functions as required by scoringutils::score() +#' +#' @noRd +get_metrics_character <- function(metrics) { + +} From 448e8303f63d602837e4d5e00f645212b2a2de55 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Thu, 22 Aug 2024 16:49:04 -0400 Subject: [PATCH 03/31] updates to score_model_out --- R/score_model_out.R | 136 +++++-- man/score_model_out.Rd | 68 +++- tests/testthat/test-score_model_out.R | 491 ++++++++++++++++++++++++++ 3 files changed, 664 insertions(+), 31 deletions(-) create mode 100644 tests/testthat/test-score_model_out.R diff --git a/R/score_model_out.R b/R/score_model_out.R index b46dc6f..c5ba9f2 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -7,10 +7,8 @@ #' @param summarize boolean indicator of whether summaries of forecast scores #' should be computed. Defaults to `TRUE`. #' @param by character vector naming columns to summarize by. For example, -#' specifying `by = "model_id"` will compute average scores for each model. The -#' default, `NULL`, will summarize across `output_type_id` levels (e.g., -#' quantile levels or categories for a pmf forecast) to produce a single row of -#' scores for each combination of model and prediction task. +#' specifying `by = "model_id"` (the default) will compute average scores for +#' each model. #' @param output_type_id_order For ordinal variables in pmf format, this is a #' vector of levels for pmf forecasts, in increasing order of the levels. For #' all other output types, this is ignored. @@ -23,15 +21,36 @@ #' "interval_coverage_deviation", and "ae_median" #' - For `output_type == "pmf"` and `output_type_id_order` is `NULL` (indicating #' that the predicted variable is a nominal variable), we use the default metrics -#' provided by `scoringutils::metrics_nominal()` +#' provided by `scoringutils::metrics_nominal()`, currently just "log_score" #' - For `output_type == "median"`, we use "ae_point" #' - For `output_type == "mean"`, we use "se_point" #' +#' It is also possible to directly provide a list of metrics, e.g. as would be +#' created by one of those function calls. Alternatively, a character vector of +#' scoring metrics can be provided. In this case, the following options are supported: +#' - `output_type == "median"` and `output_type == "median"`: +#' - "ae": absolute error of a point prediction (generally recommended for the median) +#' - "se": squared error of a point prediction (generally recommended for the mean) +#' - `output_type == "quantile"`: +#' - "ae_median": absolute error of the predictive median (i.e., the quantile at probability level 0.5) +#' - "wis": weighted interval score (WIS) of a collection of quantile predictions +#' - "overprediction": The component of WIS measuring the extent to which +#' predictions fell above the observation. 
+#' - "underprediction": The component of WIS measuring the extent to which +#' predictions fell below the observation. +#' - "dispersion": The component of WIS measuring the dispersion of forecast +#' distributions. +#' - "interval_coverage_XX": interval coverage at the "XX" level. For example, +#' "interval_coverage_95" is the 95% interval coverage rate, which would be calculated +#' based on quantiles at the probability levels 0.025 and 0.975. +#' - `output_type == "pmf"`: +#' - "log_score": log score +#' #' @return forecast_quantile #' #' @export score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, - summarize = TRUE, by = NULL, + summarize = TRUE, by = "model_id", output_type_id_order = NULL) { # check that model_out_tbl has a single output_type that is supported by this package # also, retrieve that output_type @@ -43,28 +62,24 @@ score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, # assemble data for scoringutils and select output_type-specific metrics if (output_type == "quantile") { su_data <- transform_quantile_model_out(model_out_tbl, target_observations) - if (is.null(metrics)) { - metrics <- scoringutils::metrics_quantile() - } } else if (output_type == "pmf") { su_data <- transform_pmf_model_out(model_out_tbl, target_observations, output_type_id_order) - if (is.null(metrics)) { - metrics <- scoringutils::metrics_nominal() - } } else if (output_type %in% c("mean", "median")) { su_data <- transform_point_model_out(model_out_tbl, target_observations, output_type) - if (is.null(metrics)) { - metrics <- scoringutils::metrics_point( - select = ifelse(output_type == "mean", "se_point", "ae_point") - ) - } } # compute scores scores <- scoringutils::score(su_data, metrics) - # switch back to hubverse convention of "model_id" rather than "model" - scores <- dplyr::rename(scores, model_id = "model") + # switch back to hubverse naming conventions for model name + scores <- dplyr::rename( + scores, + model_id = "model" + ) + + # if present, drop predicted and observed columns + drop_cols <- c("predicted", "observed") + scores <- scores[!colnames(scores) %in% drop_cols] # if requested, summarize scores if (summarize) { @@ -87,10 +102,10 @@ get_metrics <- function(metrics, output_type, output_type_id_order) { if (is.null(metrics)) { return(get_metrics_default(output_type, output_type_id_order)) } else if (is.character(metrics)) { - return(get_metrics_character(metrics)) + return(get_metrics_character(metrics, output_type)) } else { # We do a minimal preliminary check that the provided `metrics` is a list of - # functions, leaving detailed checks to scoringutils + # functions, leaving further checks to scoringutils checkmate::assert_list(metrics, types = "function") return(metrics) } @@ -116,7 +131,7 @@ get_metrics_default <- function(output_type, output_type_id_order) { } else { # we have already validated `output_type`, so this case should not be # triggered; this case is just double checking in case we add something new - # later. + # later, to ensure we update this function. 
supported_types <- c("mean", "median", "pmf", "quantile") cli::cli_abort( "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; @@ -136,6 +151,79 @@ get_metrics_default <- function(output_type, output_type_id_order) { #' @return a list of metric functions as required by scoringutils::score() #' #' @noRd -get_metrics_character <- function(metrics) { - +get_metrics_character <- function(metrics, output_type) { + if (output_type == "quantile") { + # split into metrics for interval coverage and others + interval_metric_inds <- grepl(pattern = "^interval_coverage_[[:digit:]][[:digit:]]$", metrics) + interval_metrics <- metrics[interval_metric_inds] + other_metrics <- metrics[!interval_metric_inds] + + # validate metrics + valid_metrics <- c("ae_median", "wis", "overprediction", "underprediction", "dispersion") + invalid_metrics <- other_metrics[!other_metrics %in% valid_metrics] + valid_metrics <- c(valid_metrics, "interval_coverage_XY") + if (length(invalid_metrics) > 0) { + cli::cli_abort( + c( + "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for", + " `output_type` {.val {output_type}}: {.val {invalid_metrics}};", + " supported metrics include {.val {valid_metrics}}", + " where `XY` denotes the coverage level, e.g. \"interval_coverage_95\"." + ) + ) + } + + # assemble metric functions + interval_metric_fns <- lapply( + interval_metrics, + function(metric) { + level <- as.integer(substr(metric, 19, 20)) + return(purrr::partial(scoringutils::interval_coverage, interval_range = level)) + } + ) + names(interval_metric_fns) <- interval_metrics + + other_metric_fns <- scoringutils::metrics_quantile(select = other_metrics) + + metric_fns <- c(other_metric_fns, interval_metric_fns)[metrics] + return(metric_fns) + } else if (output_type == "pmf") { + valid_metrics <- c("log_score") + invalid_metrics <- metrics[!metrics %in% valid_metrics] + if (length(invalid_metrics) > 0) { + cli::cli_abort( + c( + "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for", + " `output_type` {.val{output_type}}: {.val {invalid_metrics}};", + " supported metrics include {.val {valid_metrics}}." + ) + ) + } + metrics <- scoringutils::metrics_nominal(select = metrics) + } else if (output_type %in% c("median", "mean")) { + valid_metrics <- c("ae_point", "se_point") + invalid_metrics <- metrics[!metrics %in% valid_metrics] + if (length(invalid_metrics) > 0) { + cli::cli_abort( + c( + "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for", + " `output_type` {.val{output_type}}: {.val {invalid_metrics}};", + " supported metrics include {.val {valid_metrics}}." + ) + ) + } + metrics <- scoringutils::metrics_point(select = metrics) + } else { + # we have already validated `output_type`, so this case should not be + # triggered; this case is just double checking in case we add something new + # later, to ensure we update this function. 
+ supported_types <- c("mean", "median", "pmf", "quantile") + cli::cli_abort( + "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; + hubEvals currently only supports the following types: + {.val {supported_types}}" + ) + } + + return(metrics) } diff --git a/man/score_model_out.Rd b/man/score_model_out.Rd index fb2c174..69b31dc 100644 --- a/man/score_model_out.Rd +++ b/man/score_model_out.Rd @@ -2,29 +2,83 @@ % Please edit documentation in R/score_model_out.R \name{score_model_out} \alias{score_model_out} -\title{Score model output predictions against observed data} +\title{Score model output predictions with a single \code{output_type} against observed data} \usage{ score_model_out( model_out_tbl, target_observations, metrics = NULL, - by = NULL, + summarize = TRUE, + by = "model_id", output_type_id_order = NULL ) } \arguments{ \item{model_out_tbl}{Model output tibble with predictions} -\item{target_observations}{Observed 'ground truth' data to be compared against predictions} +\item{target_observations}{Observed 'ground truth' data to be compared to +predictions} -\item{metrics}{Optional list of scoring metrics to computeq} +\item{metrics}{Optional list of scoring metrics to compute. See details for more.} -\item{output_type_id_order}{For ordinal variables in pmf format, this is a vector of levels for pmf forecasts, in -increasing order of the levels. For all other output types, this is ignored.} +\item{summarize}{boolean indicator of whether summaries of forecast scores +should be computed. Defaults to \code{TRUE}.} + +\item{by}{character vector naming columns to summarize by. For example, +specifying \code{by = "model_id"} (the default) will compute average scores for +each model.} + +\item{output_type_id_order}{For ordinal variables in pmf format, this is a +vector of levels for pmf forecasts, in increasing order of the levels. For +all other output types, this is ignored.} } \value{ forecast_quantile } \description{ -Score model output predictions against observed data +Score model output predictions with a single \code{output_type} against observed data +} +\details{ +If \code{metrics} is \code{NULL} (the default), this function chooses +appropriate metrics based on the \code{output_type} contained in the \code{model_out_tbl}: +\itemize{ +\item For \code{output_type == "quantile"}, we use the default metrics provided by +\code{scoringutils::metrics_quantile()}: "wis", "overprediction", "underprediction", +"dispersion", "bias", "interval_coverage_50", "interval_coverage_90", +"interval_coverage_deviation", and "ae_median" +\item For \code{output_type == "pmf"} and \code{output_type_id_order} is \code{NULL} (indicating +that the predicted variable is a nominal variable), we use the default metrics +provided by \code{scoringutils::metrics_nominal()}, currently just "log_score" +\item For \code{output_type == "median"}, we use "ae_point" +\item For \code{output_type == "mean"}, we use "se_point" +} + +It is also possible to directly provide a list of metrics, e.g. as would be +created by one of those function calls. Alternatively, a character vector of +scoring metrics can be provided. 
In this case, the following options are supported: +\itemize{ +\item \code{output_type == "median"} and \code{output_type == "median"}: +\itemize{ +\item "ae": absolute error of a point prediction (generally recommended for the median) +\item "se": squared error of a point prediction (generally recommended for the mean) +} +\item \code{output_type == "quantile"}: +\itemize{ +\item "ae_median": absolute error of the predictive median (i.e., the quantile at probability level 0.5) +\item "wis": weighted interval score (WIS) of a collection of quantile predictions +\item "overprediction": The component of WIS measuring the extent to which +predictions fell above the observation. +\item "underprediction": The component of WIS measuring the extent to which +predictions fell below the observation. +\item "dispersion": The component of WIS measuring the dispersion of forecast +distributions. +\item "interval_coverage_XX": interval coverage at the "XX" level. For example, +"interval_coverage_95" is the 95\% interval coverage rate, which would be calculated +based on quantiles at the probability levels 0.025 and 0.975. +} +\item \code{output_type == "pmf"}: +\itemize{ +\item "log_score": log score +} +} } diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R new file mode 100644 index 0000000..9f18d4a --- /dev/null +++ b/tests/testthat/test-score_model_out.R @@ -0,0 +1,491 @@ +test_that("score_model_out succeeds with valid inputs: mean output_type, default metrics, by all", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), + target_observations = forecast_target_observations, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "mean") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "mean"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + se = (.data[["value"]] - .data[["observation"]])^2 + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + se_point = mean(.data[["se"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$se_point.x, merged_scores$se_point.y) +}) + + +test_that("score_model_out succeeds with valid inputs: mean output_type, default metrics, summarize FALSE", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), + target_observations = forecast_target_observations, + summarize = FALSE + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "mean") |> + 
dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "mean"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + se = (.data[["value"]] - .data[["observation"]])^2 + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + se_point = mean(.data[["se"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$se_point.x, merged_scores$se_point.y) +}) + + +test_that("score_model_out succeeds with valid inputs: mean output_type, character metrics, custom by", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), + target_observations = forecast_target_observations, + metrics = c("ae_point", "se_point"), + by = c("model_id", "location") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "mean") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "mean"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + ae = abs(.data[["value"]] - .data[["observation"]]), + se = (.data[["value"]] - .data[["observation"]])^2 + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location") + ))) |> + dplyr::summarize( + ae_point = mean(.data[["ae"]]), + se_point = mean(.data[["se"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$ae_point.x, merged_scores$ae_point.y) + expect_equal(merged_scores$se_point.x, merged_scores$se_point.y) +}) + + +test_that("score_model_out succeeds with valid inputs: mean output_type, function metrics, custom by", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), + target_observations = forecast_target_observations, + metrics = scoringutils::metrics_point(), + by = c("model_id", "location") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "mean") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "mean"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + ae = abs(.data[["value"]] - .data[["observation"]]), + se = (.data[["value"]] - .data[["observation"]])^2, + pe = abs(.data[["value"]] - .data[["observation"]]) / abs(.data[["observation"]]) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + 
c("model_id", "location") + ))) |> + dplyr::summarize( + ae_point = mean(.data[["ae"]]), + se_point = mean(.data[["se"]]), + ape = mean(.data[["pe"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$ae_point.x, merged_scores$ae_point.y) + expect_equal(merged_scores$se_point.x, merged_scores$se_point.y) + expect_equal(merged_scores$ape.x, merged_scores$ape.y) +}) + + +test_that("score_model_out succeeds with valid inputs: median output_type, default metrics, summarize FALSE", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "median"), + target_observations = forecast_target_observations, + summarize = FALSE + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "median") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "median"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + ae = abs(.data[["value"]] - .data[["observation"]]) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + ae_point = mean(.data[["ae"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$ae_point.x, merged_scores$ae_point.y) +}) + + +test_that("score_model_out succeeds with valid inputs: quantile output_type, wis and interval metrics, custom by", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), + by = c("model_id", "location") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "quantile") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "quantile") |> + dplyr::select(-dplyr::all_of(c("output_type", "output_type_id"))), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + output_type_id = as.numeric(.data[["output_type_id"]]), + qs = ifelse( + .data[["observation"]] >= .data[["value"]], + .data[["output_type_id"]] * (.data[["observation"]] - .data[["value"]]), + (1 - .data[["output_type_id"]]) * (.data[["value"]] - .data[["observation"]]) + ), + q_coverage_80_lower = ifelse( + .data[["output_type_id"]] == 0.1, + .data[["observation"]] >= .data[["value"]], + NA_real_ + ), + q_coverage_80_upper = 
ifelse( + .data[["output_type_id"]] == 0.9, + .data[["observation"]] <= .data[["value"]], + NA_real_ + ), + q_coverage_90_lower = ifelse( + .data[["output_type_id"]] == 0.05, + .data[["observation"]] >= .data[["value"]], + NA_real_ + ), + q_coverage_90_upper = ifelse( + .data[["output_type_id"]] == 0.95, + .data[["observation"]] <= .data[["value"]], + NA_real_ + ) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + wis = 2 * mean(.data[["qs"]]), + interval_coverage_80 = (sum(.data[["q_coverage_80_lower"]], na.rm = TRUE) == 1) * + (sum(.data[["q_coverage_80_upper"]], na.rm = TRUE) == 1), + interval_coverage_90 = (sum(.data[["q_coverage_90_lower"]], na.rm = TRUE) == 1) * + (sum(.data[["q_coverage_90_upper"]], na.rm = TRUE) == 1) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location") + ))) |> + dplyr::summarize( + wis = mean(.data[["wis"]]), + interval_coverage_80 = mean(.data[["interval_coverage_80"]], na.rm = TRUE), + interval_coverage_90 = mean(.data[["interval_coverage_90"]], na.rm = TRUE), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$wis.x, merged_scores$wis.y) + expect_equal(merged_scores$interval_coverage_80.x, merged_scores$interval_coverage_80.y) + expect_equal(merged_scores$interval_coverage_90.x, merged_scores$interval_coverage_90.y) +}) + + +test_that("score_model_out succeeds with valid inputs: quantile output_type, wis/interval metrics, summarize FALSE", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), + summarize = FALSE + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "quantile") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "quantile") |> + dplyr::select(-dplyr::all_of(c("output_type", "output_type_id"))), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + output_type_id = as.numeric(.data[["output_type_id"]]), + qs = ifelse( + .data[["observation"]] >= .data[["value"]], + .data[["output_type_id"]] * (.data[["observation"]] - .data[["value"]]), + (1 - .data[["output_type_id"]]) * (.data[["value"]] - .data[["observation"]]) + ), + q_coverage_80_lower = ifelse( + .data[["output_type_id"]] == 0.1, + .data[["observation"]] >= .data[["value"]], + NA_real_ + ), + q_coverage_80_upper = ifelse( + .data[["output_type_id"]] == 0.9, + .data[["observation"]] <= .data[["value"]], + NA_real_ + ), + q_coverage_90_lower = ifelse( + .data[["output_type_id"]] == 0.05, + .data[["observation"]] >= .data[["value"]], + NA_real_ + ), + q_coverage_90_upper = ifelse( + .data[["output_type_id"]] == 0.95, + .data[["observation"]] <= .data[["value"]], + NA_real_ + ) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", 
"reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + wis = 2 * mean(.data[["qs"]]), + interval_coverage_80 = as.logical((sum(.data[["q_coverage_80_lower"]], na.rm = TRUE) == 1) * + (sum(.data[["q_coverage_80_upper"]], na.rm = TRUE) == 1)), + interval_coverage_90 = as.logical((sum(.data[["q_coverage_90_lower"]], na.rm = TRUE) == 1) * + (sum(.data[["q_coverage_90_upper"]], na.rm = TRUE) == 1)) + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$wis.x, merged_scores$wis.y) + expect_equal(merged_scores$interval_coverage_80.x, merged_scores$interval_coverage_80.y) + expect_equal(merged_scores$interval_coverage_90.x, merged_scores$interval_coverage_90.y) +}) + + +test_that("score_model_out succeeds with valid inputs: nominal pmf output_type, default metrics, custom by", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "pmf"), + target_observations = forecast_target_observations, + by = c("model_id", "location") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "pmf") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "pmf") |> + dplyr::select(-dplyr::all_of(c("output_type"))), + by = c("location", "target_end_date", "target", "output_type_id") + ) |> + dplyr::filter(.data[["observation"]] == 1) |> + dplyr::mutate( + log_score = -1 * log(.data[["value"]]) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location") + ))) |> + dplyr::summarize( + log_score = mean(.data[["log_score"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$ae_point.x, merged_scores$ae_point.y) +}) + + +test_that("score_model_out errors when model_out_tbl has multiple output_types", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs, + target_observations = forecast_target_observations + ), + regexp = "model_out_tbl must contain a single output_type, but it has multiple" + ) +}) + + +test_that("score_model_out errors when invalid interval levels are requested", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = "interval_level_5" + ), + regexp = "unsupported metric" + ) 
+ + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = "interval_level_100" + ), + regexp = "unsupported metric" + ) + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = "interval_level_XY" + ), + regexp = "unsupported metric" + ) +}) + + +test_that("score_model_out errors when invalid metrics are requested", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = "log_score" + ), + regexp = "unsupported metric" + ) + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = list(5, 6, "asdf") + ), + regexp = "Assertion on 'metrics' failed" + ) +}) + + + +test_that("score_model_out errors when an unsupported output_type is provided", { + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "cdf"), + target_observations = forecast_target_observations, + metrics = "log_score" + ), + regexp = "only supports the following types" + ) +}) From b92891a1953988cc893fb27256998694ee1527e8 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Thu, 22 Aug 2024 16:52:56 -0400 Subject: [PATCH 04/31] appease the linter --- R/score_model_out.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index c5ba9f2..6fdcf4f 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -132,7 +132,7 @@ get_metrics_default <- function(output_type, output_type_id_order) { # we have already validated `output_type`, so this case should not be # triggered; this case is just double checking in case we add something new # later, to ensure we update this function. - supported_types <- c("mean", "median", "pmf", "quantile") + supported_types <- c("mean", "median", "pmf", "quantile") # nolint object_use_linter cli::cli_abort( "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; hubEvals currently only supports the following types: @@ -217,7 +217,7 @@ get_metrics_character <- function(metrics, output_type) { # we have already validated `output_type`, so this case should not be # triggered; this case is just double checking in case we add something new # later, to ensure we update this function. 
- supported_types <- c("mean", "median", "pmf", "quantile") + supported_types <- c("mean", "median", "pmf", "quantile") # nolint object_use_linter cli::cli_abort( "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; hubEvals currently only supports the following types: From d6a6b362bfd994f51328f6540aa0194744e7e003 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Thu, 22 Aug 2024 16:54:02 -0400 Subject: [PATCH 05/31] update package imports --- DESCRIPTION | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 66558a6..387e8e9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,10 +38,12 @@ Description: basic tools for scoring hubverse forecasts. License: MIT + file LICENSE Encoding: UTF-8 -Imports: +Imports: + checkmate, cli, dplyr, hubUtils, + purrr, rlang, scoringutils (>= 1.2.2.9000) Remotes: From 28724e4d9da1e8ecf5eb6bb3281c534b4a69da2b Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:41:39 -0400 Subject: [PATCH 06/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 9f18d4a..d1704d7 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -1,4 +1,5 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, default metrics, by all", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 41c8c509dbb5cb46f4a398ce2dd13fcdbf3b8624 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:41:58 -0400 Subject: [PATCH 07/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index d1704d7..8585c32 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -40,6 +40,7 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, default test_that("score_model_out succeeds with valid inputs: mean output_type, default metrics, summarize FALSE", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From fd06ad7271c222270fa195004ac36879e536990b Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:42:06 -0400 Subject: [PATCH 08/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. 
Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 8585c32..5fd43f3 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -81,6 +81,7 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, default test_that("score_model_out succeeds with valid inputs: mean output_type, character metrics, custom by", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From fa64a8eda9d2df8f31d1afdf81ee6365624045c0 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:42:12 -0400 Subject: [PATCH 09/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 5fd43f3..15a05b8 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -126,6 +126,7 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, charact test_that("score_model_out succeeds with valid inputs: mean output_type, function metrics, custom by", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 7e3ce73d04e1c72ad185124cbdd85d54199c6479 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:42:25 -0400 Subject: [PATCH 10/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 15a05b8..6d16b4e 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -174,6 +174,7 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, functio test_that("score_model_out succeeds with valid inputs: median output_type, default metrics, summarize FALSE", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 5b8eb40dd0d349cf4f9315cdc6b9f9d342ece82f Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:51:20 -0400 Subject: [PATCH 11/31] Update R/score_model_out.R Co-authored-by: Zhian N. 
Kamvar --- R/score_model_out.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index 6fdcf4f..6d51a39 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -60,13 +60,13 @@ score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, metrics <- get_metrics(metrics, output_type, output_type_id_order) # assemble data for scoringutils and select output_type-specific metrics - if (output_type == "quantile") { - su_data <- transform_quantile_model_out(model_out_tbl, target_observations) - } else if (output_type == "pmf") { - su_data <- transform_pmf_model_out(model_out_tbl, target_observations, output_type_id_order) - } else if (output_type %in% c("mean", "median")) { - su_data <- transform_point_model_out(model_out_tbl, target_observations, output_type) - } + su_data <- switch(output_type, + quantile = transform_quantile_model_out(model_out_tbl, target_observations), + pmf = transform_pmf_model_out(model_out_tbl, target_observations, output_type_id_order), + mean = transform_point_model_out(model_out_tbl, target_observations, output_type), + median = transform_point_model_out(model_out_tbl, target_observations, output_type), + NULL # default, should not happen because of the validation above + ) # compute scores scores <- scoringutils::score(su_data, metrics) From a68fabbfc7e71a15e3900a46ec3e2dc8202c9b4c Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 09:54:42 -0400 Subject: [PATCH 12/31] Update R/score_model_out.R Co-authored-by: Zhian N. Kamvar --- R/score_model_out.R | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index 6d51a39..d6d9a78 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -120,15 +120,14 @@ get_metrics <- function(metrics, output_type, output_type_id_order) { #' #' @noRd get_metrics_default <- function(output_type, output_type_id_order) { - if (output_type == "quantile") { - metrics <- scoringutils::metrics_quantile() - } else if (output_type == "pmf") { - metrics <- scoringutils::metrics_nominal() - } else if (output_type == "mean") { - metrics <- scoringutils::metrics_point(select = "se_point") - } else if (output_type == "median") { - metrics <- scoringutils::metrics_point(select = "ae_point") - } else { + metrics <- switch(output_type, + quantile = scoringutils::metrics_quantile(), + pmf = scoringutils::metrics_nominal(), + mean = scoringutils::metrics_point(select = "se_point"), + median = scoringutils::metrics_point(select = "ae_point"), + NULL # default + ) + if (is.null(metrics)) { # we have already validated `output_type`, so this case should not be # triggered; this case is just double checking in case we add something new # later, to ensure we update this function. From a6a9c3b3d6695afe84725dc35f63f30b47bc4c84 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:00:01 -0400 Subject: [PATCH 13/31] Update R/score_model_out.R Co-authored-by: Zhian N. 
Kamvar --- R/score_model_out.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index d6d9a78..8993741 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -185,7 +185,7 @@ get_metrics_character <- function(metrics, output_type) { other_metric_fns <- scoringutils::metrics_quantile(select = other_metrics) metric_fns <- c(other_metric_fns, interval_metric_fns)[metrics] - return(metric_fns) + metrics <- metric_fns } else if (output_type == "pmf") { valid_metrics <- c("log_score") invalid_metrics <- metrics[!metrics %in% valid_metrics] From 7c8e86d59dce8fa960eba79750f4c2d5962eb5c3 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:11:35 -0400 Subject: [PATCH 14/31] Update R/score_model_out.R Co-authored-by: Zhian N. Kamvar --- R/score_model_out.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index 8993741..ea4877b 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -192,8 +192,8 @@ get_metrics_character <- function(metrics, output_type) { if (length(invalid_metrics) > 0) { cli::cli_abort( c( - "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for", - " `output_type` {.val{output_type}}: {.val {invalid_metrics}};", + "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for + `output_type` {.val {output_type}}: {.val {invalid_metrics}};", " supported metrics include {.val {valid_metrics}}." ) ) From 47f29a297916fd6908bc57358fa6adac31251a6b Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:13:22 -0400 Subject: [PATCH 15/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 6d16b4e..f417e0d 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -215,6 +215,7 @@ test_that("score_model_out succeeds with valid inputs: median output_type, defau test_that("score_model_out succeeds with valid inputs: quantile output_type, wis and interval metrics, custom by", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From fed85acc8b06ed9d31b5cd40b5f977b90007ec3d Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:13:40 -0400 Subject: [PATCH 16/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. 
Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index f417e0d..3890d08 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -369,6 +369,7 @@ test_that("score_model_out succeeds with valid inputs: quantile output_type, wis test_that("score_model_out succeeds with valid inputs: nominal pmf output_type, default metrics, custom by", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 4c6eabca84823db65f1077287ccadc64254c32e4 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:13:53 -0400 Subject: [PATCH 17/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 3890d08..269c879 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -484,6 +484,7 @@ test_that("score_model_out errors when invalid metrics are requested", { test_that("score_model_out errors when an unsupported output_type is provided", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 6c443e6c823f958e1aaf4d1bf883fd37a1c0b3e5 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:14:08 -0400 Subject: [PATCH 18/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 269c879..0c0276a 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -459,6 +459,7 @@ test_that("score_model_out errors when invalid interval levels are requested", { test_that("score_model_out errors when invalid metrics are requested", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 778be41cc0b1a35f9339ac13a207fb6fea91df4c Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:14:29 -0400 Subject: [PATCH 19/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. 
Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 0c0276a..05ba525 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -412,6 +412,7 @@ test_that("score_model_out succeeds with valid inputs: nominal pmf output_type, test_that("score_model_out errors when model_out_tbl has multiple output_types", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From f27b164afb888d1242f88f5c55f88e64c9d13054 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:14:49 -0400 Subject: [PATCH 20/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 05ba525..7533f78 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -427,6 +427,7 @@ test_that("score_model_out errors when model_out_tbl has multiple output_types", test_that("score_model_out errors when invalid interval levels are requested", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 23675188f62b9d1540f2be233b5a64d61b1e7ba3 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:14:59 -0400 Subject: [PATCH 21/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. Kamvar --- tests/testthat/test-score_model_out.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 7533f78..17ab677 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -297,6 +297,7 @@ test_that("score_model_out succeeds with valid inputs: quantile output_type, wis test_that("score_model_out succeeds with valid inputs: quantile output_type, wis/interval metrics, summarize FALSE", { + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 2c740d710ce74049b3a3dc6391fbc6b55a20dabf Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:15:50 -0400 Subject: [PATCH 22/31] Update tests/testthat/test-score_model_out.R Co-authored-by: Zhian N. 
Kamvar --- tests/testthat/test-score_model_out.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index 17ab677..b8f8300 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -352,7 +352,8 @@ test_that("score_model_out succeeds with valid inputs: quantile output_type, wis interval_coverage_80 = as.logical((sum(.data[["q_coverage_80_lower"]], na.rm = TRUE) == 1) * (sum(.data[["q_coverage_80_upper"]], na.rm = TRUE) == 1)), interval_coverage_90 = as.logical((sum(.data[["q_coverage_90_lower"]], na.rm = TRUE) == 1) * - (sum(.data[["q_coverage_90_upper"]], na.rm = TRUE) == 1)) + (sum(.data[["q_coverage_90_upper"]], na.rm = TRUE) == 1)), + .groups = "drop" ) # same column names, number of rows, and score values From 79a48285b628ce5ead6243c1549d4fe5ffb2457d Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:32:03 -0400 Subject: [PATCH 23/31] updates from review: document lack of checks for statistical validity of metric functions, split out validation for output_type --- R/score_model_out.R | 18 +++++++++--------- R/validate.R | 9 +++++++-- man/score_model_out.Rd | 11 ++++++++--- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index ea4877b..2d97b84 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -25,9 +25,8 @@ #' - For `output_type == "median"`, we use "ae_point" #' - For `output_type == "mean"`, we use "se_point" #' -#' It is also possible to directly provide a list of metrics, e.g. as would be -#' created by one of those function calls. Alternatively, a character vector of -#' scoring metrics can be provided. In this case, the following options are supported: +#' Alternatively, a character vector of scoring metrics can be provided. In this +#' case, the following options are supported: #' - `output_type == "median"` and `output_type == "median"`: #' - "ae": absolute error of a point prediction (generally recommended for the median) #' - "se": squared error of a point prediction (generally recommended for the mean) @@ -46,6 +45,12 @@ #' - `output_type == "pmf"`: #' - "log_score": log score #' +#' For more flexibility, it is also possible to directly provide a list of +#' functions to compute the desired metrics, e.g. as would be created by one of +#' the `scoringutils::metrics_*` methods. Note that in this case, `hubEvals` +#' only validates that a list of functions has been provided; no checks for the +#' statistical validity of these metric functions are done. +#' #' @return forecast_quantile #' #' @export @@ -216,12 +221,7 @@ get_metrics_character <- function(metrics, output_type) { # we have already validated `output_type`, so this case should not be # triggered; this case is just double checking in case we add something new # later, to ensure we update this function. 
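A rough sketch of how the "interval_coverage_XY" metric names handled above can be turned into scoring functions for the quantile branch. This is illustrative only: it assumes the development version of scoringutils targeted by this package exports interval_coverage() with an interval_range argument and that purrr is available; the package's actual helper may differ.

# Sketch, not package code: build interval coverage metric functions from names
interval_metrics <- c("interval_coverage_80", "interval_coverage_90")
coverage_levels <- as.numeric(sub("interval_coverage_", "", interval_metrics))
interval_metric_fns <- lapply(
  coverage_levels,
  function(level) purrr::partial(scoringutils::interval_coverage, interval_range = level)
)
# names are needed so that c(other_metric_fns, interval_metric_fns)[metrics]
# can select functions by metric name, as in the function above
names(interval_metric_fns) <- interval_metrics
other_metric_fns <- scoringutils::metrics_quantile(select = c("wis", "ae_median"))
metric_fns <- c(other_metric_fns, interval_metric_fns)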
- supported_types <- c("mean", "median", "pmf", "quantile") # nolint object_use_linter - cli::cli_abort( - "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; - hubEvals currently only supports the following types: - {.val {supported_types}}" - ) + error_if_invalid_output_type(output_type) } return(metrics) diff --git a/R/validate.R b/R/validate.R index 0f00d61..07db0c7 100644 --- a/R/validate.R +++ b/R/validate.R @@ -83,6 +83,13 @@ validate_output_type <- function(model_out_tbl) { ) } + error_if_invalid_output_type(output_type) + + return(output_type) +} + + +error_if_invalid_output_type <- function(output_type) { supported_types <- c("mean", "median", "pmf", "quantile") if (!output_type %in% supported_types) { cli::cli_abort( @@ -91,6 +98,4 @@ validate_output_type <- function(model_out_tbl) { {.val {supported_types}}" ) } - - return(output_type) } diff --git a/man/score_model_out.Rd b/man/score_model_out.Rd index 69b31dc..6fcd864 100644 --- a/man/score_model_out.Rd +++ b/man/score_model_out.Rd @@ -53,9 +53,8 @@ provided by \code{scoringutils::metrics_nominal()}, currently just "log_score" \item For \code{output_type == "mean"}, we use "se_point" } -It is also possible to directly provide a list of metrics, e.g. as would be -created by one of those function calls. Alternatively, a character vector of -scoring metrics can be provided. In this case, the following options are supported: +Alternatively, a character vector of scoring metrics can be provided. In this +case, the following options are supported: \itemize{ \item \code{output_type == "median"} and \code{output_type == "median"}: \itemize{ @@ -81,4 +80,10 @@ based on quantiles at the probability levels 0.025 and 0.975. \item "log_score": log score } } + +For more flexibility, it is also possible to directly provide a list of +functions to compute the desired metrics, e.g. as would be created by one of +the \verb{scoringutils::metrics_*} methods. Note that in this case, \code{hubEvals} +only validates that a list of functions has been provided; no checks for the +statistical validity of these metric functions are done. 
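At this point in the series the documentation above still allows a list of metric functions to be passed straight through to scoringutils (a later patch restricts `metrics` to character vectors). A minimal usage sketch under that behaviour, assuming a quantile-format `model_out_tbl` and matching `target_observations` already exist:

scores <- score_model_out(
  model_out_tbl = model_out_tbl,
  target_observations = target_observations,
  metrics = scoringutils::metrics_quantile(select = c("wis", "interval_coverage_90")),
  by = "model_id"
)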
} From 6e09adc24d9bcbd97cf3c4c91747e132ff2e9fba Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:35:24 -0400 Subject: [PATCH 24/31] fix up comment spacing --- tests/testthat/test-score_model_out.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index b8f8300..a77c764 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -81,7 +81,7 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, default test_that("score_model_out succeeds with valid inputs: mean output_type, character metrics, custom by", { - # Forecast data from HubExamples: + # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations From 5d45a2ac8cd9e849359b8ce877e1e05de73db0a9 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 10:50:27 -0400 Subject: [PATCH 25/31] refactor metric validation messaging --- R/score_model_out.R | 54 +++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index 2d97b84..7d93901 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -165,17 +165,12 @@ get_metrics_character <- function(metrics, output_type) { # validate metrics valid_metrics <- c("ae_median", "wis", "overprediction", "underprediction", "dispersion") invalid_metrics <- other_metrics[!other_metrics %in% valid_metrics] - valid_metrics <- c(valid_metrics, "interval_coverage_XY") - if (length(invalid_metrics) > 0) { - cli::cli_abort( - c( - "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for", - " `output_type` {.val {output_type}}: {.val {invalid_metrics}};", - " supported metrics include {.val {valid_metrics}}", - " where `XY` denotes the coverage level, e.g. \"interval_coverage_95\"." - ) - ) - } + error_if_invalid_metrics( + valid_metrics = c(valid_metrics, "interval_coverage_XY"), + invalid_metrics = invalid_metrics, + output_type = output_type, + comment = c("i" = "NOTE: `XY` denotes the coverage level, e.g. {.val interval_coverage_95}.") + ) # assemble metric functions interval_metric_fns <- lapply( @@ -194,28 +189,14 @@ get_metrics_character <- function(metrics, output_type) { } else if (output_type == "pmf") { valid_metrics <- c("log_score") invalid_metrics <- metrics[!metrics %in% valid_metrics] - if (length(invalid_metrics) > 0) { - cli::cli_abort( - c( - "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for - `output_type` {.val {output_type}}: {.val {invalid_metrics}};", - " supported metrics include {.val {valid_metrics}}." - ) - ) - } + error_if_invalid_metrics(valid_metrics, invalid_metrics, output_type) + metrics <- scoringutils::metrics_nominal(select = metrics) } else if (output_type %in% c("median", "mean")) { valid_metrics <- c("ae_point", "se_point") invalid_metrics <- metrics[!metrics %in% valid_metrics] - if (length(invalid_metrics) > 0) { - cli::cli_abort( - c( - "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for", - " `output_type` {.val{output_type}}: {.val {invalid_metrics}};", - " supported metrics include {.val {valid_metrics}}." 
- ) - ) - } + error_if_invalid_metrics(valid_metrics, invalid_metrics, output_type) + metrics <- scoringutils::metrics_point(select = metrics) } else { # we have already validated `output_type`, so this case should not be @@ -226,3 +207,18 @@ get_metrics_character <- function(metrics, output_type) { return(metrics) } + + +error_if_invalid_metrics <- function(valid_metrics, invalid_metrics, output_type, comment = NULL) { + n <- length(invalid_metrics) + if (n > 0) { + cli::cli_abort( + c( + "`metrics` had {n} unsupported metric{?s} for + {.arg output_type} {.val {output_type}}: {.strong {.val {invalid_metrics}}}; + supported metrics include {.val {valid_metrics}}.", + comment + ) + ) + } +} From ba70640318ef09da2db117ce91dcb61d199f5d33 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Mon, 26 Aug 2024 12:32:18 -0400 Subject: [PATCH 26/31] add examples for score_model_out --- DESCRIPTION | 2 ++ R/score_model_out.R | 33 ++++++++++++++++++++++++++++----- man/score_model_out.Rd | 28 ++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 387e8e9..c711112 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -48,6 +48,7 @@ Imports: scoringutils (>= 1.2.2.9000) Remotes: epiforecasts/scoringutils, + hubverse-org/hubExamples, hubverse-org/hubUtils Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.1 @@ -57,5 +58,6 @@ Depends: LazyData: true Config/Needs/website: hubverse-org/hubStyle Suggests: + hubExamples, testthat (>= 3.0.0) Config/testthat/edition: 3 diff --git a/R/score_model_out.R b/R/score_model_out.R index 7d93901..b98fe8d 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -51,6 +51,32 @@ #' only validates that a list of functions has been provided; no checks for the #' statistical validity of these metric functions are done. #' +#' @examplesIf requireNamespace("hubExamples", quietly = TRUE) +#' # compute WIS and interval coverage rates at 80% and 90% levels based on +#' # quantile forecasts, summarized by the mean score for each model +#' quantile_scores <- score_model_out( +#' model_out_tbl = hubExamples::forecast_outputs |> +#' dplyr::filter(.data[["output_type"]] == "quantile"), +#' target_observations = hubExamples::forecast_target_observations, +#' metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), +#' by = c("model_id") +#' ) +#' quantile_scores +#' +#' # compute log scores based on pmf predictions for categorical targets, +#' # summarized by the mean score for each combination of model and location. +#' # Note: if the model_out_tbl had forecasts for multiple targets using a +#' # pmf output_type with different bins, it would be necessary to score the +#' # predictions for those targets separately. 
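The error_if_invalid_metrics() helper added above relies on cli's inline markup and pluralization. A small, self-contained illustration of the same message construction (hypothetical metric names, not package code):

invalid_metrics <- c("crps", "brier_score")  # hypothetical unsupported names
valid_metrics <- "log_score"
cli::cli_abort(
  "`metrics` had {length(invalid_metrics)} unsupported metric{?s} for
  {.arg output_type} {.val pmf}: {.strong {.val {invalid_metrics}}};
  supported metrics include {.val {valid_metrics}}."
)
# {?s} pluralizes from the preceding interpolated count, and {.val}/{.arg}/
# {.strong} apply cli's inline styling when the error message is printed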
+#' pmf_scores <- score_model_out( +#' model_out_tbl = hubExamples::forecast_outputs |> +#' dplyr::filter(.data[["output_type"]] == "pmf"), +#' target_observations = hubExamples::forecast_target_observations, +#' metrics = "log_score", +#' by = c("model_id", "location", "horizon") +#' ) +#' head(pmf_scores) +#' #' @return forecast_quantile #' #' @export @@ -64,7 +90,7 @@ score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, # get/validate the scoring metrics metrics <- get_metrics(metrics, output_type, output_type_id_order) - # assemble data for scoringutils and select output_type-specific metrics + # assemble data for scoringutils su_data <- switch(output_type, quantile = transform_quantile_model_out(model_out_tbl, target_observations), pmf = transform_pmf_model_out(model_out_tbl, target_observations, output_type_id_order), @@ -77,10 +103,7 @@ score_model_out <- function(model_out_tbl, target_observations, metrics = NULL, scores <- scoringutils::score(su_data, metrics) # switch back to hubverse naming conventions for model name - scores <- dplyr::rename( - scores, - model_id = "model" - ) + scores <- dplyr::rename(scores, model_id = "model") # if present, drop predicted and observed columns drop_cols <- c("predicted", "observed") diff --git a/man/score_model_out.Rd b/man/score_model_out.Rd index 6fcd864..15fefb2 100644 --- a/man/score_model_out.Rd +++ b/man/score_model_out.Rd @@ -87,3 +87,31 @@ the \verb{scoringutils::metrics_*} methods. Note that in this case, \code{hubEva only validates that a list of functions has been provided; no checks for the statistical validity of these metric functions are done. } +\examples{ +\dontshow{if (requireNamespace("hubExamples", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# compute WIS and interval coverage rates at 80\% and 90\% levels based on +# quantile forecasts, summarized by the mean score for each model +quantile_scores <- score_model_out( + model_out_tbl = hubExamples::forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = hubExamples::forecast_target_observations, + metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), + by = c("model_id") +) +quantile_scores + +# compute log scores based on pmf predictions for categorical targets, +# summarized by the mean score for each combination of model and location. +# Note: if the model_out_tbl had forecasts for multiple targets using a +# pmf output_type with different bins, it would be necessary to score the +# predictions for those targets separately. 
+pmf_scores <- score_model_out( + model_out_tbl = hubExamples::forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "pmf"), + target_observations = hubExamples::forecast_target_observations, + metrics = "log_score", + by = c("model_id", "location", "horizon") +) +head(pmf_scores) +\dontshow{\}) # examplesIf} +} From 7df18036d9fb640bfcc3f2275ebf10a3de630e05 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Wed, 28 Aug 2024 11:50:09 -0400 Subject: [PATCH 27/31] Update R/score_model_out.R Co-authored-by: Nikos Bosse <37978797+nikosbosse@users.noreply.github.com> --- R/score_model_out.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index b98fe8d..c6748fa 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -4,7 +4,7 @@ #' @param target_observations Observed 'ground truth' data to be compared to #' predictions #' @param metrics Optional list of scoring metrics to compute. See details for more. -#' @param summarize boolean indicator of whether summaries of forecast scores +#' @param summarize Boolean indicator of whether summaries of forecast scores #' should be computed. Defaults to `TRUE`. #' @param by character vector naming columns to summarize by. For example, #' specifying `by = "model_id"` (the default) will compute average scores for From a9c27956fed4cf1b477f07fe2f2fe819b3f4b41f Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Wed, 28 Aug 2024 11:50:14 -0400 Subject: [PATCH 28/31] Update R/score_model_out.R Co-authored-by: Nikos Bosse <37978797+nikosbosse@users.noreply.github.com> --- R/score_model_out.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index c6748fa..6be76e5 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -6,7 +6,7 @@ #' @param metrics Optional list of scoring metrics to compute. See details for more. #' @param summarize Boolean indicator of whether summaries of forecast scores #' should be computed. Defaults to `TRUE`. -#' @param by character vector naming columns to summarize by. For example, +#' @param by Character vector naming columns to summarize by. For example, #' specifying `by = "model_id"` (the default) will compute average scores for #' each model. 
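The summarize = FALSE path covered by the tests can be sketched with the same hubExamples data used in the package examples. Illustrative only; it skips the averaging over `by` groups and returns unsummarized scores:

raw_scores <- score_model_out(
  model_out_tbl = hubExamples::forecast_outputs |>
    dplyr::filter(.data[["output_type"]] == "quantile"),
  target_observations = hubExamples::forecast_target_observations,
  metrics = c("wis", "interval_coverage_80", "interval_coverage_90"),
  summarize = FALSE
)
head(raw_scores)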
#' @param output_type_id_order For ordinal variables in pmf format, this is a From 18ea11aa4561e80654bef58748003679145c4ca7 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Thu, 29 Aug 2024 11:51:18 -0400 Subject: [PATCH 29/31] updates to score_model_out docs --- R/score_model_out.R | 29 +++++++++++++++-------------- man/score_model_out.Rd | 15 +++++++-------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index b98fe8d..059e194 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -15,22 +15,23 @@ #' #' @details If `metrics` is `NULL` (the default), this function chooses #' appropriate metrics based on the `output_type` contained in the `model_out_tbl`: -#' - For `output_type == "quantile"`, we use the default metrics provided by -#' `scoringutils::metrics_quantile()`: "wis", "overprediction", "underprediction", -#' "dispersion", "bias", "interval_coverage_50", "interval_coverage_90", -#' "interval_coverage_deviation", and "ae_median" -#' - For `output_type == "pmf"` and `output_type_id_order` is `NULL` (indicating -#' that the predicted variable is a nominal variable), we use the default metrics -#' provided by `scoringutils::metrics_nominal()`, currently just "log_score" -#' - For `output_type == "median"`, we use "ae_point" -#' - For `output_type == "mean"`, we use "se_point" +#' \itemize{ +#' \item For `output_type == "quantile"`, we use the default metrics provided by +#' `scoringutils::metrics_quantile()`: `r names(scoringutils::metrics_quantile())` +#' \item For `output_type == "pmf"` and `output_type_id_order` is `NULL` (indicating +#' that the predicted variable is a nominal variable), we use the default metric +#' provided by `scoringutils::metrics_nominal()`, +#' `r names(scoringutils::metrics_nominal())` +#' \item For `output_type == "median"`, we use "ae_point" +#' \item For `output_type == "mean"`, we use "se_point" +#' } #' #' Alternatively, a character vector of scoring metrics can be provided. In this #' case, the following options are supported: -#' - `output_type == "median"` and `output_type == "median"`: -#' - "ae": absolute error of a point prediction (generally recommended for the median) -#' - "se": squared error of a point prediction (generally recommended for the mean) -#' - `output_type == "quantile"`: +#' - `output_type == "median"` and `output_type == "mean"`: +#' - "ae_point": absolute error of a point prediction (generally recommended for the median) +#' - "se_point": squared error of a point prediction (generally recommended for the mean) +#' - `output_type == "quantile"`: #' - "ae_median": absolute error of the predictive median (i.e., the quantile at probability level 0.5) #' - "wis": weighted interval score (WIS) of a collection of quantile predictions #' - "overprediction": The component of WIS measuring the extent to which @@ -42,7 +43,7 @@ #' - "interval_coverage_XX": interval coverage at the "XX" level. For example, #' "interval_coverage_95" is the 95% interval coverage rate, which would be calculated #' based on quantiles at the probability levels 0.025 and 0.975. 
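The inline `r ...` chunks introduced above are evaluated by roxygen2 (the package enables markdown via Roxygen: list(markdown = TRUE)) when the documentation is rebuilt, so the listed default metric names stay in sync with scoringutils. A minimal illustration of the mechanism:

# roxygen2 evaluates the backtick-r expression when devtools::document() runs,
# inserting the current default metric names into the generated .Rd file:
#' @details
#' The default quantile metrics are
#' `r names(scoringutils::metrics_quantile())`.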
-#' - `output_type == "pmf"`: +#' - `output_type == "pmf"`: #' - "log_score": log score #' #' For more flexibility, it is also possible to directly provide a list of diff --git a/man/score_model_out.Rd b/man/score_model_out.Rd index 15fefb2..f7f8dba 100644 --- a/man/score_model_out.Rd +++ b/man/score_model_out.Rd @@ -43,12 +43,11 @@ If \code{metrics} is \code{NULL} (the default), this function chooses appropriate metrics based on the \code{output_type} contained in the \code{model_out_tbl}: \itemize{ \item For \code{output_type == "quantile"}, we use the default metrics provided by -\code{scoringutils::metrics_quantile()}: "wis", "overprediction", "underprediction", -"dispersion", "bias", "interval_coverage_50", "interval_coverage_90", -"interval_coverage_deviation", and "ae_median" +\code{scoringutils::metrics_quantile()}: wis, overprediction, underprediction, dispersion, bias, interval_coverage_50, interval_coverage_90, interval_coverage_deviation, ae_median \item For \code{output_type == "pmf"} and \code{output_type_id_order} is \code{NULL} (indicating -that the predicted variable is a nominal variable), we use the default metrics -provided by \code{scoringutils::metrics_nominal()}, currently just "log_score" +that the predicted variable is a nominal variable), we use the default metric +provided by \code{scoringutils::metrics_nominal()}, +log_score \item For \code{output_type == "median"}, we use "ae_point" \item For \code{output_type == "mean"}, we use "se_point" } @@ -56,10 +55,10 @@ provided by \code{scoringutils::metrics_nominal()}, currently just "log_score" Alternatively, a character vector of scoring metrics can be provided. In this case, the following options are supported: \itemize{ -\item \code{output_type == "median"} and \code{output_type == "median"}: +\item \code{output_type == "median"} and \code{output_type == "mean"}: \itemize{ -\item "ae": absolute error of a point prediction (generally recommended for the median) -\item "se": squared error of a point prediction (generally recommended for the mean) +\item "ae_point": absolute error of a point prediction (generally recommended for the median) +\item "se_point": squared error of a point prediction (generally recommended for the mean) } \item \code{output_type == "quantile"}: \itemize{ From 587c92202633b3f2f45aeec2cbed95e881b1dd5a Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Wed, 11 Sep 2024 16:53:28 -0400 Subject: [PATCH 30/31] do not support lists of functions in score_model_out --- R/score_model_out.R | 17 +++----- tests/testthat/test-score_model_out.R | 60 +++++---------------------- 2 files changed, 16 insertions(+), 61 deletions(-) diff --git a/R/score_model_out.R b/R/score_model_out.R index adfbd28..a2c8525 100644 --- a/R/score_model_out.R +++ b/R/score_model_out.R @@ -3,7 +3,7 @@ #' @param model_out_tbl Model output tibble with predictions #' @param target_observations Observed 'ground truth' data to be compared to #' predictions -#' @param metrics Optional list of scoring metrics to compute. See details for more. +#' @param metrics Optional character vector of scoring metrics to compute. See details for more. #' @param summarize Boolean indicator of whether summaries of forecast scores #' should be computed. Defaults to `TRUE`. #' @param by Character vector naming columns to summarize by. For example, @@ -46,12 +46,6 @@ #' - `output_type == "pmf"`: #' - "log_score": log score #' -#' For more flexibility, it is also possible to directly provide a list of -#' functions to compute the desired metrics, e.g. 
as would be created by one of -#' the `scoringutils::metrics_*` methods. Note that in this case, `hubEvals` -#' only validates that a list of functions has been provided; no checks for the -#' statistical validity of these metric functions are done. -#' #' @examplesIf requireNamespace("hubExamples", quietly = TRUE) #' # compute WIS and interval coverage rates at 80% and 90% levels based on #' # quantile forecasts, summarized by the mean score for each model @@ -60,7 +54,7 @@ #' dplyr::filter(.data[["output_type"]] == "quantile"), #' target_observations = hubExamples::forecast_target_observations, #' metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), -#' by = c("model_id") +#' by = "model_id" #' ) #' quantile_scores #' @@ -133,10 +127,9 @@ get_metrics <- function(metrics, output_type, output_type_id_order) { } else if (is.character(metrics)) { return(get_metrics_character(metrics, output_type)) } else { - # We do a minimal preliminary check that the provided `metrics` is a list of - # functions, leaving further checks to scoringutils - checkmate::assert_list(metrics, types = "function") - return(metrics) + cli::cli_abort( + "{.arg metrics} must be either `NULL` or a character vector of supported metrics." + ) } } diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R index a77c764..8987ee9 100644 --- a/tests/testthat/test-score_model_out.R +++ b/tests/testthat/test-score_model_out.R @@ -125,54 +125,6 @@ test_that("score_model_out succeeds with valid inputs: mean output_type, charact }) -test_that("score_model_out succeeds with valid inputs: mean output_type, function metrics, custom by", { - # Forecast data from HubExamples: - load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs - load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations - - act_scores <- score_model_out( - model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), - target_observations = forecast_target_observations, - metrics = scoringutils::metrics_point(), - by = c("model_id", "location") - ) - - exp_scores <- forecast_outputs |> - dplyr::filter(.data[["output_type"]] == "mean") |> - dplyr::left_join( - forecast_target_observations |> - dplyr::filter(.data[["output_type"]] == "mean"), - by = c("location", "target_end_date", "target") - ) |> - dplyr::mutate( - ae = abs(.data[["value"]] - .data[["observation"]]), - se = (.data[["value"]] - .data[["observation"]])^2, - pe = abs(.data[["value"]] - .data[["observation"]]) / abs(.data[["observation"]]) - ) |> - dplyr::group_by(dplyr::across(dplyr::all_of( - c("model_id", "location") - ))) |> - dplyr::summarize( - ae_point = mean(.data[["ae"]]), - se_point = mean(.data[["se"]]), - ape = mean(.data[["pe"]]), - .groups = "drop" - ) - - # same column names, number of rows, and score values - expect_equal(colnames(act_scores), colnames(exp_scores)) - expect_equal(nrow(act_scores), nrow(exp_scores)) - merged_scores <- dplyr::full_join( - act_scores, exp_scores, - by = c("model_id", "location") - ) - expect_equal(nrow(act_scores), nrow(merged_scores)) - expect_equal(merged_scores$ae_point.x, merged_scores$ae_point.y) - expect_equal(merged_scores$se_point.x, merged_scores$se_point.y) - expect_equal(merged_scores$ape.x, merged_scores$ape.y) -}) - - test_that("score_model_out succeeds with valid inputs: median output_type, default metrics, summarize FALSE", { # Forecast data from HubExamples: load(test_path("testdata/forecast_outputs.rda")) # sets 
forecast_outputs @@ -482,7 +434,17 @@ test_that("score_model_out errors when invalid metrics are requested", { target_observations = forecast_target_observations, metrics = list(5, 6, "asdf") ), - regexp = "Assertion on 'metrics' failed" + regexp = "`metrics` must be either `NULL` or a character vector of supported metrics." + ) + + expect_error( + score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), + target_observations = forecast_target_observations, + metrics = scoringutils::metrics_point(), + by = c("model_id", "location") + ), + regexp = "`metrics` must be either `NULL` or a character vector of supported metrics." ) }) From 78f729535c7a16c5c9751ce32c14e1358c9ba44a Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Wed, 11 Sep 2024 16:56:57 -0400 Subject: [PATCH 31/31] update docs --- man/score_model_out.Rd | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/man/score_model_out.Rd b/man/score_model_out.Rd index f7f8dba..8c42a92 100644 --- a/man/score_model_out.Rd +++ b/man/score_model_out.Rd @@ -19,12 +19,12 @@ score_model_out( \item{target_observations}{Observed 'ground truth' data to be compared to predictions} -\item{metrics}{Optional list of scoring metrics to compute. See details for more.} +\item{metrics}{Optional character vector of scoring metrics to compute. See details for more.} -\item{summarize}{boolean indicator of whether summaries of forecast scores +\item{summarize}{Boolean indicator of whether summaries of forecast scores should be computed. Defaults to \code{TRUE}.} -\item{by}{character vector naming columns to summarize by. For example, +\item{by}{Character vector naming columns to summarize by. For example, specifying \code{by = "model_id"} (the default) will compute average scores for each model.} @@ -79,12 +79,6 @@ based on quantiles at the probability levels 0.025 and 0.975. \item "log_score": log score } } - -For more flexibility, it is also possible to directly provide a list of -functions to compute the desired metrics, e.g. as would be created by one of -the \verb{scoringutils::metrics_*} methods. Note that in this case, \code{hubEvals} -only validates that a list of functions has been provided; no checks for the -statistical validity of these metric functions are done. } \examples{ \dontshow{if (requireNamespace("hubExamples", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} @@ -95,7 +89,7 @@ quantile_scores <- score_model_out( dplyr::filter(.data[["output_type"]] == "quantile"), target_observations = hubExamples::forecast_target_observations, metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), - by = c("model_id") + by = "model_id" ) quantile_scores