diff --git a/DESCRIPTION b/DESCRIPTION index 66558a6..c711112 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,14 +38,17 @@ Description: basic tools for scoring hubverse forecasts. License: MIT + file LICENSE Encoding: UTF-8 -Imports: +Imports: + checkmate, cli, dplyr, hubUtils, + purrr, rlang, scoringutils (>= 1.2.2.9000) Remotes: epiforecasts/scoringutils, + hubverse-org/hubExamples, hubverse-org/hubUtils Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.1 @@ -55,5 +58,6 @@ Depends: LazyData: true Config/Needs/website: hubverse-org/hubStyle Suggests: + hubExamples, testthat (>= 3.0.0) Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 857392c..41a5e71 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,3 +1,4 @@ # Generated by roxygen2: do not edit by hand +export(score_model_out) importFrom(rlang,.data) diff --git a/R/score_model_out.R b/R/score_model_out.R new file mode 100644 index 0000000..a2c8525 --- /dev/null +++ b/R/score_model_out.R @@ -0,0 +1,241 @@ +#' Score model output predictions with a single `output_type` against observed data +#' +#' @param model_out_tbl Model output tibble with predictions +#' @param target_observations Observed 'ground truth' data to be compared to +#' predictions +#' @param metrics Optional character vector of scoring metrics to compute. See details for more. +#' @param summarize Boolean indicator of whether summaries of forecast scores +#' should be computed. Defaults to `TRUE`. +#' @param by Character vector naming columns to summarize by. For example, +#' specifying `by = "model_id"` (the default) will compute average scores for +#' each model. +#' @param output_type_id_order For ordinal variables in pmf format, this is a +#' vector of levels for pmf forecasts, in increasing order of the levels. For +#' all other output types, this is ignored. +#' +#' @details If `metrics` is `NULL` (the default), this function chooses +#' appropriate metrics based on the `output_type` contained in the `model_out_tbl`: +#' \itemize{ +#' \item For `output_type == "quantile"`, we use the default metrics provided by +#' `scoringutils::metrics_quantile()`: `r names(scoringutils::metrics_quantile())` +#' \item For `output_type == "pmf"` and `output_type_id_order` is `NULL` (indicating +#' that the predicted variable is a nominal variable), we use the default metric +#' provided by `scoringutils::metrics_nominal()`, +#' `r names(scoringutils::metrics_nominal())` +#' \item For `output_type == "median"`, we use "ae_point" +#' \item For `output_type == "mean"`, we use "se_point" +#' } +#' +#' Alternatively, a character vector of scoring metrics can be provided. In this +#' case, the following options are supported: +#' - `output_type == "median"` and `output_type == "mean"`: +#' - "ae_point": absolute error of a point prediction (generally recommended for the median) +#' - "se_point": squared error of a point prediction (generally recommended for the mean) +#' - `output_type == "quantile"`: +#' - "ae_median": absolute error of the predictive median (i.e., the quantile at probability level 0.5) +#' - "wis": weighted interval score (WIS) of a collection of quantile predictions +#' - "overprediction": The component of WIS measuring the extent to which +#' predictions fell above the observation. +#' - "underprediction": The component of WIS measuring the extent to which +#' predictions fell below the observation. +#' - "dispersion": The component of WIS measuring the dispersion of forecast +#' distributions. 
+#' - "interval_coverage_XX": interval coverage at the "XX" level. For example,
+#'   "interval_coverage_95" is the 95% interval coverage rate, which would be calculated
+#'   based on quantiles at the probability levels 0.025 and 0.975.
+#' - `output_type == "pmf"`:
+#'   - "log_score": log score
+#'
+#' @examplesIf requireNamespace("hubExamples", quietly = TRUE)
+#' # compute WIS and interval coverage rates at 80% and 90% levels based on
+#' # quantile forecasts, summarized by the mean score for each model
+#' quantile_scores <- score_model_out(
+#'   model_out_tbl = hubExamples::forecast_outputs |>
+#'     dplyr::filter(.data[["output_type"]] == "quantile"),
+#'   target_observations = hubExamples::forecast_target_observations,
+#'   metrics = c("wis", "interval_coverage_80", "interval_coverage_90"),
+#'   by = "model_id"
+#' )
+#' quantile_scores
+#'
+#' # compute log scores based on pmf predictions for categorical targets,
+#' # summarized by the mean score for each combination of model and location.
+#' # Note: if the model_out_tbl had forecasts for multiple targets using a
+#' # pmf output_type with different bins, it would be necessary to score the
+#' # predictions for those targets separately.
+#' pmf_scores <- score_model_out(
+#'   model_out_tbl = hubExamples::forecast_outputs |>
+#'     dplyr::filter(.data[["output_type"]] == "pmf"),
+#'   target_observations = hubExamples::forecast_target_observations,
+#'   metrics = "log_score",
+#'   by = c("model_id", "location", "horizon")
+#' )
+#' head(pmf_scores)
+#'
+#' @return A data frame of scores, with one row per `by` group (or one row per forecast if `summarize = FALSE`) and one column per scoring metric.
+#'
+#' @export
+score_model_out <- function(model_out_tbl, target_observations, metrics = NULL,
+                            summarize = TRUE, by = "model_id",
+                            output_type_id_order = NULL) {
+  # check that model_out_tbl has a single output_type that is supported by this package
+  # also, retrieve that output_type
+  output_type <- validate_output_type(model_out_tbl)
+
+  # get/validate the scoring metrics
+  metrics <- get_metrics(metrics, output_type, output_type_id_order)
+
+  # assemble data for scoringutils
+  su_data <- switch(output_type,
+    quantile = transform_quantile_model_out(model_out_tbl, target_observations),
+    pmf = transform_pmf_model_out(model_out_tbl, target_observations, output_type_id_order),
+    mean = transform_point_model_out(model_out_tbl, target_observations, output_type),
+    median = transform_point_model_out(model_out_tbl, target_observations, output_type),
+    NULL # default, should not happen because of the validation above
+  )
+
+  # compute scores
+  scores <- scoringutils::score(su_data, metrics)
+
+  # switch back to hubverse naming conventions for model name
+  scores <- dplyr::rename(scores, model_id = "model")
+
+  # if present, drop predicted and observed columns
+  drop_cols <- c("predicted", "observed")
+  scores <- scores[!colnames(scores) %in% drop_cols]
+
+  # if requested, summarize scores
+  if (summarize) {
+    scores <- scoringutils::summarize_scores(scores = scores, by = by)
+  }
+
+  return(scores)
+}
+
+
+#' Get metrics if user didn't specify anything; otherwise, process
+#' and validate user inputs
+#'
+#' @inheritParams score_model_out
+#'
+#' @return a list of metric functions as required by scoringutils::score()
+#'
+#' @noRd
+get_metrics <- function(metrics, output_type, output_type_id_order) {
+  if (is.null(metrics)) {
+    return(get_metrics_default(output_type, output_type_id_order))
+  } else if (is.character(metrics)) {
+    return(get_metrics_character(metrics, output_type))
+  } else {
+    cli::cli_abort(
+      "{.arg metrics} must be either `NULL` or a character vector of
supported metrics." + ) + } +} + + +#' Default metrics if user didn't specify anything +#' +#' @inheritParams score_model_out +#' +#' @return a list of metric functions as required by scoringutils::score() +#' +#' @noRd +get_metrics_default <- function(output_type, output_type_id_order) { + metrics <- switch(output_type, + quantile = scoringutils::metrics_quantile(), + pmf = scoringutils::metrics_nominal(), + mean = scoringutils::metrics_point(select = "se_point"), + median = scoringutils::metrics_point(select = "ae_point"), + NULL # default + ) + if (is.null(metrics)) { + # we have already validated `output_type`, so this case should not be + # triggered; this case is just double checking in case we add something new + # later, to ensure we update this function. + supported_types <- c("mean", "median", "pmf", "quantile") # nolint object_use_linter + cli::cli_abort( + "Provided `model_out_tbl` contains `output_type` {.val {output_type}}; + hubEvals currently only supports the following types: + {.val {supported_types}}" + ) + } + + return(metrics) +} + + +#' Convert character vector of metrics to list of functions +#' +#' @inheritParams score_model_out +#' +#' @return a list of metric functions as required by scoringutils::score() +#' +#' @noRd +get_metrics_character <- function(metrics, output_type) { + if (output_type == "quantile") { + # split into metrics for interval coverage and others + interval_metric_inds <- grepl(pattern = "^interval_coverage_[[:digit:]][[:digit:]]$", metrics) + interval_metrics <- metrics[interval_metric_inds] + other_metrics <- metrics[!interval_metric_inds] + + # validate metrics + valid_metrics <- c("ae_median", "wis", "overprediction", "underprediction", "dispersion") + invalid_metrics <- other_metrics[!other_metrics %in% valid_metrics] + error_if_invalid_metrics( + valid_metrics = c(valid_metrics, "interval_coverage_XY"), + invalid_metrics = invalid_metrics, + output_type = output_type, + comment = c("i" = "NOTE: `XY` denotes the coverage level, e.g. {.val interval_coverage_95}.") + ) + + # assemble metric functions + interval_metric_fns <- lapply( + interval_metrics, + function(metric) { + level <- as.integer(substr(metric, 19, 20)) + return(purrr::partial(scoringutils::interval_coverage, interval_range = level)) + } + ) + names(interval_metric_fns) <- interval_metrics + + other_metric_fns <- scoringutils::metrics_quantile(select = other_metrics) + + metric_fns <- c(other_metric_fns, interval_metric_fns)[metrics] + metrics <- metric_fns + } else if (output_type == "pmf") { + valid_metrics <- c("log_score") + invalid_metrics <- metrics[!metrics %in% valid_metrics] + error_if_invalid_metrics(valid_metrics, invalid_metrics, output_type) + + metrics <- scoringutils::metrics_nominal(select = metrics) + } else if (output_type %in% c("median", "mean")) { + valid_metrics <- c("ae_point", "se_point") + invalid_metrics <- metrics[!metrics %in% valid_metrics] + error_if_invalid_metrics(valid_metrics, invalid_metrics, output_type) + + metrics <- scoringutils::metrics_point(select = metrics) + } else { + # we have already validated `output_type`, so this case should not be + # triggered; this case is just double checking in case we add something new + # later, to ensure we update this function. 
+    error_if_invalid_output_type(output_type)
+  }
+
+  return(metrics)
+}
+
+
+error_if_invalid_metrics <- function(valid_metrics, invalid_metrics, output_type, comment = NULL) {
+  n <- length(invalid_metrics)
+  if (n > 0) {
+    cli::cli_abort(
+      c(
+        "`metrics` had {n} unsupported metric{?s} for
+        {.arg output_type} {.val {output_type}}: {.strong {.val {invalid_metrics}}};
+        supported metrics include {.val {valid_metrics}}.",
+        comment
+      )
+    )
+  }
+}
diff --git a/R/validate.R b/R/validate.R
index 82e4191..07db0c7 100644
--- a/R/validate.R
+++ b/R/validate.R
@@ -66,3 +66,36 @@ validate_model_out_target_obs <- function(model_out_tbl, target_observations) {
 
   return(model_out_tbl)
 }
+
+
+#' Check that model_out_tbl has a single `output_type` that is one of the
+#' output types supported by this package.
+#'
+#' @return if valid, the output_type in model_out_tbl
+#'
+#' @noRd
+validate_output_type <- function(model_out_tbl) {
+  output_type <- unique(model_out_tbl$output_type)
+  if (length(output_type) != 1) {
+    cli::cli_abort(
+      "model_out_tbl must contain a single output_type, but it has multiple:
+      {.val {output_type}}"
+    )
+  }
+
+  error_if_invalid_output_type(output_type)
+
+  return(output_type)
+}
+
+
+error_if_invalid_output_type <- function(output_type) {
+  supported_types <- c("mean", "median", "pmf", "quantile")
+  if (!output_type %in% supported_types) {
+    cli::cli_abort(
+      "Provided `model_out_tbl` contains `output_type` {.val {output_type}};
+      hubEvals currently only supports the following types:
+      {.val {supported_types}}"
+    )
+  }
+}
diff --git a/man/score_model_out.Rd b/man/score_model_out.Rd
new file mode 100644
index 0000000..8c42a92
--- /dev/null
+++ b/man/score_model_out.Rd
@@ -0,0 +1,110 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/score_model_out.R
+\name{score_model_out}
+\alias{score_model_out}
+\title{Score model output predictions with a single \code{output_type} against observed data}
+\usage{
+score_model_out(
+  model_out_tbl,
+  target_observations,
+  metrics = NULL,
+  summarize = TRUE,
+  by = "model_id",
+  output_type_id_order = NULL
+)
+}
+\arguments{
+\item{model_out_tbl}{Model output tibble with predictions}
+
+\item{target_observations}{Observed 'ground truth' data to be compared to
+predictions}
+
+\item{metrics}{Optional character vector of scoring metrics to compute. See details for more.}
+
+\item{summarize}{Boolean indicator of whether summaries of forecast scores
+should be computed. Defaults to \code{TRUE}.}
+
+\item{by}{Character vector naming columns to summarize by. For example,
+specifying \code{by = "model_id"} (the default) will compute average scores for
+each model.}
+
+\item{output_type_id_order}{For ordinal variables in pmf format, this is a
+vector of levels for pmf forecasts, in increasing order of the levels. For
+all other output types, this is ignored.}
+}
+\value{
+A data frame of scores, with one row per \code{by} group (or one row per forecast if \code{summarize = FALSE}) and one column per scoring metric.
+}
+\description{
+Score model output predictions with a single \code{output_type} against observed data
+}
+\details{
+If \code{metrics} is \code{NULL} (the default), this function chooses
+appropriate metrics based on the \code{output_type} contained in the \code{model_out_tbl}:
+\itemize{
+\item For \code{output_type == "quantile"}, we use the default metrics provided by
+\code{scoringutils::metrics_quantile()}: wis, overprediction, underprediction, dispersion, bias, interval_coverage_50, interval_coverage_90, interval_coverage_deviation, ae_median
+\item For \code{output_type == "pmf"} and \code{output_type_id_order} is \code{NULL} (indicating
+that the predicted variable is a nominal variable), we use the default metric
+provided by \code{scoringutils::metrics_nominal()},
+log_score
+\item For \code{output_type == "median"}, we use "ae_point"
+\item For \code{output_type == "mean"}, we use "se_point"
+}
+
+Alternatively, a character vector of scoring metrics can be provided. In this
+case, the following options are supported:
+\itemize{
+\item \code{output_type == "median"} and \code{output_type == "mean"}:
+\itemize{
+\item "ae_point": absolute error of a point prediction (generally recommended for the median)
+\item "se_point": squared error of a point prediction (generally recommended for the mean)
+}
+\item \code{output_type == "quantile"}:
+\itemize{
+\item "ae_median": absolute error of the predictive median (i.e., the quantile at probability level 0.5)
+\item "wis": weighted interval score (WIS) of a collection of quantile predictions
+\item "overprediction": The component of WIS measuring the extent to which
+predictions fell above the observation.
+\item "underprediction": The component of WIS measuring the extent to which
+predictions fell below the observation.
+\item "dispersion": The component of WIS measuring the dispersion of forecast
+distributions.
+\item "interval_coverage_XX": interval coverage at the "XX" level. For example,
+"interval_coverage_95" is the 95\% interval coverage rate, which would be calculated
+based on quantiles at the probability levels 0.025 and 0.975.
+}
+\item \code{output_type == "pmf"}:
+\itemize{
+\item "log_score": log score
+}
+}
+}
+\examples{
+\dontshow{if (requireNamespace("hubExamples", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# compute WIS and interval coverage rates at 80\% and 90\% levels based on
+# quantile forecasts, summarized by the mean score for each model
+quantile_scores <- score_model_out(
+  model_out_tbl = hubExamples::forecast_outputs |>
+    dplyr::filter(.data[["output_type"]] == "quantile"),
+  target_observations = hubExamples::forecast_target_observations,
+  metrics = c("wis", "interval_coverage_80", "interval_coverage_90"),
+  by = "model_id"
+)
+quantile_scores
+
+# compute log scores based on pmf predictions for categorical targets,
+# summarized by the mean score for each combination of model and location.
+# Note: if the model_out_tbl had forecasts for multiple targets using a
+# pmf output_type with different bins, it would be necessary to score the
+# predictions for those targets separately.
+pmf_scores <- score_model_out( + model_out_tbl = hubExamples::forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "pmf"), + target_observations = hubExamples::forecast_target_observations, + metrics = "log_score", + by = c("model_id", "location", "horizon") +) +head(pmf_scores) +\dontshow{\}) # examplesIf} +} diff --git a/tests/testthat/test-score_model_out.R b/tests/testthat/test-score_model_out.R new file mode 100644 index 0000000..8987ee9 --- /dev/null +++ b/tests/testthat/test-score_model_out.R @@ -0,0 +1,466 @@ +test_that("score_model_out succeeds with valid inputs: mean output_type, default metrics, by all", { + # Forecast data from HubExamples: + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), + target_observations = forecast_target_observations, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "mean") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "mean"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + se = (.data[["value"]] - .data[["observation"]])^2 + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + se_point = mean(.data[["se"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$se_point.x, merged_scores$se_point.y) +}) + + +test_that("score_model_out succeeds with valid inputs: mean output_type, default metrics, summarize FALSE", { + # Forecast data from HubExamples: + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), + target_observations = forecast_target_observations, + summarize = FALSE + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "mean") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "mean"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + se = (.data[["value"]] - .data[["observation"]])^2 + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + se_point = mean(.data[["se"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + expect_equal(nrow(act_scores), 
nrow(merged_scores)) + expect_equal(merged_scores$se_point.x, merged_scores$se_point.y) +}) + + +test_that("score_model_out succeeds with valid inputs: mean output_type, character metrics, custom by", { + # Forecast data from HubExamples: + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"), + target_observations = forecast_target_observations, + metrics = c("ae_point", "se_point"), + by = c("model_id", "location") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "mean") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "mean"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + ae = abs(.data[["value"]] - .data[["observation"]]), + se = (.data[["value"]] - .data[["observation"]])^2 + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location") + ))) |> + dplyr::summarize( + ae_point = mean(.data[["ae"]]), + se_point = mean(.data[["se"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$ae_point.x, merged_scores$ae_point.y) + expect_equal(merged_scores$se_point.x, merged_scores$se_point.y) +}) + + +test_that("score_model_out succeeds with valid inputs: median output_type, default metrics, summarize FALSE", { + # Forecast data from HubExamples: + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "median"), + target_observations = forecast_target_observations, + summarize = FALSE + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "median") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "median"), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + ae = abs(.data[["value"]] - .data[["observation"]]) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + ae_point = mean(.data[["ae"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$ae_point.x, merged_scores$ae_point.y) +}) + + +test_that("score_model_out succeeds with valid inputs: quantile output_type, wis and interval metrics, custom by", { + # Forecast data from HubExamples: + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets 
forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), + by = c("model_id", "location") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "quantile") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "quantile") |> + dplyr::select(-dplyr::all_of(c("output_type", "output_type_id"))), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + output_type_id = as.numeric(.data[["output_type_id"]]), + qs = ifelse( + .data[["observation"]] >= .data[["value"]], + .data[["output_type_id"]] * (.data[["observation"]] - .data[["value"]]), + (1 - .data[["output_type_id"]]) * (.data[["value"]] - .data[["observation"]]) + ), + q_coverage_80_lower = ifelse( + .data[["output_type_id"]] == 0.1, + .data[["observation"]] >= .data[["value"]], + NA_real_ + ), + q_coverage_80_upper = ifelse( + .data[["output_type_id"]] == 0.9, + .data[["observation"]] <= .data[["value"]], + NA_real_ + ), + q_coverage_90_lower = ifelse( + .data[["output_type_id"]] == 0.05, + .data[["observation"]] >= .data[["value"]], + NA_real_ + ), + q_coverage_90_upper = ifelse( + .data[["output_type_id"]] == 0.95, + .data[["observation"]] <= .data[["value"]], + NA_real_ + ) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + wis = 2 * mean(.data[["qs"]]), + interval_coverage_80 = (sum(.data[["q_coverage_80_lower"]], na.rm = TRUE) == 1) * + (sum(.data[["q_coverage_80_upper"]], na.rm = TRUE) == 1), + interval_coverage_90 = (sum(.data[["q_coverage_90_lower"]], na.rm = TRUE) == 1) * + (sum(.data[["q_coverage_90_upper"]], na.rm = TRUE) == 1) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location") + ))) |> + dplyr::summarize( + wis = mean(.data[["wis"]]), + interval_coverage_80 = mean(.data[["interval_coverage_80"]], na.rm = TRUE), + interval_coverage_90 = mean(.data[["interval_coverage_90"]], na.rm = TRUE), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$wis.x, merged_scores$wis.y) + expect_equal(merged_scores$interval_coverage_80.x, merged_scores$interval_coverage_80.y) + expect_equal(merged_scores$interval_coverage_90.x, merged_scores$interval_coverage_90.y) +}) + + +test_that("score_model_out succeeds with valid inputs: quantile output_type, wis/interval metrics, summarize FALSE", { + # Forecast data from HubExamples: + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"), + target_observations = forecast_target_observations, + metrics = c("wis", "interval_coverage_80", "interval_coverage_90"), + summarize = FALSE + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "quantile") |> + 
dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "quantile") |> + dplyr::select(-dplyr::all_of(c("output_type", "output_type_id"))), + by = c("location", "target_end_date", "target") + ) |> + dplyr::mutate( + output_type_id = as.numeric(.data[["output_type_id"]]), + qs = ifelse( + .data[["observation"]] >= .data[["value"]], + .data[["output_type_id"]] * (.data[["observation"]] - .data[["value"]]), + (1 - .data[["output_type_id"]]) * (.data[["value"]] - .data[["observation"]]) + ), + q_coverage_80_lower = ifelse( + .data[["output_type_id"]] == 0.1, + .data[["observation"]] >= .data[["value"]], + NA_real_ + ), + q_coverage_80_upper = ifelse( + .data[["output_type_id"]] == 0.9, + .data[["observation"]] <= .data[["value"]], + NA_real_ + ), + q_coverage_90_lower = ifelse( + .data[["output_type_id"]] == 0.05, + .data[["observation"]] >= .data[["value"]], + NA_real_ + ), + q_coverage_90_upper = ifelse( + .data[["output_type_id"]] == 0.95, + .data[["observation"]] <= .data[["value"]], + NA_real_ + ) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ))) |> + dplyr::summarize( + wis = 2 * mean(.data[["qs"]]), + interval_coverage_80 = as.logical((sum(.data[["q_coverage_80_lower"]], na.rm = TRUE) == 1) * + (sum(.data[["q_coverage_80_upper"]], na.rm = TRUE) == 1)), + interval_coverage_90 = as.logical((sum(.data[["q_coverage_90_lower"]], na.rm = TRUE) == 1) * + (sum(.data[["q_coverage_90_upper"]], na.rm = TRUE) == 1)), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- dplyr::full_join( + act_scores, exp_scores, + by = c("model_id", "location", "reference_date", "horizon", "target_end_date", "target") + ) + expect_equal(nrow(act_scores), nrow(merged_scores)) + expect_equal(merged_scores$wis.x, merged_scores$wis.y) + expect_equal(merged_scores$interval_coverage_80.x, merged_scores$interval_coverage_80.y) + expect_equal(merged_scores$interval_coverage_90.x, merged_scores$interval_coverage_90.y) +}) + + +test_that("score_model_out succeeds with valid inputs: nominal pmf output_type, default metrics, custom by", { + # Forecast data from HubExamples: + load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs + load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations + + act_scores <- score_model_out( + model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "pmf"), + target_observations = forecast_target_observations, + by = c("model_id", "location") + ) + + exp_scores <- forecast_outputs |> + dplyr::filter(.data[["output_type"]] == "pmf") |> + dplyr::left_join( + forecast_target_observations |> + dplyr::filter(.data[["output_type"]] == "pmf") |> + dplyr::select(-dplyr::all_of(c("output_type"))), + by = c("location", "target_end_date", "target", "output_type_id") + ) |> + dplyr::filter(.data[["observation"]] == 1) |> + dplyr::mutate( + log_score = -1 * log(.data[["value"]]) + ) |> + dplyr::group_by(dplyr::across(dplyr::all_of( + c("model_id", "location") + ))) |> + dplyr::summarize( + log_score = mean(.data[["log_score"]]), + .groups = "drop" + ) + + # same column names, number of rows, and score values + expect_equal(colnames(act_scores), colnames(exp_scores)) + expect_equal(nrow(act_scores), nrow(exp_scores)) + merged_scores <- 
dplyr::full_join(
+    act_scores, exp_scores,
+    by = c("model_id", "location")
+  )
+  expect_equal(nrow(act_scores), nrow(merged_scores))
+  expect_equal(merged_scores$log_score.x, merged_scores$log_score.y)
+})
+
+
+test_that("score_model_out errors when model_out_tbl has multiple output_types", {
+  # Forecast data from HubExamples:
+  load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs
+  load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations
+
+  expect_error(
+    score_model_out(
+      model_out_tbl = forecast_outputs,
+      target_observations = forecast_target_observations
+    ),
+    regexp = "model_out_tbl must contain a single output_type, but it has multiple"
+  )
+})
+
+
+test_that("score_model_out errors when invalid interval levels are requested", {
+  # Forecast data from HubExamples:
+  load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs
+  load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations
+
+  expect_error(
+    score_model_out(
+      model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"),
+      target_observations = forecast_target_observations,
+      metrics = "interval_level_5"
+    ),
+    regexp = "unsupported metric"
+  )
+
+  expect_error(
+    score_model_out(
+      model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"),
+      target_observations = forecast_target_observations,
+      metrics = "interval_level_100"
+    ),
+    regexp = "unsupported metric"
+  )
+
+  expect_error(
+    score_model_out(
+      model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"),
+      target_observations = forecast_target_observations,
+      metrics = "interval_level_XY"
+    ),
+    regexp = "unsupported metric"
+  )
+})
+
+
+test_that("score_model_out errors when invalid metrics are requested", {
+  # Forecast data from HubExamples:
+  load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs
+  load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations
+
+  expect_error(
+    score_model_out(
+      model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"),
+      target_observations = forecast_target_observations,
+      metrics = "log_score"
+    ),
+    regexp = "unsupported metric"
+  )
+
+  expect_error(
+    score_model_out(
+      model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "quantile"),
+      target_observations = forecast_target_observations,
+      metrics = list(5, 6, "asdf")
+    ),
+    regexp = "`metrics` must be either `NULL` or a character vector of supported metrics."
+  )
+
+  expect_error(
+    score_model_out(
+      model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "mean"),
+      target_observations = forecast_target_observations,
+      metrics = scoringutils::metrics_point(),
+      by = c("model_id", "location")
+    ),
+    regexp = "`metrics` must be either `NULL` or a character vector of supported metrics."
+  )
+})
+
+
+
+test_that("score_model_out errors when an unsupported output_type is provided", {
+  # Forecast data from HubExamples:
+  load(test_path("testdata/forecast_outputs.rda")) # sets forecast_outputs
+  load(test_path("testdata/forecast_target_observations.rda")) # sets forecast_target_observations
+
+  expect_error(
+    score_model_out(
+      model_out_tbl = forecast_outputs |> dplyr::filter(.data[["output_type"]] == "cdf"),
+      target_observations = forecast_target_observations,
+      metrics = "log_score"
+    ),
+    regexp = "only supports the following types"
+  )
+})
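# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the patch above): one way the
# exported score_model_out() could be called for point forecasts. This assumes
# the suggested hubExamples package is installed and uses its forecast_outputs
# and forecast_target_observations example data sets, as in the roxygen
# examples and tests in the diff.
point_scores <- hubEvals::score_model_out(
  model_out_tbl = hubExamples::forecast_outputs |>
    dplyr::filter(.data[["output_type"]] == "median"),
  target_observations = hubExamples::forecast_target_observations,
  metrics = "ae_point",
  summarize = FALSE
)

# With summarize = FALSE, one row is returned per forecast, so custom summaries
# can be computed afterwards, e.g. mean absolute error by model and horizon:
point_scores |>
  dplyr::group_by(dplyr::across(dplyr::all_of(c("model_id", "horizon")))) |>
  dplyr::summarize(mean_ae = mean(.data[["ae_point"]]), .groups = "drop")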