diff --git a/.Rbuildignore b/.Rbuildignore index 04a17fdfa..b84863208 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -18,3 +18,4 @@ ^\.devcontainer$ ^CODE_OF_CONDUCT\.md$ ^inst/manuscript/output$ +^CRAN-SUBMISSION$ diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..e42ea84ee --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,21 @@ + + +## Description + +This PR closes #. + +[Describe the changes that you made in this pull request.] + +## Checklist + +- [ ] My PR is based on a package issue and I have explicitly linked it. +- [ ] I have included the target issue or issues in the PR title as follows: *issue-number*: PR title +- [ ] I have tested my changes locally. +- [ ] I have added or updated unit tests where necessary. +- [ ] I have updated the documentation if required. +- [ ] I have built the package locally and run rebuilt docs using roxygen2. +- [ ] My code follows the established coding standards and I have run `lintr::lint_package()` to check for style issues introduced by my changes. +- [ ] I have added a news item linked to this PR. +- [ ] I have reviewed CI checks for this PR and addressed them as far as I am able. + + diff --git a/.gitignore b/.gitignore index 5bf8eecd3..6afe7eefa 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ docs ..bfg-report/ .DS_Store .vscode +README.html diff --git a/DESCRIPTION b/DESCRIPTION index 95e581a07..9c0e27c66 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,8 +41,8 @@ Description: Scoring metrics can be used either through a convenient data.frame format, or can be applied as individual functions in a vector / matrix format. All functionality has been implemented with a focus on performance and is - robustly tested. Find more information about scoringutils in the - accompanying paper (Bosse et al., 2022) <arXiv:2205.07090>. + robustly tested. Find more information about the package in the + accompanying paper (<doi:10.48550/arXiv.2205.07090>). 
License: MIT + file LICENSE Encoding: UTF-8 LazyData: true diff --git a/NEWS.md b/NEWS.md index e95602810..59f6ee4f8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -39,15 +39,26 @@ The update introduces breaking changes. If you want to keep using the older vers - `score()` now calls `na.omit()` on the data, instead of only removing rows with missing values in the columns `observed` and `predicted`. This is because `NA` values in other columns can also mess up e.g. grouping of forecasts according to the unit of a single forecast. - added documentation for the return value of `summarise_scores()`. - Removed abs_error and squared_error from the package in favour of `Metrics::ae` and `Metrics::se`. +- Added unit tests for `interval_coverage_quantile()` and `interval_coverage_dev_quantile()` in order to make sure that the functions provide the correct warnings when insufficient quantiles are provided. + +# scoringutils 1.2.2 + +## Package updates +- `scoringutils` now depends on R 3.6. The change was made since packages `testthat` and `lifecycle`, which are used in `scoringutils` now require R 3.6. We also updated the Github action CI check to work with R 3.6 now. +- Added a new PR template with a checklist of things to be included in PRs to facilitate the development and review process + +## Bug fixes +- Fixes a bug with `set_forecast_unit()` where the function only worked with a data.table, but not a data.frame as an input. +- The metrics table in the vignette [Details on the metrics implemented in `scoringutils`](https://epiforecasts.io/scoringutils/articles/metric-details.html) had duplicated entries. This was fixed by removing the duplicated rows. # scoringutils 1.2.1 ## Package updates - This minor update fixes a few issues related to gh actions and the vignettes displayed at epiforecasts.io/scoringutils. It - - gets rid of the preferably package in _pkgdown.yml. 
The theme had a toggle between light and dark theme that didn't work properly - - updates the gh pages deploy action to v4 and also cleans up files when triggered - - introduces a gh action to automatically render the Readme from Readme.Rmd - - removes links to vignettes that have been renamed + - Gets rid of the preferably package in _pkgdown.yml. The theme had a toggle between light and dark theme that didn't work properly + - Updates the gh pages deploy action to v4 and also cleans up files when triggered + - Introduces a gh action to automatically render the Readme from Readme.Rmd + - Removes links to vignettes that have been renamed # scoringutils 1.2.0 diff --git a/R/available_forecasts.R b/R/available_forecasts.R index 0ee143542..9c84142ba 100644 --- a/R/available_forecasts.R +++ b/R/available_forecasts.R @@ -29,7 +29,9 @@ #' @export #' @keywords check-forecasts #' @examples -#' data.table::setDTthreads(1) # only needed to avoid issues on CRAN +#' \dontshow{ +#' data.table::setDTthreads(2) # restricts number of cores used on CRAN +#' } #' #' get_forecast_counts(example_quantile, #' by = c("model", "target_type") diff --git a/R/data.R b/R/data.R index c9bf77109..0b67543b9 100644 --- a/R/data.R +++ b/R/data.R @@ -20,7 +20,7 @@ #' \item{horizon}{forecast horizon in weeks} #' } # nolint start -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint end "example_quantile" @@ -47,7 +47,7 @@ #' \item{horizon}{forecast horizon in weeks} #' } # nolint start -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # 
nolint end "example_point" @@ -74,7 +74,7 @@ #' \item{sample_id}{id for the corresponding sample} #' } # nolint start -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint end "example_continuous" @@ -101,7 +101,7 @@ #' \item{sample_id}{id for the corresponding sample} #' } # nolint start -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint end "example_integer" @@ -134,7 +134,7 @@ #' \item{predicted}{predicted value} #' } # nolint start -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint end "example_binary" @@ -159,7 +159,7 @@ #' \item{horizon}{forecast horizon in weeks} #' } # nolint start -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint end "example_quantile_forecasts_only" @@ -181,7 +181,7 @@ #' \item{location_name}{name of the country for which a prediction was made} #' } # nolint start -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +#' @source 
\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint end "example_truth_only" diff --git a/R/metrics-quantile.R b/R/metrics-quantile.R index af02ef30c..7426fd8d7 100644 --- a/R/metrics-quantile.R +++ b/R/metrics-quantile.R @@ -241,7 +241,8 @@ interval_coverage_quantile <- function(observed, predicted, quantile, range = 50 if (!all(necessary_quantiles %in% quantile)) { warning( "To compute the interval coverage for a range of ", range, - "%, the quantiles ", necessary_quantiles, " are required. Returning `NA`." + "%, the quantiles `", toString(necessary_quantiles), + "` are required. Returning `NA`." ) return(NA) } diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R index 525d1bca6..86e17c95b 100644 --- a/R/pairwise-comparisons.R +++ b/R/pairwise-comparisons.R @@ -51,7 +51,9 @@ #' @author Johannes Bracher, \email{johannes.bracher@@kit.edu} #' @keywords scoring #' @examples -#' data.table::setDTthreads(1) # only needed to avoid issues on CRAN +#' \dontshow{ +#' data.table::setDTthreads(2) # restricts number of cores used on CRAN +#' } #' #' scores <- score(example_quantile) #' pairwise <- pairwise_comparison(scores, by = "target_type") diff --git a/R/pit.R b/R/pit.R index 9d2793a2b..ab5a580d1 100644 --- a/R/pit.R +++ b/R/pit.R @@ -62,7 +62,9 @@ #' @seealso [pit()] #' @importFrom stats runif #' @examples -#' data.table::setDTthreads(1) # only needed to avoid issues on CRAN +#' \dontshow{ +#' data.table::setDTthreads(2) # restricts number of cores used on CRAN +#' } #' #' ## continuous predictions #' observed <- rnorm(20, mean = 1:20) diff --git a/R/plot.R b/R/plot.R index d93f36f1e..928ab7760 100644 --- a/R/plot.R +++ b/R/plot.R @@ -24,7 +24,9 @@ #' @examples #' library(ggplot2) #' library(magrittr) # pipe operator -#' data.table::setDTthreads(1) # only needed to avoid issues on CRAN +#' \dontshow{ +#' data.table::setDTthreads(2) # restricts number of cores used on CRAN +#' 
} #' #' scores <- score(example_quantile) %>% #' summarise_scores(by = c("model", "target_type")) %>% @@ -577,11 +579,12 @@ make_na <- make_NA #' @importFrom data.table dcast #' @export #' @examples -#' data.table::setDTthreads(1) # only needed to avoid issues on CRAN +#' \dontshow{ +#' data.table::setDTthreads(2) # restricts number of cores used on CRAN +#' } #' data_coverage <- add_coverage(example_quantile) #' summarised <- summarise_scores(data_coverage, by = c("model", "range")) #' plot_interval_coverage(summarised) - plot_interval_coverage <- function(scores, colour = "model") { ## overall model calibration - empirical interval coverage @@ -830,7 +833,9 @@ plot_pairwise_comparison <- function(comparison_result, #' @importFrom stats density #' @return vector with the scoring values #' @examples -#' data.table::setDTthreads(1) # only needed to avoid issues on CRAN +#' \dontshow{ +#' data.table::setDTthreads(2) # restricts number of cores used on CRAN +#' } #' #' # PIT histogram in vector based format #' observed <- rnorm(30, mean = 1:30) diff --git a/R/score.R b/R/score.R index 380123416..f490a1d75 100644 --- a/R/score.R +++ b/R/score.R @@ -32,7 +32,9 @@ #' @importFrom stats na.omit #' @examples #' library(magrittr) # pipe operator -#' data.table::setDTthreads(1) # only needed to avoid issues on CRAN +#' \dontshow{ +#' data.table::setDTthreads(2) # restricts number of cores used on CRAN +#' } #' #' validated <- as_forecast(example_quantile) #' score(validated) %>% diff --git a/R/summarise_scores.R b/R/summarise_scores.R index ac01e017f..7c7d526b4 100644 --- a/R/summarise_scores.R +++ b/R/summarise_scores.R @@ -28,7 +28,9 @@ #' to the names of the columns of the original data specified in `by` or #' `across` using the `fun` passed to `summarise_scores()`. 
#' @examples -#' data.table::setDTthreads(1) # only needed to avoid issues on CRAN +#' \dontshow{ +#' data.table::setDTthreads(2) # restricts number of cores used on CRAN +#' } #' library(magrittr) # pipe operator #' \dontrun{ #' scores <- score(example_continuous) diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 000000000..1c6f431e7 --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,9 @@ +.onAttach <- function(libname, pkgname) { + packageStartupMessage( + "Note: scoringutils is currently undergoing major development changes ", + "(with an update planned for the first quarter of 2024). We would very ", + "much appreciate your opinions and feedback on what should be included in ", + "this major update: ", + "https://github.com/epiforecasts/scoringutils/discussions/333" + ) +} diff --git a/README.Rmd b/README.Rmd index 0e4644d62..8de076935 100644 --- a/README.Rmd +++ b/README.Rmd @@ -30,7 +30,7 @@ library(knitr) The `scoringutils` package provides a collection of metrics and proper scoring rules and aims to make it simple to score probabilistic forecasts against observed values. -You can find additional information and examples in the papers [Evaluating Forecasts with scoringutils in R](https://arxiv.org/abs/2205.07090) [Scoring epidemiological forecasts on transformed scales](https://www.medrxiv.org/content/10.1101/2023.01.23.23284722v1) as well as the Vignettes ([Getting started](https://epiforecasts.io/scoringutils/articles/scoringutils.html), [Details on the metrics implemented](https://epiforecasts.io/scoringutils/articles/metric-details.html) and [Scoring forecasts directly](https://epiforecasts.io/scoringutils/articles/scoring-forecasts-directly.html)). 
+You can find additional information and examples in the papers [Evaluating Forecasts with scoringutils in R](https://arxiv.org/abs/2205.07090) [Scoring epidemiological forecasts on transformed scales](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1011393) as well as the Vignettes ([Getting started](https://epiforecasts.io/scoringutils/articles/scoringutils.html), [Details on the metrics implemented](https://epiforecasts.io/scoringutils/articles/metric-details.html) and [Scoring forecasts directly](https://epiforecasts.io/scoringutils/articles/scoring-forecasts-directly.html)). The `scoringutils` package offers convenient automated forecast evaluation through the function `score()`. The function operates on data.frames (it uses `data.table` internally for speed and efficiency) and can easily be integrated in a workflow based on `dplyr` or `data.table`. It also provides experienced users with a set of reliable lower-level scoring metrics operating on vectors/matrices they can build upon in other applications. In addition it implements a wide range of flexible plots designed to cover many use cases. diff --git a/README.md b/README.md index 62867f902..b1d3dcd4e 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ You can find additional information and examples in the papers [Evaluating Forecasts with scoringutils in R](https://arxiv.org/abs/2205.07090) [Scoring epidemiological forecasts on transformed -scales](https://www.medrxiv.org/content/10.1101/2023.01.23.23284722v1) +scales](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1011393) as well as the Vignettes ([Getting started](https://epiforecasts.io/scoringutils/articles/scoringutils.html), [Details on the metrics @@ -143,17 +143,20 @@ example_quantile %>% digits = 2 ) %>% kable() +#> Some rows containing NA values may be removed. This is fine if not unexpected. +#> Some rows containing NA values may be removed. This is fine if not unexpected. 
+#> Some rows containing NA values may be removed. This is fine if not unexpected. ``` -| model | target_type | wis | overprediction | underprediction | dispersion | bias | coverage_50 | coverage_90 | coverage_deviation | ae_median | relative_skill | scaled_rel_skill | -|:----------------------|:------------|------:|---------------:|----------------:|-----------:|--------:|------------:|------------:|-------------------:|----------:|---------------:|-----------------:| -| EuroCOVIDhub-baseline | Cases | 28000 | 14000.0 | 10000.0 | 4100 | 0.0980 | 0.33 | 0.82 | -0.120 | 38000 | 1.30 | 1.6 | -| EuroCOVIDhub-baseline | Deaths | 160 | 66.0 | 2.1 | 91 | 0.3400 | 0.66 | 1.00 | 0.120 | 230 | 2.30 | 3.8 | -| EuroCOVIDhub-ensemble | Cases | 18000 | 10000.0 | 4200.0 | 3700 | -0.0560 | 0.39 | 0.80 | -0.100 | 24000 | 0.82 | 1.0 | -| EuroCOVIDhub-ensemble | Deaths | 41 | 7.1 | 4.1 | 30 | 0.0730 | 0.88 | 1.00 | 0.200 | 53 | 0.60 | 1.0 | -| UMass-MechBayes | Deaths | 53 | 9.0 | 17.0 | 27 | -0.0220 | 0.46 | 0.88 | -0.025 | 78 | 0.75 | 1.3 | -| epiforecasts-EpiNow2 | Cases | 21000 | 12000.0 | 3300.0 | 5700 | -0.0790 | 0.47 | 0.79 | -0.070 | 28000 | 0.95 | 1.2 | -| epiforecasts-EpiNow2 | Deaths | 67 | 19.0 | 16.0 | 32 | -0.0051 | 0.42 | 0.91 | -0.045 | 100 | 0.98 | 1.6 | +| model | target_type | wis | overprediction | underprediction | dispersion | bias | interval_coverage_50 | interval_coverage_90 | interval_coverage_deviation | ae_median | relative_skill | scaled_rel_skill | +|:----------------------|:------------|------:|---------------:|----------------:|-----------:|--------:|---------------------:|---------------------:|----------------------------:|----------:|---------------:|-----------------:| +| EuroCOVIDhub-baseline | Cases | 28000 | 14000.0 | 10000.0 | 4100 | 0.0980 | 0.33 | 0.82 | -0.120 | 38000 | 1.30 | 1.6 | +| EuroCOVIDhub-baseline | Deaths | 160 | 66.0 | 2.1 | 91 | 0.3400 | 0.66 | 1.00 | 0.120 | 230 | 2.30 | 3.8 | +| EuroCOVIDhub-ensemble | Cases | 18000 | 10000.0 
| 4200.0 | 3700 | -0.0560 | 0.39 | 0.80 | -0.100 | 24000 | 0.82 | 1.0 | +| EuroCOVIDhub-ensemble | Deaths | 41 | 7.1 | 4.1 | 30 | 0.0730 | 0.88 | 1.00 | 0.200 | 53 | 0.60 | 1.0 | +| UMass-MechBayes | Deaths | 53 | 9.0 | 17.0 | 27 | -0.0220 | 0.46 | 0.88 | -0.025 | 78 | 0.75 | 1.3 | +| epiforecasts-EpiNow2 | Cases | 21000 | 12000.0 | 3300.0 | 5700 | -0.0790 | 0.47 | 0.79 | -0.070 | 28000 | 0.95 | 1.2 | +| epiforecasts-EpiNow2 | Deaths | 67 | 19.0 | 16.0 | 32 | -0.0051 | 0.42 | 0.91 | -0.045 | 100 | 0.98 | 1.6 | `scoringutils` contains additional functionality to transform forecasts, to summarise scores at different levels, to visualise them, and to @@ -175,6 +178,7 @@ example_quantile %>% score %>% summarise_scores(by = c("model", "target_type", "scale")) %>% head() +#> Some rows containing NA values may be removed. This is fine if not unexpected. #> model target_type scale wis overprediction #> 1: EuroCOVIDhub-ensemble Cases natural 11550.70664 3650.004755 #> 2: EuroCOVIDhub-baseline Cases natural 22090.45747 7702.983696 @@ -182,20 +186,20 @@ example_quantile %>% #> 4: EuroCOVIDhub-ensemble Deaths natural 41.42249 7.138247 #> 5: EuroCOVIDhub-baseline Deaths natural 159.40387 65.899117 #> 6: UMass-MechBayes Deaths natural 52.65195 8.978601 -#> underprediction dispersion bias coverage_50 coverage_90 -#> 1: 4237.177310 3663.52458 -0.05640625 0.3906250 0.8046875 -#> 2: 10284.972826 4102.50094 0.09726562 0.3281250 0.8203125 -#> 3: 3260.355639 5664.37795 -0.07890625 0.4687500 0.7890625 -#> 4: 4.103261 30.18099 0.07265625 0.8750000 1.0000000 -#> 5: 2.098505 91.40625 0.33906250 0.6640625 1.0000000 -#> 6: 16.800951 26.87239 -0.02234375 0.4609375 0.8750000 -#> coverage_deviation ae_median -#> 1: -0.10230114 17707.95312 -#> 2: -0.11437500 32080.48438 -#> 3: -0.06963068 21530.69531 -#> 4: 0.20380682 53.13281 -#> 5: 0.12142045 233.25781 -#> 6: -0.02488636 78.47656 +#> underprediction dispersion bias interval_coverage_50 +#> 1: 4237.177310 3663.52458 -0.05640625 0.3906250 +#> 2: 
10284.972826 4102.50094 0.09726562 0.3281250 +#> 3: 3260.355639 5664.37795 -0.07890625 0.4687500 +#> 4: 4.103261 30.18099 0.07265625 0.8750000 +#> 5: 2.098505 91.40625 0.33906250 0.6640625 +#> 6: 16.800951 26.87239 -0.02234375 0.4609375 +#> interval_coverage_90 interval_coverage_deviation ae_median +#> 1: 0.8046875 -0.10230114 17707.95312 +#> 2: 0.8203125 -0.11437500 32080.48438 +#> 3: 0.7890625 -0.06963068 21530.69531 +#> 4: 1.0000000 0.20380682 53.13281 +#> 5: 1.0000000 0.12142045 233.25781 +#> 6: 0.8750000 -0.02488636 78.47656 ``` ## Citation diff --git a/data/metrics.rda b/data/metrics.rda index e378f5dde..64a94e6cf 100644 Binary files a/data/metrics.rda and b/data/metrics.rda differ diff --git a/inst/WORDLIST b/inst/WORDLIST index bf7cca877..61d2d80b9 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,16 +1,18 @@ AJ +al +Bosse Bracher CMD COVID CRPS Camacho -Comput Cori DSS Dawid ECDC Eggo EpiNow +et EuroCOVIDhub Gneiting Höhle @@ -44,9 +46,7 @@ facetted facetting frac ggplot -implict jss -matriced medRxiv metacran miscalibrated @@ -58,6 +58,7 @@ pval pvalues rel scoringRules +scoringutils standalone u underprediction diff --git a/inst/create-metric-tables.R b/inst/create-metric-tables.R index b3c5a994d..56accd9d4 100644 --- a/inst/create-metric-tables.R +++ b/inst/create-metric-tables.R @@ -49,7 +49,7 @@ log_score <- list( `C` = r"($\checkmark$)", `B` = r"($\checkmark$)", `Q` = r"($-$)", - `Properties` = "Proper scoring rule, smaller is better, only evaluates predictive density at observed value (local), penalises over-confidence severely, susceptible to outliers", + `Properties` = "Proper scoring rule, smaller is better, equals negative log of the predictive density at observed value (local), penalises over-confidence severely, susceptible to outliers", `References` = "" ) @@ -212,7 +212,7 @@ relative_skill <- list( scaled_relative_skill <- list( `Metric` = "Scaled relative skill", - `Name` = list("scaled_rel_skill"), + `Name` = "scaled_rel_skill", 
`Functions` = r"(score(), pairwise_comparison())", `D` = r"($\sim$)", `C` = r"($\sim$)", @@ -267,7 +267,7 @@ crps <- list( `Explanation` = r"(The crps is a proper scoring rule that generalises the absolute error to probabilistic forecasts. It measures the 'distance' of the predictive distribution to the observed data-generating distribution. The CRPS is given as $$\text{CRPS}(F, y) = \int_{-\infty}^\infty \left( F(x) - 1(x \geq y) \right)^2 dx,$$ where y is the observed value and F the CDF of predictive distribution. Often An alternative representation is used: - $$ \text{CRPS}(F, y) = \frac{1}{2} \mathbb{E}_{F} |X - X'| - \mathbb{E}_P |X - y|,$$ where $X$ and $X'$ are independent realisations from the predictive distributions $F$ with finite first moment and $y$ is the observed value. In this representation we can simply replace $X$ and $X'$ by samples sum over all possible combinations to obtain the CRPS. + $$ \text{CRPS}(F, y) = \frac{1}{2} \mathbb{E}_{F} |X - X'| - \mathbb{E}_P |X - y|,$$ where $X$ and $X'$ are independent realisations from the predictive distributions $F$ with finite first moment and $y$ is the observed value. In this representation we can simply replace $X$ and $X'$ by samples and sum over all possible combinations to obtain the CRPS. For integer-valued forecasts, the RPS is given as $$ \text{RPS}(F, y) = \sum_{x = 0}^\infty (F(x) - 1(x \geq y))^2. $$ @@ -278,14 +278,14 @@ crps <- list( log_score <- list( `Metric` = "Log score", - `Explanation` = r"(The Log score is a proper scoring rule that is simply computed as the log of the predictive density evaluated at the observed value. It is given as - $$ \text{log score} = \log f(y), $$ + `Explanation` = r"(The Log score is a proper scoring rule that is computed as the negative log of the predictive density evaluated at the observed value. It is given as + $$ \text{log score} = -\log f(y), $$ where $f$ is the predictive density function and y is the observed value. 
For integer-valued forecasts, the log score can be computed as - $$ \text{log score} = \log p_y, $$ + $$ \text{log score} = -\log p_y, $$ where $p_y$ is the probability assigned to outcome p by the forecast F. **Usage and caveats**: - Larger values are better, but sometimes the sign is reversed. The log score is sensitive to outliers, as individual negative log score contributions quickly can become very large if the event falls in the tails of the predictive distribution, where $f(y)$ (or $p_y$) is close to zero. Whether or not that is desirable depends ont the application. In scoringutils, the log score cannot be used for integer-valued forecasts, as the implementation requires a predictive density. In contrast to the crps, the log score is a local scoring rule: it's value only depends only on the probability that was assigned to the actual outcome. This property may be desirable for inferential purposes, for example in a Bayesian context (Winkler et al., 1996). In settings where forecasts inform decision making, it may be more appropriate to score forecasts based on the entire predictive distribution.)" + Smaller values are better, but sometimes the sign is reversed. The log score is sensitive to outliers, as individual log score contributions can become very large if the event falls in a range of the predictive distribution where $f(y)$ (or $p_y$) is close to zero. Whether or not that is desirable depends on the application. In scoringutils, the log score cannot be used for integer-valued forecasts, as the implementation requires a predictive density. In contrast to the crps, the log score is a local scoring rule: its value depends only on the probability that was assigned to the actual outcome. This property may be desirable for inferential purposes, for example in a Bayesian context (Winkler et al., 1996). 
In settings where forecasts inform decision making, it may be more appropriate to score forecasts based on the entire predictive distribution.)" ) wis <- list( diff --git a/inst/metrics-overview/metrics-detailed.Rda b/inst/metrics-overview/metrics-detailed.Rda index 69d0b6363..cf8f96832 100644 Binary files a/inst/metrics-overview/metrics-detailed.Rda and b/inst/metrics-overview/metrics-detailed.Rda differ diff --git a/man/example_binary.Rd b/man/example_binary.Rd index 47797b8cd..efd78cb9c 100644 --- a/man/example_binary.Rd +++ b/man/example_binary.Rd @@ -19,7 +19,7 @@ A data frame with 346 rows and 10 columns: } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_binary diff --git a/man/example_continuous.Rd b/man/example_continuous.Rd index 354ebc5d6..17386704d 100644 --- a/man/example_continuous.Rd +++ b/man/example_continuous.Rd @@ -20,7 +20,7 @@ A data frame with 13,429 rows and 10 columns: } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_continuous diff --git a/man/example_integer.Rd b/man/example_integer.Rd index c3c97183a..6589e3ce3 100644 --- a/man/example_integer.Rd +++ b/man/example_integer.Rd @@ -20,7 +20,7 @@ A data frame with 13,429 rows and 10 columns: } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_integer diff --git a/man/example_point.Rd 
b/man/example_point.Rd index 1eb734b76..5ce43c8fa 100644 --- a/man/example_point.Rd +++ b/man/example_point.Rd @@ -19,7 +19,7 @@ A data frame with } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_point diff --git a/man/example_quantile.Rd b/man/example_quantile.Rd index 2582907e9..dd048ec8a 100644 --- a/man/example_quantile.Rd +++ b/man/example_quantile.Rd @@ -20,7 +20,7 @@ A data frame with } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_quantile diff --git a/man/example_quantile_forecasts_only.Rd b/man/example_quantile_forecasts_only.Rd index d789ed1e0..30782740f 100644 --- a/man/example_quantile_forecasts_only.Rd +++ b/man/example_quantile_forecasts_only.Rd @@ -18,7 +18,7 @@ A data frame with 7,581 rows and 9 columns: } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_quantile_forecasts_only diff --git a/man/example_truth_only.Rd b/man/example_truth_only.Rd index f8ae05afa..ce32e8a33 100644 --- a/man/example_truth_only.Rd +++ b/man/example_truth_only.Rd @@ -15,7 +15,7 @@ A data frame with 140 rows and 5 columns: } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} 
+\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_truth_only diff --git a/man/get_forecast_counts.Rd b/man/get_forecast_counts.Rd index 1ed71aac7..52ea524e7 100644 --- a/man/get_forecast_counts.Rd +++ b/man/get_forecast_counts.Rd @@ -34,7 +34,9 @@ number of forecasts per model and location). This is useful to determine whether there are any missing forecasts. } \examples{ -data.table::setDTthreads(1) # only needed to avoid issues on CRAN +\dontshow{ + data.table::setDTthreads(2) # restricts number of cores used on CRAN +} get_forecast_counts(example_quantile, by = c("model", "target_type") diff --git a/man/pairwise_comparison.Rd b/man/pairwise_comparison.Rd index d30be1197..f95b29bbc 100644 --- a/man/pairwise_comparison.Rd +++ b/man/pairwise_comparison.Rd @@ -63,7 +63,9 @@ The implementation of the permutation test follows the function Andrea Riebler and Michaela Paul. } \examples{ -data.table::setDTthreads(1) # only needed to avoid issues on CRAN +\dontshow{ + data.table::setDTthreads(2) # restricts number of cores used on CRAN +} scores <- score(example_quantile) pairwise <- pairwise_comparison(scores, by = "target_type") diff --git a/man/pit_sample.Rd b/man/pit_sample.Rd index 11b934343..bfcf1bcc6 100644 --- a/man/pit_sample.Rd +++ b/man/pit_sample.Rd @@ -76,7 +76,9 @@ In this context it should be noted, though, that uniformity of the PIT is a necessary but not sufficient condition of calibration. 
} \examples{ -data.table::setDTthreads(1) # only needed to avoid issues on CRAN +\dontshow{ + data.table::setDTthreads(2) # restricts number of cores used on CRAN +} ## continuous predictions observed <- rnorm(20, mean = 1:20) diff --git a/man/plot_interval_coverage.Rd b/man/plot_interval_coverage.Rd index 10857763d..315a6972e 100644 --- a/man/plot_interval_coverage.Rd +++ b/man/plot_interval_coverage.Rd @@ -21,7 +21,9 @@ ggplot object with a plot of interval coverage Plot interval coverage } \examples{ -data.table::setDTthreads(1) # only needed to avoid issues on CRAN +\dontshow{ + data.table::setDTthreads(2) # restricts number of cores used on CRAN +} data_coverage <- add_coverage(example_quantile) summarised <- summarise_scores(data_coverage, by = c("model", "range")) plot_interval_coverage(summarised) diff --git a/man/plot_pit.Rd b/man/plot_pit.Rd index 3bc6ddd1a..7aafebed2 100644 --- a/man/plot_pit.Rd +++ b/man/plot_pit.Rd @@ -32,7 +32,9 @@ Make a simple histogram of the probability integral transformed values to visually check whether a uniform distribution seems likely. 
} \examples{ -data.table::setDTthreads(1) # only needed to avoid issues on CRAN +\dontshow{ + data.table::setDTthreads(2) # restricts number of cores used on CRAN +} # PIT histogram in vector based format observed <- rnorm(30, mean = 1:30) diff --git a/man/plot_score_table.Rd b/man/plot_score_table.Rd index 9984e8000..acf07cd7b 100644 --- a/man/plot_score_table.Rd +++ b/man/plot_score_table.Rd @@ -31,7 +31,9 @@ Plots a coloured table of summarised scores obtained using \examples{ library(ggplot2) library(magrittr) # pipe operator -data.table::setDTthreads(1) # only needed to avoid issues on CRAN +\dontshow{ + data.table::setDTthreads(2) # restricts number of cores used on CRAN +} scores <- score(example_quantile) \%>\% summarise_scores(by = c("model", "target_type")) \%>\% diff --git a/man/score.Rd b/man/score.Rd index b995b847f..79edbb24d 100644 --- a/man/score.Rd +++ b/man/score.Rd @@ -117,7 +117,9 @@ necessary, 'protected columns' like "predicted" or "observed" are retained. \examples{ library(magrittr) # pipe operator -data.table::setDTthreads(1) # only needed to avoid issues on CRAN +\dontshow{ + data.table::setDTthreads(2) # restricts number of cores used on CRAN +} validated <- as_forecast(example_quantile) score(validated) \%>\% diff --git a/man/scoringutils-package.Rd b/man/scoringutils-package.Rd index c6efc6415..a44ad90ef 100644 --- a/man/scoringutils-package.Rd +++ b/man/scoringutils-package.Rd @@ -6,9 +6,9 @@ \alias{scoringutils-package} \title{scoringutils: Utilities for Scoring and Assessing Predictions} \description{ -Provides a collection of metrics and proper scoring rules (Tilmann Gneiting & Adrian E Raftery (2007) \doi{10.1198/016214506000001437}, Jordan, A., Krüger, F., & Lerch, S. (2019) \doi{10.18637/jss.v090.i12}) within a consistent framework for evaluation, comparison and visualisation of forecasts. In addition to proper scoring rules, functions are provided to assess bias, sharpness and calibration (Sebastian Funk, Anton Camacho, Adam J. 
Kucharski, Rachel Lowe, Rosalind M. Eggo, W. John Edmunds (2019) \doi{10.1371/journal.pcbi.1006785}) of forecasts. Several types of predictions (e.g. binary, discrete, continuous) which may come in different formats (e.g. forecasts represented by predictive samples or by quantiles of the predictive distribution) can be evaluated. Scoring metrics can be used either through a convenient data.frame format, or can be applied as individual functions in a vector / matrix format. All functionality has been implemented with a focus on performance and is robustly tested. Find more information about scoringutils in the accompanying paper (Bosse et al., 2022) \href{https://arxiv.org/abs/2205.07090v1}{arXiv:2205.07090v1}. +Provides a collection of metrics and proper scoring rules (Tilmann Gneiting & Adrian E Raftery (2007) \doi{10.1198/016214506000001437}, Jordan, A., Krüger, F., & Lerch, S. (2019) \doi{10.18637/jss.v090.i12}) within a consistent framework for evaluation, comparison and visualisation of forecasts. In addition to proper scoring rules, functions are provided to assess bias, sharpness and calibration (Sebastian Funk, Anton Camacho, Adam J. Kucharski, Rachel Lowe, Rosalind M. Eggo, W. John Edmunds (2019) \doi{10.1371/journal.pcbi.1006785}) of forecasts. Several types of predictions (e.g. binary, discrete, continuous) which may come in different formats (e.g. forecasts represented by predictive samples or by quantiles of the predictive distribution) can be evaluated. Scoring metrics can be used either through a convenient data.frame format, or can be applied as individual functions in a vector / matrix format. All functionality has been implemented with a focus on performance and is robustly tested. Find more information about the package in the accompanying paper (\doi{10.48550/arXiv.2205.07090}). 
-Provides a collection of metrics and proper scoring rules (Tilmann Gneiting & Adrian E Raftery (2007) \doi{10.1198/016214506000001437}, Jordan, A., Krüger, F., & Lerch, S. (2019) \doi{10.18637/jss.v090.i12}) within a consistent framework for evaluation, comparison and visualisation of forecasts. In addition to proper scoring rules, functions are provided to assess bias, sharpness and calibration (Sebastian Funk, Anton Camacho, Adam J. Kucharski, Rachel Lowe, Rosalind M. Eggo, W. John Edmunds (2019) \doi{10.1371/journal.pcbi.1006785}) of forecasts. Several types of predictions (e.g. binary, discrete, continuous) which may come in different formats (e.g. forecasts represented by predictive samples or by quantiles of the predictive distribution) can be evaluated. Scoring metrics can be used either through a convenient data.frame format, or can be applied as individual functions in a vector / matrix format. All functionality has been implemented with a focus on performance and is robustly tested. Find more information about scoringutils in the accompanying paper (Bosse et al., 2022) \href{https://arxiv.org/abs/2205.07090v1}{arXiv:2205.07090v1}. +Provides a collection of metrics and proper scoring rules (Tilmann Gneiting & Adrian E Raftery (2007) \doi{10.1198/016214506000001437}, Jordan, A., Krüger, F., & Lerch, S. (2019) \doi{10.18637/jss.v090.i12}) within a consistent framework for evaluation, comparison and visualisation of forecasts. In addition to proper scoring rules, functions are provided to assess bias, sharpness and calibration (Sebastian Funk, Anton Camacho, Adam J. Kucharski, Rachel Lowe, Rosalind M. Eggo, W. John Edmunds (2019) \doi{10.1371/journal.pcbi.1006785}) of forecasts. Several types of predictions (e.g. binary, discrete, continuous) which may come in different formats (e.g. forecasts represented by predictive samples or by quantiles of the predictive distribution) can be evaluated. 
Scoring metrics can be used either through a convenient data.frame format, or can be applied as individual functions in a vector / matrix format. All functionality has been implemented with a focus on performance and is robustly tested. Find more information about the package in the accompanying paper (\doi{10.48550/arXiv.2205.07090}). } \seealso{ Useful links: diff --git a/man/summarise_scores.Rd b/man/summarise_scores.Rd index 9d6ce674a..addd49b96 100644 --- a/man/summarise_scores.Rd +++ b/man/summarise_scores.Rd @@ -45,7 +45,9 @@ to the names of the columns of the original data specified in \code{by} or Summarise scores as produced by \code{\link[=score]{score()}} } \examples{ -data.table::setDTthreads(1) # only needed to avoid issues on CRAN +\dontshow{ + data.table::setDTthreads(2) # restricts number of cores used on CRAN +} library(magrittr) # pipe operator \dontrun{ scores <- score(example_continuous) diff --git a/tests/testthat/_snaps/summarise_scores.md b/tests/testthat/_snaps/summarise_scores.md new file mode 100644 index 000000000..1ecd8e86b --- /dev/null +++ b/tests/testthat/_snaps/summarise_scores.md @@ -0,0 +1,9 @@ +# summarise_scores() metric is deprecated + + Code + x <- summarise_scores(scores, by = "model", metric = "auto", relative_skill = TRUE) + Condition + Warning: + The `metric` argument of `summarise_scores()` is deprecated as of scoringutils 1.1.0. + i Please use the `relative_skill_metric` argument instead. 
+ diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index 4fadd0921..e035f09c2 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -2,6 +2,7 @@ library(ggplot2, quietly = TRUE) library(data.table) suppressMessages(library(magrittr)) +data.table::setDTthreads(2) # restricts number of cores used on CRAN metrics_no_cov <- rules_quantile( exclude = c("interval_coverage_50", "interval_coverage_90", diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R index d5f83aeb9..7d20c03ba 100644 --- a/tests/testthat/test-convenience-functions.R +++ b/tests/testthat/test-convenience-functions.R @@ -91,3 +91,20 @@ test_that("function get_forecast_unit() and set_forecast_unit() work together", expect_equal(fu_set, fu_get) }) + +test_that("set_forecast_unit() works on input that's not a data.table", { + df <- data.frame( + a = 1:2, + b = 2:3, + c = 3:4 + ) + expect_equal( + colnames(set_forecast_unit(df, c("a", "b"))), + c("a", "b") + ) + # apparently it also works on a matrix... good to know :) + expect_equal( + names(set_forecast_unit(as.matrix(df), "a")), + "a" + ) +}) diff --git a/tests/testthat/test-metrics-quantile.R b/tests/testthat/test-metrics-quantile.R index b941a95aa..a6faf051c 100644 --- a/tests/testthat/test-metrics-quantile.R +++ b/tests/testthat/test-metrics-quantile.R @@ -601,6 +601,17 @@ test_that("interval_coverage_quantile rejects wrong inputs", { ) }) +test_that("interval_coverage_quantile throws a warning when a required quantile is not available", { + dropped_quantile_pred <- predicted[, -4] + dropped_quantiles <- quantile[-4] + expect_warning( + interval_coverage_quantile( + observed, dropped_quantile_pred, dropped_quantiles, range = 50 + ), + "To compute the interval coverage for a range of 50%, the quantiles `0.25, 0.75` are required. 
Returning `NA`" + ) +}) + # ============================================================================ # # `interval_coverage_dev_quantile` ===================================== # @@ -617,6 +628,12 @@ test_that("interval_coverage_dev_quantile works", { interval_coverage_dev_quantile(observed, predicted, quantile), manual ) + expect_warning( + interval_coverage_dev_quantile( + observed, predicted, c(quantile[-4], 0.76) + ), + "To compute inteval coverage deviation, all quantiles must form central symmetric prediction intervals. Missing quantiles: 0.24, 0.75. Returning `NA`." + ) }) diff --git a/vignettes/metric-details.Rmd b/vignettes/metric-details.Rmd index c2e0fab95..384bb71f1 100644 --- a/vignettes/metric-details.Rmd +++ b/vignettes/metric-details.Rmd @@ -42,7 +42,10 @@ data$C <- replace(data$C) data$B <- replace(data$B) data$Q <- replace(data$Q) -data[, 1:6] %>% +data <- data[, 1:6] %>% + unique() + +data %>% kbl(format = "html", escape = FALSE, align = "lccccl", diff --git a/vignettes/scoringutils.Rmd b/vignettes/scoringutils.Rmd index e5e0e9aa8..5c35cd492 100644 --- a/vignettes/scoringutils.Rmd +++ b/vignettes/scoringutils.Rmd @@ -19,6 +19,8 @@ library(magrittr) library(data.table) library(ggplot2) library(knitr) +# number of threads used for data.table computations, update as needed +data.table::setDTthreads(2) ``` The `scoringutils` package provides a collection of metrics and proper scoring rules that make it simple to score probabilistic forecasts against observed values. You can find more information in the paper [Evaluating Forecasts with scoringutils in R](https://arxiv.org/abs/2205.07090) as well as the [Metrics-Vignette](https://epiforecasts.io/scoringutils/articles/metric-details.html) and the [Scoring forecasts directly Vignette](https://epiforecasts.io/scoringutils/articles/scoring-forecasts-directly.html).