diff --git a/NEWS.md b/NEWS.md
index 060b7563..a4899aa6 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -88,6 +88,7 @@ of our [original](https://doi.org/10.48550/arXiv.2205.07090) `scoringutils` pape
 ### Function changes
 - `bias_quantile()` changed the way it handles forecasts where the median is missing: The median is now imputed by linear interpolation between the innermost quantiles. Previously, we imputed the median by simply taking the mean of the innermost quantiles.
 - In contrast to the previous `correlation` function, `get_correlations` doesn't round correlations by default. Instead, `plot_correlations` now has a `digits` argument that allows users to round correlations before plotting them. Alternatively, using `dplyr`, you could call something like `mutate(correlations, across(where(is.numeric), \(x) signif(x, digits = 2)))` on the output of `get_correlations`. 
+- `wis()` now errors by default if not all quantile levels form valid prediction intervals and returns `NA` if there are missing values. Previously, `na.rm` was set to `TRUE` by default, which could lead to unexpected results, if users are not aware of this.
 
 ### Internal package updates
 - The deprecated `..density..` was replaced with `after_stat(density)` in ggplot calls.
diff --git a/R/metrics-quantile.R b/R/metrics-quantile.R
index 096f6406..ddb14f35 100644
--- a/R/metrics-quantile.R
+++ b/R/metrics-quantile.R
@@ -180,10 +180,31 @@ wis <- function(observed,
                 separate_results = FALSE,
                 weigh = TRUE,
                 count_median_twice = FALSE,
-                na.rm = TRUE) {
+                na.rm = FALSE) {
   assert_input_quantile(observed, predicted, quantile_level)
   reformatted <- quantile_to_interval(observed, predicted, quantile_level)
 
+  # check that all quantile levels form valid prediction intervals
+  interval_ranges <- get_range_from_quantile(
+    quantile_level[quantile_level != 0.5]
+  )
+  complete_intervals <-
+    duplicated(interval_ranges) | duplicated(interval_ranges, fromLast = TRUE)
+  if (!all(complete_intervals) && !isTRUE(na.rm)) {
+    #nolint start: keyword_quote_linter object_usage_linter
+    incomplete <- quantile_level[quantile_level != 0.5][!complete_intervals]
+    cli_abort(
+      c(
+        "!" = "Not all quantile levels specified form symmetric prediction
+        intervals.
+        The following quantile levels miss a corresponding lower/upper bound:
+        {.val {incomplete}}.
+        You can drop incomplete prediction intervals using `na.rm = TRUE`."
+      )
+    )
+    #nolint end
+  }
+
   assert_logical(separate_results, len = 1)
   assert_logical(weigh, len = 1)
   assert_logical(count_median_twice, len = 1)
diff --git a/man/wis.Rd b/man/wis.Rd
index f5785fa8..5452dd23 100644
--- a/man/wis.Rd
+++ b/man/wis.Rd
@@ -14,7 +14,7 @@ wis(
   separate_results = FALSE,
   weigh = TRUE,
   count_median_twice = FALSE,
-  na.rm = TRUE
+  na.rm = FALSE
 )
 
 dispersion_quantile(observed, predicted, quantile_level, ...)
diff --git a/tests/testthat/test-class-forecast-quantile.R b/tests/testthat/test-class-forecast-quantile.R
index 87d746bf..ecff751f 100644
--- a/tests/testthat/test-class-forecast-quantile.R
+++ b/tests/testthat/test-class-forecast-quantile.R
@@ -300,23 +300,29 @@ test_that("score() works even if only some quantiles are missing", {
     ))
   )
 
-
   # asymmetric intervals
   asymm <- example_quantile[!quantile_level > 0.6]
+  metrics <- get_metrics(example_quantile)
+  metrics$wis <- purrr::partial(wis, na.rm = TRUE)
   expect_warning(
     expect_warning(
-      score_a <- score(asymm) %>% summarise_scores(by = "model"),
+      score_a <- score(asymm, metrics = metrics) %>%
+        summarise_scores(by = "model"),
       "Computation for `interval_coverage_50` failed."
     ),
     "Computation for `interval_coverage_90` failed."
   )
 
+  # expect a failure with the regular wis wihtout ma.rm=TRUE
+  expect_warning(
+    score(asymm, metrics = c(wis = wis)),
+    "Not all quantile levels specified form symmetric prediction intervals."
+  )
+
   # check that the result is equal to a case where we discard the entire
   # interval in terms of WIS
   inner <- example_quantile[quantile_level %in% c(0.4, 0.45, 0.5, 0.55, 0.6)]
-  score_b <- score(inner, get_metrics(
-    inner, exclude = c("interval_coverage_50", "interval_coverage_90")
-  )) %>%
+  score_b <- score(inner, metrics = c(wis = wis)) %>%
     summarise_scores(by = "model")
   expect_equal(
     score_a$wis,