outline for over-reporting score

openpharma · Feb 27, 2024 · ae94573 · ae94573
1 parent 2f4f43a
commit ae94573
Show file tree

Hide file tree

Showing 6 changed files with 262 additions and 16 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: simaerep
 Title: Find Clinical Trial Sites Under-Reporting Adverse Events
-Version: 0.4.4
+Version: 0.4.3.900
 Authors@R: c(
     person(given = "Bjoern",
            family = "Koneswarakantha",

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,7 @@
-# simaerep 0.4.4
+# simaerep 0.5.0
 - allow flexible AE rates in data simulations
 - add vignette comparing simaerep to gsm performance
+- add over-reporting probability
 - fix dplyr warnings
 - fix warnings around ggplot and cowplot
 

diff --git a/R/simaerep.R b/R/simaerep.R
@@ -347,6 +347,7 @@ get_visit_med75 <- function(df_pat,
 #' @export
 eval_sites <- function(df_sim_sites,
                         method = "BH",
+                        under_only = TRUE,
                         ...) {
 
 
@@ -406,6 +407,17 @@ eval_sites <- function(df_sim_sites,
         prob_low_adj = p.adjust(.data$prob_low, method = method),
         prob_low_prob_ur = 1 - .data$prob_low_adj
       )
+
+    if (! under_only) {
+      df_out <- df_out %>%
+        mutate(prob_high = 1 - prob_low) %>%
+        arrange(.data$study_id, .data$prob_high) %>%
+        mutate(
+          prob_high_adj = p.adjust(.data$prob_high, method = method),
+          prob_high_prob_or = 1 - .data$prob_high_adj
+        )
+    }
+
   }
 
   return(ungroup(df_out))

diff --git a/R/simaerep_plot.R b/R/simaerep_plot.R
@@ -440,7 +440,9 @@ plot_study <- function(df_visit,
                        study,
                        df_al = NULL,
                        n_sites = 16,
-                       pval = FALSE) {
+                       pval = FALSE,
+                       prob_col = "prob_low_prob_ur") {
+
   # TODO: parametrize scores, fix legend
 
   df_visit <- check_df_visit(df_visit)
@@ -496,18 +498,18 @@ plot_study <- function(df_visit,
   # ordered sites -------------------------------------------------------------
 
   n_site_ur_gr_0p5 <- df_eval %>%
-    filter(.data$prob_low_prob_ur > 0.5) %>%
+    filter(.data[[prob_col]] > 0.5) %>%
     nrow()
 
   if (n_site_ur_gr_0p5 > 0) {
     sites_ordered <- df_eval %>%
-      arrange(.data$study_id, desc(.data$prob_low_prob_ur)) %>%
-      filter(.data$prob_low_prob_ur > 0.5) %>%
+      arrange(.data$study_id, desc(.data[[prob_col]])) %>%
+      filter(.data[[prob_col]] > 0.5) %>%
       head(n_sites) %>%
       .$site_number
   } else {
     sites_ordered <- df_eval %>%
-      arrange(.data$study_id, desc(.data$prob_low_prob_ur)) %>%
+      arrange(.data$study_id, desc(.data[[prob_col]])) %>%
       head(6) %>%
       .$site_number
   }
@@ -564,16 +566,16 @@ plot_study <- function(df_visit,
   # define score cut-offs + labels----------------------------------------------
 
   palette <- RColorBrewer::brewer.pal(9, "Blues")[c(3, 5, 7, 9)]
-  breaks <- c(0, 0.5, 0.75, 0.95, ifelse(max(df_eval$prob_low_prob_ur, na.rm = TRUE) > 0.95,
-                                    max(df_eval$prob_low_prob_ur, na.rm = TRUE) + 0.1,
+  breaks <- c(0, 0.5, 0.75, 0.95, ifelse(max(df_eval[[prob_col]], na.rm = TRUE) > 0.95,
+                                    max(df_eval[[prob_col]], na.rm = TRUE) + 0.1,
                                     NA)
               )
 
   breaks <- breaks[! is.na(breaks)]
 
   df_eval <- df_eval %>%
     mutate(
-      prob_cut = cut(.data$prob_low_prob_ur, breaks = breaks, include.lowest = TRUE),
+      prob_cut = cut(.data[[prob_col]], breaks = breaks, include.lowest = TRUE),
       color_prob_cut = palette[as.numeric(.data$prob_cut)]
     )
 
@@ -617,19 +619,19 @@ plot_study <- function(df_visit,
       , by = c("study_id", "site_number")
     ) %>%
     left_join(df_alert, by = c("study_id", "site_number")) %>%
-    select(c(
+    select(any_of(c(
       "study_id",
       "site_number",
       "visit",
       "mean_ae",
       "n_pat",
       "n_pat_with_med75",
-      "prob_low_prob_ur",
+      prob_col,
       "prob_cut",
       "color_prob_cut",
       "pval_prob_ur",
       "alert_level_site"
-    )) %>%
+    ))) %>%
     mutate(
       site_number = fct_relevel(.data$site_number, sites_ordered),
       color_alert_level = case_when(
@@ -739,7 +741,7 @@ plot_study <- function(df_visit,
               x = 0.2 * max_visit,
               y = 0.9 * max_ae,
               na.rm = TRUE) +
-    geom_label(aes(label = paste(round(prob_low_prob_ur, 3) * 100, "%"),
+    geom_label(aes(label = paste(round(.data[[prob_col]], 3) * 100, "%"),
                    color = color_prob_cut),
               data = df_label,
               x = 0.8 * max_visit,

diff --git a/man/simaerep.Rd b/man/simaerep.Rd
diff --git a/vignettes/over.Rmd b/vignettes/over.Rmd
@@ -0,0 +1,231 @@
+---
+title: "Over-Reporting Probability"
+output:
+  html_document:
+    toc: true
+    toc_depth: 3
+    toc_float: true
+    number_sections: true
+    code_folding: show
+    collapse: false
+editor_options:
+  chunk_output_type: console
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, cache = FALSE, fig.width = 10)
+```
+
+# Load
+```{r}
+suppressPackageStartupMessages(library(dplyr))
+suppressPackageStartupMessages(library(tidyr))
+suppressPackageStartupMessages(library(simaerep))
+suppressPackageStartupMessages(library(ggExtra))
+```
+
+# Introduction
+
+{siamerep} was originally created to detect under-reporting of AEs and therefore no over-reporting probability was calculated. Nevertheless {simaerep} can theoretically be used to simulate all kinds of subject-based clinical events, for some such as issues over-reporting can represent a quality issue.  With the recent release `0.5.0` we have added the option to calculate an over-reporting probability score.
+
+# Data Set
+
+We simulate a standard data set with a high number of sites, patients, visits and events to ensure that most of our dimensions will be normally distributed. We do not add any over- or under-reporting sites at this point.
+
+```{r}
+df_visit <- sim_test_data_study(
+ n_pat = 10000,
+ n_sites = 1000,
+ frac_site_with_ur = 0,
+ max_visit_mean = 100,
+ max_visit_sd = 1,
+ ae_per_visit_mean = 5
+)
+
+df_visit$study_id <- "A"
+```
+
+# Run {simaerep}
+
+in order to add the over-reporting probability score we need to set the parameter `under_only = FALSE`.
+
+```{r}
+system.time(
+  aerep_def <- simaerep(
+    df_visit,
+    param_sim_sites = list(under_only = TRUE),
+    param_eval_sites = list(under_only = TRUE)
+  )
+)
+```
+
+The original setting skips the simulation for all sites that do have more AEs than the study average.
+
+```{r}
+system.time(
+  aerep_ovr <- simaerep(
+    df_visit,
+    param_sim_sites = list(under_only = FALSE),
+    param_eval_sites = list(under_only = FALSE)
+  )
+)
+```
+
+The new parameter calculates the probability of a site getting `a lower or equal` average AE count for the site visit_med75 for every site, regardless of how its initial value compares to the study average. The calculation only takes a few seconds longer than the default setting.
+
+In the evaluation data frame we have three more columns available now.
+
+```{r}
+setdiff(colnames(aerep_ovr$df_eval), colnames(aerep_def$df_eval))
+```
+
+# Analyze
+
+## Probability getting a lower or equal AE count
+
+
+```{r}
+
+cols <- c("study_id", "site_number", "mean_ae_site_med75", "mean_ae_study_med75", "prob_low")
+
+p <- bind_rows(
+  select(
+      aerep_ovr$df_eval,
+      all_of(cols)
+    ) %>%
+    mutate(type = "with over-reporting"),
+    select(
+      aerep_def$df_eval,
+      all_of(cols)
+    ) %>%
+    mutate(type = "default")
+  ) %>%
+  ggplot(aes(x = mean_ae_site_med75 - mean_ae_study_med75, y = prob_low, color = type)) +
+    geom_point(alpha = 0.5) +
+    theme(legend.position = "bottom") +
+    scale_color_manual(values = c("gold", "blue")) +
+    labs(y = "Probability of getting a lower or equal mean site AE in a 1000x simulation")
+
+ggExtra::ggMarginal(p, groupColour = TRUE, type = "density")
+```
+
+
+We can see that we have a gap for the default setting in the generated probabilities. The values filling the gap can be interpreted as the probability of having a `higher` site average than originally observed.
+
+We need to be aware that values in the simulation `equal` to the site mean will count towards the under-reporting probability. This can potentially lead to artifacts with low AE counts.
+
+## Over-Reporting
+
+We can add the over-reporting probability as (1- under-reporting probability)
+
+```{r}
+
+cols <- c("study_id", "site_number", "mean_ae_site_med75", "mean_ae_study_med75")
+
+
+p <- bind_rows(
+  select(
+      aerep_ovr$df_eval,
+      all_of(cols),
+      value = "prob_low"
+    ) %>%
+    mutate(type = "new under-reporting"),
+  select(
+      aerep_ovr$df_eval,
+      all_of(cols),
+      value = "prob_high"
+    ) %>%
+    mutate(type = "new over-reporting"),
+    select(
+      aerep_def$df_eval,
+      all_of(cols),
+      value = "prob_low"
+    ) %>%
+    mutate(type = "default under-reporting")
+  ) %>%
+  ggplot(aes(x = mean_ae_site_med75 - mean_ae_study_med75, y = value, color = type)) +
+    geom_point(alpha = 0.25) +
+    theme(legend.position = "bottom") +
+    scale_color_manual(values = c("gold", "purple", "blue")) +
+    labs(y = "Probability of getting a lower or equal mean site AE in a 1000x simulation")
+
+ggExtra::ggMarginal(p, groupColour = TRUE, type = "density")
+```
+
+## Multiplicity Correction
+
+The multiplicity correction dampens the signal, avoiding false positives that are the result of chance.
+
+```{r}
+
+cols <- c("study_id", "site_number", "mean_ae_site_med75", "mean_ae_study_med75")
+
+
+p <- bind_rows(
+  select(
+      aerep_ovr$df_eval,
+      all_of(cols),
+      value = "prob_low_prob_ur"
+    ) %>%
+    mutate(type = "new under-reporting"),
+  select(
+      aerep_ovr$df_eval,
+      all_of(cols),
+      value = "prob_high_prob_or"
+    ) %>%
+    mutate(type = "new over-reporting"),
+    select(
+      aerep_def$df_eval,
+      all_of(cols),
+      value = "prob_low_prob_ur"
+    ) %>%
+    mutate(type = "default under-reporting")
+  ) %>%
+  ggplot(aes(x = mean_ae_site_med75 - mean_ae_study_med75, y = value, color = type)) +
+    geom_point(alpha = 0.25) +
+    theme(legend.position = "bottom") +
+    scale_color_manual(values = c("gold", "purple", "blue")) +
+    labs(y = "Probability of getting a lower or equal mean site AE in a 1000x simulation")
+
+ggExtra::ggMarginal(p, groupColour = TRUE, type = "density")
+```
+
+# Simulating Over-Reporting
+
+We can simulate under-reporting by supplying a negative ratio for `ur_rate`
+
+```{r}
+df_visit <- sim_test_data_study(
+ frac_site_with_ur = 0.05,
+ ur_rate = - 0.5,
+)
+
+df_visit$study_id <- "A"
+
+
+distinct(df_visit, site_number, is_ur, ae_per_visit_mean)
+```
+
+
+```{r}
+aerep <- simaerep(
+    df_visit,
+    param_sim_sites = list(under_only = FALSE),
+    param_eval_sites = list(under_only = FALSE)
+  )
+```
+
+```{r}
+aerep$df_eval %>%
+  select(site_number, mean_ae_site_med75, mean_ae_study_med75, prob_low_prob_ur, prob_high_prob_or)
+```
+
+We can plot over-reporting by changing setting `prob_col = "prob_high_prob_or"`.
+
+```{r}
+plot(aerep, prob_col = "prob_high_prob_or")
+
+plot(aerep, prob_col = "prob_low_prob_ur")
+
+```
+