Merge pull request #64 from openpharma/dev

#62 and snowflake compat…
openpharma · Sep 25, 2024 · 5bdd121 · 5bdd121
2 parents acac522 + ec5c545
commit 5bdd121
Show file tree

Hide file tree

Showing 11 changed files with 50 additions and 201 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -8,7 +8,6 @@ export("%>%")
 export(aggr_duplicated_visits)
 export(check_df_visit)
 export(eval_sites)
-export(eval_sites_deprecated)
 export(exp_implicit_missing_visits)
 export(get_config)
 export(get_ecd_values)

diff --git a/NEWS.md b/NEWS.md
@@ -3,6 +3,7 @@
 - https://github.com/openpharma/simaerep/issues/59
 - https://github.com/openpharma/simaerep/issues/16
 - update vignettes
+- https://github.com/openpharma/simaerep/issues/62
 
 # simaerep 0.5.0
 - allow flexible AE rates in data simulations

diff --git a/R/S3_simaerep.R b/R/S3_simaerep.R
@@ -76,6 +76,7 @@ validate_simaerep <- function(x) {
 #'  Default: TRUE.
 #'@param inframe Logical, only table operations to be used; does not require
 #'  visit_med75. Compatible with dbplyr supported database backends.
+#'@param mult_corr Logical, multiplicity correction, Default: TRUE
 #'@param param_site_aggr List of parameters passed to [site_aggr()]. Default:
 #'  list(method = "med75_adj", min_pat_pool = 0.2).
 #'@param param_sim_sites List of parameters passed to [sim_sites()]. Default:
@@ -133,6 +134,7 @@ simaerep <- function(df_visit,
                      visit_med75 = TRUE,
                      inframe = FALSE,
                      progress = TRUE,
+                     mult_corr = TRUE,
                      param_site_aggr = list(
                        method = "med75_adj",
                        min_pat_pool = 0.2
@@ -168,6 +170,10 @@ simaerep <- function(df_visit,
   param_sim_sites$under_only <- under_only
   param_eval_sites$under_only <- under_only
 
+  if (! mult_corr) {
+    param_eval_sites$method <- NA
+  }
+
   if (visit_med75 && ! inframe) {
 
     if (r != param_sim_sites$r) {

diff --git a/R/inframe.R b/R/inframe.R
@@ -137,7 +137,23 @@ sim_inframe <- function(df_visit, r = 1000, df_site = NULL) {
     mutate(
       # we generate a random number between 1 and the maximum number
       # of eligible patients
-      rnd = runif(n()),
+      rnd = runif(n())
+    )
+
+  if (inherits(df_sim_prep, "tbl_Snowflake")) {
+    # snowflake RANDOM() works differently than other backends and returns large
+    # positive and negative integers. We normalize by using the min and max values
+    # of those 64bit integers. Source for integer values is ChatGPT.
+    # for a range from -5 to 5: 4 -> 0.0  -4 -> 0.1, with underlying formula
+
+    df_sim_prep <- df_sim_prep %>%
+      mutate(
+        rnd = (.data$rnd - (-9223372036854775808)) / (9223372036854775807 - (-9223372036854775808))
+      )
+  }
+
+  df_sim_prep <- df_sim_prep %>%
+    mutate(
       # transform to values between 1 and max number of patients
       rnd_rwn = floor(.data$rnd * .data$max_rwn) + 1
     )

diff --git a/R/simaerep.R b/R/simaerep.R
@@ -380,9 +380,6 @@ get_visit_med75 <- function(df_pat,
 #' df_eval <- eval_sites(df_sim_sites)
 #' df_eval
 #'
-#' # use deprecated method  -------
-#' df_eval <- eval_sites(df_sim_sites, method = NULL, r_sim_sites = 100)
-#' df_eval
 #' @rdname eval_sites
 #' @seealso \code{\link[simaerep]{site_aggr}},
 #' \code{\link[simaerep]{sim_sites}},
@@ -393,13 +390,6 @@ eval_sites <- function(df_sim_sites,
                         under_only = TRUE,
                         ...) {
 
-
-  if (is.null(method)) {
-    warning("using deprecated method for probability adjustment")
-    df_out <- eval_sites_deprecated(df_sim_sites, ...)
-    return(df_out)
-  }
-
   df_out <- df_sim_sites
 
   if ("pval" %in% colnames(df_out)) {
@@ -458,6 +448,10 @@ eval_sites <- function(df_sim_sites,
 #'@keywords internal
 p_adjust <- function(df, col, suffix, method = "BH") {
 
+  if (is.na(method) || is.null(method) || method %in% c("None", "none")) {
+    return(df)
+  }
+
   if (inherits(df, "data.frame")) {
 
   col_adj <- paste0(col, "_adj")
@@ -515,111 +509,6 @@ get_any_na <- function(df, col) {
   })
 }
 
-#' @title Evaluate sites.
-#' @description Correct under-reporting probabilities by the expected number of
-#'   false positives (fp). This has been deprecated in favor of more conventional
-#'   methods available via \code{\link[stats]{p.adjust}}.
-#' @param df_sim_sites dataframe generated by [sim_sites()][sim_sites()]
-#' @param r_sim_sites integer, number of repeats for bootstrap resampling for site
-#'   simulation, needed for zero probability correction for fp calculation, Default: 1000
-#' @return dataframe with the following columns:
-#' \describe{
-#'   \item{**study_id**}{study identification}
-#'   \item{**site_number**}{site identification}
-#'   \item{**visit_med75**}{median(max(visit)) * 0.75}
-#'   \item{**mean_ae_site_med75**}{mean AE at visit_med75 site level}
-#'   \item{**mean_ae_study_med75**}{mean AE at visit_med75 study level}
-#'   \item{**pval**}{p-value as returned by `poisson.test`}
-#'   \item{**prob_low**}{bootstrapped probability for having mean_ae_site_med75 or lower}
-#'   \item{**n_site**}{number of study sites}
-#'   \item{**pval_n_detected**}{sites with the same p-value or lower}
-#'   \item{**pval_fp**}{expected number of fp, pval * n_site}
-#'   \item{**pval_p_vs_fp_ratio**}{odds under-reporting as p/fp, poisson.test (use as benchmark)}
-#'   \item{**pval_prob_ur**}{probability under-reporting as 1 - fp/p, poisson.test (use as benchmark)}
-#'   \item{**prob_low_n_detected**}{sites with same bootstrapped probability or lower}
-#'   \item{**prob_low_fp**}{expected number of fp, prob_lower * n_site}
-#'   \item{**prob_low_p_vs_fp_ratio**}{odds under-reporting as p/fp, bootstrapped (use)}
-#'   \item{**prob_low_prob_ur**}{probability under-reporting as 1 - fp/p, bootstrapped (use)}
-#' }
-#' @details If by chance expected number of
-#'   false positives (fp) is greater than the total number of positives (p) we
-#'   set p_vs_fp_ratio = 1 and prob_ur = 0.
-#' @examples
-#' df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5,
-#'     frac_site_with_ur = 0.4, ur_rate = 0.6)
-#'
-#' df_visit$study_id <- "A"
-#' df_site <- site_aggr(df_visit)
-#'
-#' df_sim_sites <- sim_sites(df_site, df_visit, r = 100)
-#'
-#' df_eval <- eval_sites_deprecated(df_sim_sites, r_sim_sites = 100)
-#' df_eval
-#' @rdname eval_sites_deprecated
-#' @seealso [site_aggr()][site_aggr()], [sim_sites()][sim_sites()]
-#' @export
-eval_sites_deprecated <- function(df_sim_sites,
-                                  r_sim_sites) {
-
-
-  df_out <- df_sim_sites
-
-  if ("pval" %in% names(df_out)) {
-
-    if (anyNA(df_out$pval)) {
-      warning("pval column contains NA")
-    }
-
-    df_out <- df_out %>%
-      group_by(.data$study_id) %>%
-      arrange(.data$study_id, .data$pval) %>%
-      mutate(
-        n_site = n(),
-        min_pval = min(.data$pval[.data$pval > 0]),
-        # we need to use ties.method max because there can be more than one site with a
-        # specific p value
-        pval_n_detected = rank(.data$pval, ties.method = "max", na.last = "keep"),
-        pval_fp = .data$pval * .data$n_site,
-        pval_fp = ifelse(.data$pval_fp == 0, .data$min_pval / 10, .data$pval_fp),
-        pval_p_vs_fp_ratio = .data$pval_n_detected / .data$pval_fp,
-        pval_p_vs_fp_ratio = ifelse(is.na(.data$pval_p_vs_fp_ratio) & ! is.na(.data$pval),
-                                    0, .data$pval_p_vs_fp_ratio),
-        # it is possible to get ratios lower than one if p < expected fp
-        # this can happen by chance and is meaningless
-        pval_p_vs_fp_ratio = ifelse(.data$pval_p_vs_fp_ratio < 1, 1, .data$pval_p_vs_fp_ratio),
-        pval_prob_ur = 1 - 1 / .data$pval_p_vs_fp_ratio
-      ) %>%
-      select(- "min_pval")
-  }
-
-  if ("prob_low" %in% names(df_out)) {
-
-    if (anyNA(df_out$pval)) {
-      warning("prob_lower column contains NA")
-    }
-
-    df_out <- df_out %>%
-      group_by(.data$study_id) %>%
-      arrange(.data$study_id, .data$prob_low) %>%
-      mutate(
-        n_site = n(),
-        # we need to use ties.method max because there can be more than one site with a
-        # specific prob_low value
-        prob_low_n_detected = rank(.data$prob_low, ties.method = "max", na.last = "keep"),
-        prob_low_fp = .data$prob_low * .data$n_site,
-        prob_low_fp = ifelse(.data$prob_low_fp == 0, 1 / r_sim_sites / 10, .data$prob_low_fp),
-        prob_low_p_vs_fp_ratio = .data$prob_low_n_detected / .data$prob_low_fp,
-        prob_low_p_vs_fp_ratio = ifelse(is.na(.data$prob_low_p_vs_fp_ratio) & ! is.na(.data$prob_low),
-                                        0, .data$prob_low_p_vs_fp_ratio),
-        # it is possible to get ratios lower than one if p < expected fp
-        # this can happen by chance and is meaningless
-        prob_low_p_vs_fp_ratio = ifelse(.data$prob_low_p_vs_fp_ratio < 1, 1, .data$prob_low_p_vs_fp_ratio),
-        prob_low_prob_ur = 1 - 1 / .data$prob_low_p_vs_fp_ratio
-      )
-  }
-
-  return(ungroup(df_out))
-}
 
 
 

diff --git a/man/eval_sites.Rd b/man/eval_sites.Rd
diff --git a/man/eval_sites_deprecated.Rd b/man/eval_sites_deprecated.Rd
diff --git a/man/simaerep.Rd b/man/simaerep.Rd
diff --git a/tests/testthat/test_S3_simaerep.R b/tests/testthat/test_S3_simaerep.R
@@ -90,3 +90,21 @@ test_that("plot.simaerep throws error when original visit data cannot be retriev
     "ggplot"
   )
 })
+
+test_that("simaerep() with mult_corr = FALSE must not return adjusted probabilities", {
+
+  aerep <- simaerep(df_visit_test, mult_corr = FALSE)
+  expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval))
+
+  aerep <- simaerep(df_visit_test, mult_corr = FALSE, under_only = FALSE)
+  expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval))
+  expect_true(! "prob_high_prob_or" %in% colnames(aerep$df_eval))
+
+  aerep <- simaerep(df_visit_test, mult_corr = FALSE, inframe = TRUE)
+  expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval))
+
+  aerep <- simaerep(df_visit_test, mult_corr = FALSE, under_only = FALSE, inframe = TRUE)
+  expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval))
+  expect_true(! "prob_high_prob_or" %in% colnames(aerep$df_eval))
+
+})
diff --git a/tests/testthat/test_eval_sites.R b/tests/testthat/test_eval_sites.R
@@ -52,23 +52,3 @@ test_that("eval_sites() - check column names of returned data frame", {
       all()
   )
 })
-
-test_that(
-  paste(
-    "eval_sites() with method = NULL uses deprecated method for p value correction.",
-    "Must throw warning.",
-    "Stats from sim_sites() must negatively correlate with p/fp ratio.",
-    "p/fp ratio must not be smaller 1."
-  ), {
-
-  expect_warning({
-    df_eval <- eval_sites(df_sim_sites_test, method = NULL, r_sim_sites = 100)
-  })
-
-  expect_true(cor(df_eval$prob_low, df_eval$prob_low_p_vs_fp_ratio) < 0)
-  expect_true(cor(df_eval$pval, df_eval$pval_p_vs_fp_ratio) < 0)
-
-  expect_true(! any(df_eval$prob_low_p_vs_fp_ratio < 1))
-  expect_true(! any(df_eval$pval_p_vs_fp_ratio < 1))
-
-})
diff --git a/vignettes/performance.Rmd b/vignettes/performance.Rmd
@@ -369,7 +369,7 @@ in past experiments.
 - multiplicity correction imposes a penalty on the true positive rate
 
 > This observation was already made by the Boeringer Ingelheim Team during the evaluation of {simaerep}. We can 
-now reporducibly confirm this. The unaltered probability score as returned by the bootstrap algorithm already provides
+now reproducibly confirm this. The unaltered probability score as returned by the bootstrap algorithm already provides
 very realistic under-reporting probabilities.
 
 - {simaerep} outperforms simpler methods such as funnel plot and box plot outlier detection.