From ee1966a0f48a735953a7c579098e30d1e20ff82a Mon Sep 17 00:00:00 2001
From: Bjoern Koneswarakantha <erblast2001@gmx.de>
Date: Wed, 25 Sep 2024 16:06:26 +0200
Subject: [PATCH 1/2] https://github.com/openpharma/simaerep/issues/62 and
 snowflake compatibility

---
 NAMESPACE                         |   1 -
 NEWS.md                           |   1 +
 R/S3_simaerep.R                   |   6 ++
 R/inframe.R                       |  18 ++++-
 R/simaerep.R                      | 119 +-----------------------------
 man/eval_sites.Rd                 |   3 -
 man/eval_sites_deprecated.Rd      |  60 ---------------
 man/simaerep.Rd                   |   3 +
 tests/testthat/test_S3_simaerep.R |  18 +++++
 tests/testthat/test_eval_sites.R  |  19 -----
 vignettes/performance.Rmd         |   2 +-
 11 files changed, 50 insertions(+), 200 deletions(-)
 delete mode 100644 man/eval_sites_deprecated.Rd

diff --git a/NAMESPACE b/NAMESPACE
index c11f5e9..8912a99 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,7 +8,6 @@ export("%>%")
 export(aggr_duplicated_visits)
 export(check_df_visit)
 export(eval_sites)
-export(eval_sites_deprecated)
 export(exp_implicit_missing_visits)
 export(get_config)
 export(get_ecd_values)
diff --git a/NEWS.md b/NEWS.md
index a0d3a95..f15706c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,6 +3,7 @@
 - https://github.com/openpharma/simaerep/issues/59
 - https://github.com/openpharma/simaerep/issues/16
 - update vignettes
+- https://github.com/openpharma/simaerep/issues/62
 
 # simaerep 0.5.0
 - allow flexible AE rates in data simulations
diff --git a/R/S3_simaerep.R b/R/S3_simaerep.R
index 57d6d1c..6893c2e 100644
--- a/R/S3_simaerep.R
+++ b/R/S3_simaerep.R
@@ -76,6 +76,7 @@ validate_simaerep <- function(x) {
 #'  Default: TRUE.
 #'@param inframe Logical, only table operations to be used; does not require
 #'  visit_med75. Compatible with dbplyr supported database backends.
+#'@param mult_corr Logical, multiplicity correction, Default: TRUE
 #'@param param_site_aggr List of parameters passed to [site_aggr()]. Default:
 #'  list(method = "med75_adj", min_pat_pool = 0.2).
 #'@param param_sim_sites List of parameters passed to [sim_sites()]. Default:
@@ -133,6 +134,7 @@ simaerep <- function(df_visit,
                      visit_med75 = TRUE,
                      inframe = FALSE,
                      progress = TRUE,
+                     mult_corr = TRUE,
                      param_site_aggr = list(
                        method = "med75_adj",
                        min_pat_pool = 0.2
@@ -168,6 +170,10 @@ simaerep <- function(df_visit,
   param_sim_sites$under_only <- under_only
   param_eval_sites$under_only <- under_only
 
+  if (! mult_corr) {
+    param_eval_sites$method <- NA
+  }
+
   if (visit_med75 && ! inframe) {
 
     if (r != param_sim_sites$r) {
diff --git a/R/inframe.R b/R/inframe.R
index 0778505..a5a75d9 100644
--- a/R/inframe.R
+++ b/R/inframe.R
@@ -137,7 +137,23 @@ sim_inframe <- function(df_visit, r = 1000, df_site = NULL) {
     mutate(
       # we generate a random number between 1 and the maximum number
       # of eligible patients
-      rnd = runif(n()),
+      rnd = runif(n())
+    )
+
+  if (inherits(df_sim_prep, "tbl_Snowflake")) {
+    # snowflake RANDOM() works differently than other backends and returns large
+    # positive and negative integers. We normalize by using the min and max values
+    # of those 64bit integers. Source for integer values is ChatGPT.
+    # for a range from -5 to 5: 4 -> 0.0  -4 -> 0.1, with underlying formula
+
+    df_sim_prep <- df_sim_prep %>%
+      mutate(
+        rnd = (.data$rnd - (-9223372036854775808)) / (9223372036854775807 - (-9223372036854775808))
+      )
+  }
+
+  df_sim_prep <- df_sim_prep %>%
+    mutate(
       # transform to values between 1 and max number of patients
       rnd_rwn = floor(.data$rnd * .data$max_rwn) + 1
     )
diff --git a/R/simaerep.R b/R/simaerep.R
index e0d12af..e77f76a 100644
--- a/R/simaerep.R
+++ b/R/simaerep.R
@@ -380,9 +380,6 @@ get_visit_med75 <- function(df_pat,
 #' df_eval <- eval_sites(df_sim_sites)
 #' df_eval
 #'
-#' # use deprecated method  -------
-#' df_eval <- eval_sites(df_sim_sites, method = NULL, r_sim_sites = 100)
-#' df_eval
 #' @rdname eval_sites
 #' @seealso \code{\link[simaerep]{site_aggr}},
 #' \code{\link[simaerep]{sim_sites}},
@@ -393,13 +390,6 @@ eval_sites <- function(df_sim_sites,
                         under_only = TRUE,
                         ...) {
 
-
-  if (is.null(method)) {
-    warning("using deprecated method for probability adjustment")
-    df_out <- eval_sites_deprecated(df_sim_sites, ...)
-    return(df_out)
-  }
-
   df_out <- df_sim_sites
 
   if ("pval" %in% colnames(df_out)) {
@@ -458,6 +448,10 @@ eval_sites <- function(df_sim_sites,
 #'@keywords internal
 p_adjust <- function(df, col, suffix, method = "BH") {
 
+  if (is.na(method) || is.null(method) || method %in% c("None", "none")) {
+    return(df)
+  }
+
   if (inherits(df, "data.frame")) {
 
   col_adj <- paste0(col, "_adj")
@@ -515,111 +509,6 @@ get_any_na <- function(df, col) {
   })
 }
 
-#' @title Evaluate sites.
-#' @description Correct under-reporting probabilities by the expected number of
-#'   false positives (fp). This has been deprecated in favor of more conventional
-#'   methods available via \code{\link[stats]{p.adjust}}.
-#' @param df_sim_sites dataframe generated by [sim_sites()][sim_sites()]
-#' @param r_sim_sites integer, number of repeats for bootstrap resampling for site
-#'   simulation, needed for zero probability correction for fp calculation, Default: 1000
-#' @return dataframe with the following columns:
-#' \describe{
-#'   \item{**study_id**}{study identification}
-#'   \item{**site_number**}{site identification}
-#'   \item{**visit_med75**}{median(max(visit)) * 0.75}
-#'   \item{**mean_ae_site_med75**}{mean AE at visit_med75 site level}
-#'   \item{**mean_ae_study_med75**}{mean AE at visit_med75 study level}
-#'   \item{**pval**}{p-value as returned by `poisson.test`}
-#'   \item{**prob_low**}{bootstrapped probability for having mean_ae_site_med75 or lower}
-#'   \item{**n_site**}{number of study sites}
-#'   \item{**pval_n_detected**}{sites with the same p-value or lower}
-#'   \item{**pval_fp**}{expected number of fp, pval * n_site}
-#'   \item{**pval_p_vs_fp_ratio**}{odds under-reporting as p/fp, poisson.test (use as benchmark)}
-#'   \item{**pval_prob_ur**}{probability under-reporting as 1 - fp/p, poisson.test (use as benchmark)}
-#'   \item{**prob_low_n_detected**}{sites with same bootstrapped probability or lower}
-#'   \item{**prob_low_fp**}{expected number of fp, prob_lower * n_site}
-#'   \item{**prob_low_p_vs_fp_ratio**}{odds under-reporting as p/fp, bootstrapped (use)}
-#'   \item{**prob_low_prob_ur**}{probability under-reporting as 1 - fp/p, bootstrapped (use)}
-#' }
-#' @details If by chance expected number of
-#'   false positives (fp) is greater than the total number of positives (p) we
-#'   set p_vs_fp_ratio = 1 and prob_ur = 0.
-#' @examples
-#' df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5,
-#'     frac_site_with_ur = 0.4, ur_rate = 0.6)
-#'
-#' df_visit$study_id <- "A"
-#' df_site <- site_aggr(df_visit)
-#'
-#' df_sim_sites <- sim_sites(df_site, df_visit, r = 100)
-#'
-#' df_eval <- eval_sites_deprecated(df_sim_sites, r_sim_sites = 100)
-#' df_eval
-#' @rdname eval_sites_deprecated
-#' @seealso [site_aggr()][site_aggr()], [sim_sites()][sim_sites()]
-#' @export
-eval_sites_deprecated <- function(df_sim_sites,
-                                  r_sim_sites) {
-
-
-  df_out <- df_sim_sites
-
-  if ("pval" %in% names(df_out)) {
-
-    if (anyNA(df_out$pval)) {
-      warning("pval column contains NA")
-    }
-
-    df_out <- df_out %>%
-      group_by(.data$study_id) %>%
-      arrange(.data$study_id, .data$pval) %>%
-      mutate(
-        n_site = n(),
-        min_pval = min(.data$pval[.data$pval > 0]),
-        # we need to use ties.method max because there can be more than one site with a
-        # specific p value
-        pval_n_detected = rank(.data$pval, ties.method = "max", na.last = "keep"),
-        pval_fp = .data$pval * .data$n_site,
-        pval_fp = ifelse(.data$pval_fp == 0, .data$min_pval / 10, .data$pval_fp),
-        pval_p_vs_fp_ratio = .data$pval_n_detected / .data$pval_fp,
-        pval_p_vs_fp_ratio = ifelse(is.na(.data$pval_p_vs_fp_ratio) & ! is.na(.data$pval),
-                                    0, .data$pval_p_vs_fp_ratio),
-        # it is possible to get ratios lower than one if p < expected fp
-        # this can happen by chance and is meaningless
-        pval_p_vs_fp_ratio = ifelse(.data$pval_p_vs_fp_ratio < 1, 1, .data$pval_p_vs_fp_ratio),
-        pval_prob_ur = 1 - 1 / .data$pval_p_vs_fp_ratio
-      ) %>%
-      select(- "min_pval")
-  }
-
-  if ("prob_low" %in% names(df_out)) {
-
-    if (anyNA(df_out$pval)) {
-      warning("prob_lower column contains NA")
-    }
-
-    df_out <- df_out %>%
-      group_by(.data$study_id) %>%
-      arrange(.data$study_id, .data$prob_low) %>%
-      mutate(
-        n_site = n(),
-        # we need to use ties.method max because there can be more than one site with a
-        # specific prob_low value
-        prob_low_n_detected = rank(.data$prob_low, ties.method = "max", na.last = "keep"),
-        prob_low_fp = .data$prob_low * .data$n_site,
-        prob_low_fp = ifelse(.data$prob_low_fp == 0, 1 / r_sim_sites / 10, .data$prob_low_fp),
-        prob_low_p_vs_fp_ratio = .data$prob_low_n_detected / .data$prob_low_fp,
-        prob_low_p_vs_fp_ratio = ifelse(is.na(.data$prob_low_p_vs_fp_ratio) & ! is.na(.data$prob_low),
-                                        0, .data$prob_low_p_vs_fp_ratio),
-        # it is possible to get ratios lower than one if p < expected fp
-        # this can happen by chance and is meaningless
-        prob_low_p_vs_fp_ratio = ifelse(.data$prob_low_p_vs_fp_ratio < 1, 1, .data$prob_low_p_vs_fp_ratio),
-        prob_low_prob_ur = 1 - 1 / .data$prob_low_p_vs_fp_ratio
-      )
-  }
-
-  return(ungroup(df_out))
-}
 
 
 
diff --git a/man/eval_sites.Rd b/man/eval_sites.Rd
index 691e23c..9df472f 100644
--- a/man/eval_sites.Rd
+++ b/man/eval_sites.Rd
@@ -50,9 +50,6 @@ df_sim_sites <- sim_sites(df_site, df_visit, r = 100)
 df_eval <- eval_sites(df_sim_sites)
 df_eval
 
-# use deprecated method  -------
-df_eval <- eval_sites(df_sim_sites, method = NULL, r_sim_sites = 100)
-df_eval
 }
 \seealso{
 \code{\link[simaerep]{site_aggr}},
diff --git a/man/eval_sites_deprecated.Rd b/man/eval_sites_deprecated.Rd
deleted file mode 100644
index 1617f89..0000000
--- a/man/eval_sites_deprecated.Rd
+++ /dev/null
@@ -1,60 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/simaerep.R
-\name{eval_sites_deprecated}
-\alias{eval_sites_deprecated}
-\title{Evaluate sites.}
-\usage{
-eval_sites_deprecated(df_sim_sites, r_sim_sites)
-}
-\arguments{
-\item{df_sim_sites}{dataframe generated by \code{\link[=sim_sites]{sim_sites()}}}
-
-\item{r_sim_sites}{integer, number of repeats for bootstrap resampling for site
-simulation, needed for zero probability correction for fp calculation, Default: 1000}
-}
-\value{
-dataframe with the following columns:
-\describe{
-\item{\strong{study_id}}{study identification}
-\item{\strong{site_number}}{site identification}
-\item{\strong{visit_med75}}{median(max(visit)) * 0.75}
-\item{\strong{mean_ae_site_med75}}{mean AE at visit_med75 site level}
-\item{\strong{mean_ae_study_med75}}{mean AE at visit_med75 study level}
-\item{\strong{pval}}{p-value as returned by \code{poisson.test}}
-\item{\strong{prob_low}}{bootstrapped probability for having mean_ae_site_med75 or lower}
-\item{\strong{n_site}}{number of study sites}
-\item{\strong{pval_n_detected}}{sites with the same p-value or lower}
-\item{\strong{pval_fp}}{expected number of fp, pval * n_site}
-\item{\strong{pval_p_vs_fp_ratio}}{odds under-reporting as p/fp, poisson.test (use as benchmark)}
-\item{\strong{pval_prob_ur}}{probability under-reporting as 1 - fp/p, poisson.test (use as benchmark)}
-\item{\strong{prob_low_n_detected}}{sites with same bootstrapped probability or lower}
-\item{\strong{prob_low_fp}}{expected number of fp, prob_lower * n_site}
-\item{\strong{prob_low_p_vs_fp_ratio}}{odds under-reporting as p/fp, bootstrapped (use)}
-\item{\strong{prob_low_prob_ur}}{probability under-reporting as 1 - fp/p, bootstrapped (use)}
-}
-}
-\description{
-Correct under-reporting probabilities by the expected number of
-false positives (fp). This has been deprecated in favor of more conventional
-methods available via \code{\link[stats]{p.adjust}}.
-}
-\details{
-If by chance expected number of
-false positives (fp) is greater than the total number of positives (p) we
-set p_vs_fp_ratio = 1 and prob_ur = 0.
-}
-\examples{
-df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5,
-    frac_site_with_ur = 0.4, ur_rate = 0.6)
-
-df_visit$study_id <- "A"
-df_site <- site_aggr(df_visit)
-
-df_sim_sites <- sim_sites(df_site, df_visit, r = 100)
-
-df_eval <- eval_sites_deprecated(df_sim_sites, r_sim_sites = 100)
-df_eval
-}
-\seealso{
-\code{\link[=site_aggr]{site_aggr()}}, \code{\link[=sim_sites]{sim_sites()}}
-}
diff --git a/man/simaerep.Rd b/man/simaerep.Rd
index 2a69c60..2a01081 100644
--- a/man/simaerep.Rd
+++ b/man/simaerep.Rd
@@ -12,6 +12,7 @@ simaerep(
   visit_med75 = TRUE,
   inframe = FALSE,
   progress = TRUE,
+  mult_corr = TRUE,
   param_site_aggr = list(method = "med75_adj", min_pat_pool = 0.2),
   param_sim_sites = list(r = 1000, poisson_test = FALSE, prob_lower = TRUE),
   param_eval_sites = list(method = "BH"),
@@ -42,6 +43,8 @@ visit_med75. Compatible with dbplyr supported database backends.}
 
 \item{progress}{Logical, display progress bar. Default: TRUE.}
 
+\item{mult_corr}{Logical, multiplicity correction, Default: TRUE}
+
 \item{param_site_aggr}{List of parameters passed to \code{\link[=site_aggr]{site_aggr()}}. Default:
 list(method = "med75_adj", min_pat_pool = 0.2).}
 
diff --git a/tests/testthat/test_S3_simaerep.R b/tests/testthat/test_S3_simaerep.R
index 7a59881..aab1614 100644
--- a/tests/testthat/test_S3_simaerep.R
+++ b/tests/testthat/test_S3_simaerep.R
@@ -90,3 +90,21 @@ test_that("plot.simaerep throws error when original visit data cannot be retriev
     "ggplot"
   )
 })
+
+test_that("simaerep() with mult_corr = FALSE must not return adjusted probabilities", {
+
+  aerep <- simaerep(df_visit_test, mult_corr = FALSE)
+  expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval))
+
+  aerep <- simaerep(df_visit_test, mult_corr = FALSE, under_only = FALSE)
+  expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval))
+  expect_true(! "prob_high_prob_or" %in% colnames(aerep$df_eval))
+
+  aerep <- simaerep(df_visit_test, mult_corr = FALSE, inframe = TRUE)
+  expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval))
+
+  aerep <- simaerep(df_visit_test, mult_corr = FALSE, under_only = FALSE, inframe = TRUE)
+  expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval))
+  expect_true(! "prob_high_prob_or" %in% colnames(aerep$df_eval))
+
+})
diff --git a/tests/testthat/test_eval_sites.R b/tests/testthat/test_eval_sites.R
index 2d3e8d8..4f9958b 100644
--- a/tests/testthat/test_eval_sites.R
+++ b/tests/testthat/test_eval_sites.R
@@ -53,22 +53,3 @@ test_that("eval_sites() - check column names of returned data frame", {
   )
 })
 
-test_that(
-  paste(
-    "eval_sites() with method = NULL uses deprecated method for p value correction.",
-    "Must throw warning.",
-    "Stats from sim_sites() must negatively correlate with p/fp ratio.",
-    "p/fp ratio must not be smaller 1."
-  ), {
-
-  expect_warning({
-    df_eval <- eval_sites(df_sim_sites_test, method = NULL, r_sim_sites = 100)
-  })
-
-  expect_true(cor(df_eval$prob_low, df_eval$prob_low_p_vs_fp_ratio) < 0)
-  expect_true(cor(df_eval$pval, df_eval$pval_p_vs_fp_ratio) < 0)
-
-  expect_true(! any(df_eval$prob_low_p_vs_fp_ratio < 1))
-  expect_true(! any(df_eval$pval_p_vs_fp_ratio < 1))
-
-})
diff --git a/vignettes/performance.Rmd b/vignettes/performance.Rmd
index 35abc85..735b52b 100644
--- a/vignettes/performance.Rmd
+++ b/vignettes/performance.Rmd
@@ -369,7 +369,7 @@ in past experiments.
 - multiplicity correction imposes a penalty on the true positive rate
 
 > This observation was already made by the Boeringer Ingelheim Team during the evaluation of {simaerep}. We can 
-now reporducibly confirm this. The unaltered probability score as returned by the bootstrap algorithm already provides
+now reproducibly confirm this. The unaltered probability score as returned by the bootstrap algorithm already provides
 very realistic under-reporting probabilities.
 
 - {simaerep} outperforms simpler methods such as funnel plot and box plot outlier detection.

From ec5c545386977372e502f7faf8f2f6bcb35a6d87 Mon Sep 17 00:00:00 2001
From: Bjoern Koneswarakantha <erblast2001@gmx.de>
Date: Wed, 25 Sep 2024 16:15:10 +0200
Subject: [PATCH 2/2] fix lint

---
 tests/testthat/test_eval_sites.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/testthat/test_eval_sites.R b/tests/testthat/test_eval_sites.R
index 4f9958b..085d114 100644
--- a/tests/testthat/test_eval_sites.R
+++ b/tests/testthat/test_eval_sites.R
@@ -52,4 +52,3 @@ test_that("eval_sites() - check column names of returned data frame", {
       all()
   )
 })
-