From ee1966a0f48a735953a7c579098e30d1e20ff82a Mon Sep 17 00:00:00 2001 From: Bjoern Koneswarakantha Date: Wed, 25 Sep 2024 16:06:26 +0200 Subject: [PATCH 1/2] https://github.com/openpharma/simaerep/issues/62 and snowflake compatibility --- NAMESPACE | 1 - NEWS.md | 1 + R/S3_simaerep.R | 6 ++ R/inframe.R | 18 ++++- R/simaerep.R | 119 +----------------------------- man/eval_sites.Rd | 3 - man/eval_sites_deprecated.Rd | 60 --------------- man/simaerep.Rd | 3 + tests/testthat/test_S3_simaerep.R | 18 +++++ tests/testthat/test_eval_sites.R | 19 ----- vignettes/performance.Rmd | 2 +- 11 files changed, 50 insertions(+), 200 deletions(-) delete mode 100644 man/eval_sites_deprecated.Rd diff --git a/NAMESPACE b/NAMESPACE index c11f5e9..8912a99 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,6 @@ export("%>%") export(aggr_duplicated_visits) export(check_df_visit) export(eval_sites) -export(eval_sites_deprecated) export(exp_implicit_missing_visits) export(get_config) export(get_ecd_values) diff --git a/NEWS.md b/NEWS.md index a0d3a95..f15706c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,7 @@ - https://github.com/openpharma/simaerep/issues/59 - https://github.com/openpharma/simaerep/issues/16 - update vignettes +- https://github.com/openpharma/simaerep/issues/62 # simaerep 0.5.0 - allow flexible AE rates in data simulations diff --git a/R/S3_simaerep.R b/R/S3_simaerep.R index 57d6d1c..6893c2e 100644 --- a/R/S3_simaerep.R +++ b/R/S3_simaerep.R @@ -76,6 +76,7 @@ validate_simaerep <- function(x) { #' Default: TRUE. #'@param inframe Logical, only table operations to be used; does not require #' visit_med75. Compatible with dbplyr supported database backends. +#'@param mult_corr Logical, multiplicity correction, Default: TRUE #'@param param_site_aggr List of parameters passed to [site_aggr()]. Default: #' list(method = "med75_adj", min_pat_pool = 0.2). #'@param param_sim_sites List of parameters passed to [sim_sites()]. Default: @@ -133,6 +134,7 @@ simaerep <- function(df_visit, visit_med75 = TRUE, inframe = FALSE, progress = TRUE, + mult_corr = TRUE, param_site_aggr = list( method = "med75_adj", min_pat_pool = 0.2 @@ -168,6 +170,10 @@ simaerep <- function(df_visit, param_sim_sites$under_only <- under_only param_eval_sites$under_only <- under_only + if (! mult_corr) { + param_eval_sites$method <- NA + } + if (visit_med75 && ! inframe) { if (r != param_sim_sites$r) { diff --git a/R/inframe.R b/R/inframe.R index 0778505..a5a75d9 100644 --- a/R/inframe.R +++ b/R/inframe.R @@ -137,7 +137,23 @@ sim_inframe <- function(df_visit, r = 1000, df_site = NULL) { mutate( # we generate a random number between 1 and the maximum number # of eligible patients - rnd = runif(n()), + rnd = runif(n()) + ) + + if (inherits(df_sim_prep, "tbl_Snowflake")) { + # snowflake RANDOM() works differently than other backends and returns large + # positive and negative integers. We normalize by using the min and max values + # of those 64bit integers. Source for integer values is ChatGPT. + # for a range from -5 to 5: 4 -> 0.0 -4 -> 0.1, with underlying formula + + df_sim_prep <- df_sim_prep %>% + mutate( + rnd = (.data$rnd - (-9223372036854775808)) / (9223372036854775807 - (-9223372036854775808)) + ) + } + + df_sim_prep <- df_sim_prep %>% + mutate( # transform to values between 1 and max number of patients rnd_rwn = floor(.data$rnd * .data$max_rwn) + 1 ) diff --git a/R/simaerep.R b/R/simaerep.R index e0d12af..e77f76a 100644 --- a/R/simaerep.R +++ b/R/simaerep.R @@ -380,9 +380,6 @@ get_visit_med75 <- function(df_pat, #' df_eval <- eval_sites(df_sim_sites) #' df_eval #' -#' # use deprecated method ------- -#' df_eval <- eval_sites(df_sim_sites, method = NULL, r_sim_sites = 100) -#' df_eval #' @rdname eval_sites #' @seealso \code{\link[simaerep]{site_aggr}}, #' \code{\link[simaerep]{sim_sites}}, @@ -393,13 +390,6 @@ eval_sites <- function(df_sim_sites, under_only = TRUE, ...) { - - if (is.null(method)) { - warning("using deprecated method for probability adjustment") - df_out <- eval_sites_deprecated(df_sim_sites, ...) - return(df_out) - } - df_out <- df_sim_sites if ("pval" %in% colnames(df_out)) { @@ -458,6 +448,10 @@ eval_sites <- function(df_sim_sites, #'@keywords internal p_adjust <- function(df, col, suffix, method = "BH") { + if (is.na(method) || is.null(method) || method %in% c("None", "none")) { + return(df) + } + if (inherits(df, "data.frame")) { col_adj <- paste0(col, "_adj") @@ -515,111 +509,6 @@ get_any_na <- function(df, col) { }) } -#' @title Evaluate sites. -#' @description Correct under-reporting probabilities by the expected number of -#' false positives (fp). This has been deprecated in favor of more conventional -#' methods available via \code{\link[stats]{p.adjust}}. -#' @param df_sim_sites dataframe generated by [sim_sites()][sim_sites()] -#' @param r_sim_sites integer, number of repeats for bootstrap resampling for site -#' simulation, needed for zero probability correction for fp calculation, Default: 1000 -#' @return dataframe with the following columns: -#' \describe{ -#' \item{**study_id**}{study identification} -#' \item{**site_number**}{site identification} -#' \item{**visit_med75**}{median(max(visit)) * 0.75} -#' \item{**mean_ae_site_med75**}{mean AE at visit_med75 site level} -#' \item{**mean_ae_study_med75**}{mean AE at visit_med75 study level} -#' \item{**pval**}{p-value as returned by `poisson.test`} -#' \item{**prob_low**}{bootstrapped probability for having mean_ae_site_med75 or lower} -#' \item{**n_site**}{number of study sites} -#' \item{**pval_n_detected**}{sites with the same p-value or lower} -#' \item{**pval_fp**}{expected number of fp, pval * n_site} -#' \item{**pval_p_vs_fp_ratio**}{odds under-reporting as p/fp, poisson.test (use as benchmark)} -#' \item{**pval_prob_ur**}{probability under-reporting as 1 - fp/p, poisson.test (use as benchmark)} -#' \item{**prob_low_n_detected**}{sites with same bootstrapped probability or lower} -#' \item{**prob_low_fp**}{expected number of fp, prob_lower * n_site} -#' \item{**prob_low_p_vs_fp_ratio**}{odds under-reporting as p/fp, bootstrapped (use)} -#' \item{**prob_low_prob_ur**}{probability under-reporting as 1 - fp/p, bootstrapped (use)} -#' } -#' @details If by chance expected number of -#' false positives (fp) is greater than the total number of positives (p) we -#' set p_vs_fp_ratio = 1 and prob_ur = 0. -#' @examples -#' df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5, -#' frac_site_with_ur = 0.4, ur_rate = 0.6) -#' -#' df_visit$study_id <- "A" -#' df_site <- site_aggr(df_visit) -#' -#' df_sim_sites <- sim_sites(df_site, df_visit, r = 100) -#' -#' df_eval <- eval_sites_deprecated(df_sim_sites, r_sim_sites = 100) -#' df_eval -#' @rdname eval_sites_deprecated -#' @seealso [site_aggr()][site_aggr()], [sim_sites()][sim_sites()] -#' @export -eval_sites_deprecated <- function(df_sim_sites, - r_sim_sites) { - - - df_out <- df_sim_sites - - if ("pval" %in% names(df_out)) { - - if (anyNA(df_out$pval)) { - warning("pval column contains NA") - } - - df_out <- df_out %>% - group_by(.data$study_id) %>% - arrange(.data$study_id, .data$pval) %>% - mutate( - n_site = n(), - min_pval = min(.data$pval[.data$pval > 0]), - # we need to use ties.method max because there can be more than one site with a - # specific p value - pval_n_detected = rank(.data$pval, ties.method = "max", na.last = "keep"), - pval_fp = .data$pval * .data$n_site, - pval_fp = ifelse(.data$pval_fp == 0, .data$min_pval / 10, .data$pval_fp), - pval_p_vs_fp_ratio = .data$pval_n_detected / .data$pval_fp, - pval_p_vs_fp_ratio = ifelse(is.na(.data$pval_p_vs_fp_ratio) & ! is.na(.data$pval), - 0, .data$pval_p_vs_fp_ratio), - # it is possible to get ratios lower than one if p < expected fp - # this can happen by chance and is meaningless - pval_p_vs_fp_ratio = ifelse(.data$pval_p_vs_fp_ratio < 1, 1, .data$pval_p_vs_fp_ratio), - pval_prob_ur = 1 - 1 / .data$pval_p_vs_fp_ratio - ) %>% - select(- "min_pval") - } - - if ("prob_low" %in% names(df_out)) { - - if (anyNA(df_out$pval)) { - warning("prob_lower column contains NA") - } - - df_out <- df_out %>% - group_by(.data$study_id) %>% - arrange(.data$study_id, .data$prob_low) %>% - mutate( - n_site = n(), - # we need to use ties.method max because there can be more than one site with a - # specific prob_low value - prob_low_n_detected = rank(.data$prob_low, ties.method = "max", na.last = "keep"), - prob_low_fp = .data$prob_low * .data$n_site, - prob_low_fp = ifelse(.data$prob_low_fp == 0, 1 / r_sim_sites / 10, .data$prob_low_fp), - prob_low_p_vs_fp_ratio = .data$prob_low_n_detected / .data$prob_low_fp, - prob_low_p_vs_fp_ratio = ifelse(is.na(.data$prob_low_p_vs_fp_ratio) & ! is.na(.data$prob_low), - 0, .data$prob_low_p_vs_fp_ratio), - # it is possible to get ratios lower than one if p < expected fp - # this can happen by chance and is meaningless - prob_low_p_vs_fp_ratio = ifelse(.data$prob_low_p_vs_fp_ratio < 1, 1, .data$prob_low_p_vs_fp_ratio), - prob_low_prob_ur = 1 - 1 / .data$prob_low_p_vs_fp_ratio - ) - } - - return(ungroup(df_out)) -} diff --git a/man/eval_sites.Rd b/man/eval_sites.Rd index 691e23c..9df472f 100644 --- a/man/eval_sites.Rd +++ b/man/eval_sites.Rd @@ -50,9 +50,6 @@ df_sim_sites <- sim_sites(df_site, df_visit, r = 100) df_eval <- eval_sites(df_sim_sites) df_eval -# use deprecated method ------- -df_eval <- eval_sites(df_sim_sites, method = NULL, r_sim_sites = 100) -df_eval } \seealso{ \code{\link[simaerep]{site_aggr}}, diff --git a/man/eval_sites_deprecated.Rd b/man/eval_sites_deprecated.Rd deleted file mode 100644 index 1617f89..0000000 --- a/man/eval_sites_deprecated.Rd +++ /dev/null @@ -1,60 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/simaerep.R -\name{eval_sites_deprecated} -\alias{eval_sites_deprecated} -\title{Evaluate sites.} -\usage{ -eval_sites_deprecated(df_sim_sites, r_sim_sites) -} -\arguments{ -\item{df_sim_sites}{dataframe generated by \code{\link[=sim_sites]{sim_sites()}}} - -\item{r_sim_sites}{integer, number of repeats for bootstrap resampling for site -simulation, needed for zero probability correction for fp calculation, Default: 1000} -} -\value{ -dataframe with the following columns: -\describe{ -\item{\strong{study_id}}{study identification} -\item{\strong{site_number}}{site identification} -\item{\strong{visit_med75}}{median(max(visit)) * 0.75} -\item{\strong{mean_ae_site_med75}}{mean AE at visit_med75 site level} -\item{\strong{mean_ae_study_med75}}{mean AE at visit_med75 study level} -\item{\strong{pval}}{p-value as returned by \code{poisson.test}} -\item{\strong{prob_low}}{bootstrapped probability for having mean_ae_site_med75 or lower} -\item{\strong{n_site}}{number of study sites} -\item{\strong{pval_n_detected}}{sites with the same p-value or lower} -\item{\strong{pval_fp}}{expected number of fp, pval * n_site} -\item{\strong{pval_p_vs_fp_ratio}}{odds under-reporting as p/fp, poisson.test (use as benchmark)} -\item{\strong{pval_prob_ur}}{probability under-reporting as 1 - fp/p, poisson.test (use as benchmark)} -\item{\strong{prob_low_n_detected}}{sites with same bootstrapped probability or lower} -\item{\strong{prob_low_fp}}{expected number of fp, prob_lower * n_site} -\item{\strong{prob_low_p_vs_fp_ratio}}{odds under-reporting as p/fp, bootstrapped (use)} -\item{\strong{prob_low_prob_ur}}{probability under-reporting as 1 - fp/p, bootstrapped (use)} -} -} -\description{ -Correct under-reporting probabilities by the expected number of -false positives (fp). This has been deprecated in favor of more conventional -methods available via \code{\link[stats]{p.adjust}}. -} -\details{ -If by chance expected number of -false positives (fp) is greater than the total number of positives (p) we -set p_vs_fp_ratio = 1 and prob_ur = 0. -} -\examples{ -df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5, - frac_site_with_ur = 0.4, ur_rate = 0.6) - -df_visit$study_id <- "A" -df_site <- site_aggr(df_visit) - -df_sim_sites <- sim_sites(df_site, df_visit, r = 100) - -df_eval <- eval_sites_deprecated(df_sim_sites, r_sim_sites = 100) -df_eval -} -\seealso{ -\code{\link[=site_aggr]{site_aggr()}}, \code{\link[=sim_sites]{sim_sites()}} -} diff --git a/man/simaerep.Rd b/man/simaerep.Rd index 2a69c60..2a01081 100644 --- a/man/simaerep.Rd +++ b/man/simaerep.Rd @@ -12,6 +12,7 @@ simaerep( visit_med75 = TRUE, inframe = FALSE, progress = TRUE, + mult_corr = TRUE, param_site_aggr = list(method = "med75_adj", min_pat_pool = 0.2), param_sim_sites = list(r = 1000, poisson_test = FALSE, prob_lower = TRUE), param_eval_sites = list(method = "BH"), @@ -42,6 +43,8 @@ visit_med75. Compatible with dbplyr supported database backends.} \item{progress}{Logical, display progress bar. Default: TRUE.} +\item{mult_corr}{Logical, multiplicity correction, Default: TRUE} + \item{param_site_aggr}{List of parameters passed to \code{\link[=site_aggr]{site_aggr()}}. Default: list(method = "med75_adj", min_pat_pool = 0.2).} diff --git a/tests/testthat/test_S3_simaerep.R b/tests/testthat/test_S3_simaerep.R index 7a59881..aab1614 100644 --- a/tests/testthat/test_S3_simaerep.R +++ b/tests/testthat/test_S3_simaerep.R @@ -90,3 +90,21 @@ test_that("plot.simaerep throws error when original visit data cannot be retriev "ggplot" ) }) + +test_that("simaerep() with mult_corr = FALSE must not return adjusted probabilities", { + + aerep <- simaerep(df_visit_test, mult_corr = FALSE) + expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval)) + + aerep <- simaerep(df_visit_test, mult_corr = FALSE, under_only = FALSE) + expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval)) + expect_true(! "prob_high_prob_or" %in% colnames(aerep$df_eval)) + + aerep <- simaerep(df_visit_test, mult_corr = FALSE, inframe = TRUE) + expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval)) + + aerep <- simaerep(df_visit_test, mult_corr = FALSE, under_only = FALSE, inframe = TRUE) + expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval)) + expect_true(! "prob_high_prob_or" %in% colnames(aerep$df_eval)) + +}) diff --git a/tests/testthat/test_eval_sites.R b/tests/testthat/test_eval_sites.R index 2d3e8d8..4f9958b 100644 --- a/tests/testthat/test_eval_sites.R +++ b/tests/testthat/test_eval_sites.R @@ -53,22 +53,3 @@ test_that("eval_sites() - check column names of returned data frame", { ) }) -test_that( - paste( - "eval_sites() with method = NULL uses deprecated method for p value correction.", - "Must throw warning.", - "Stats from sim_sites() must negatively correlate with p/fp ratio.", - "p/fp ratio must not be smaller 1." - ), { - - expect_warning({ - df_eval <- eval_sites(df_sim_sites_test, method = NULL, r_sim_sites = 100) - }) - - expect_true(cor(df_eval$prob_low, df_eval$prob_low_p_vs_fp_ratio) < 0) - expect_true(cor(df_eval$pval, df_eval$pval_p_vs_fp_ratio) < 0) - - expect_true(! any(df_eval$prob_low_p_vs_fp_ratio < 1)) - expect_true(! any(df_eval$pval_p_vs_fp_ratio < 1)) - -}) diff --git a/vignettes/performance.Rmd b/vignettes/performance.Rmd index 35abc85..735b52b 100644 --- a/vignettes/performance.Rmd +++ b/vignettes/performance.Rmd @@ -369,7 +369,7 @@ in past experiments. - multiplicity correction imposes a penalty on the true positive rate > This observation was already made by the Boeringer Ingelheim Team during the evaluation of {simaerep}. We can -now reporducibly confirm this. The unaltered probability score as returned by the bootstrap algorithm already provides +now reproducibly confirm this. The unaltered probability score as returned by the bootstrap algorithm already provides very realistic under-reporting probabilities. - {simaerep} outperforms simpler methods such as funnel plot and box plot outlier detection. From ec5c545386977372e502f7faf8f2f6bcb35a6d87 Mon Sep 17 00:00:00 2001 From: Bjoern Koneswarakantha Date: Wed, 25 Sep 2024 16:15:10 +0200 Subject: [PATCH 2/2] fix lint --- tests/testthat/test_eval_sites.R | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/testthat/test_eval_sites.R b/tests/testthat/test_eval_sites.R index 4f9958b..085d114 100644 --- a/tests/testthat/test_eval_sites.R +++ b/tests/testthat/test_eval_sites.R @@ -52,4 +52,3 @@ test_that("eval_sites() - check column names of returned data frame", { all() ) }) -